def sort(self, e=None):
    self.words = init_list()
    self.btn_sort.Disable()
    self.words.sort(key=lambda x: x.score)
    save_list(self.words)
    time.sleep(1)
    self.btn_sort.Enable()
def main():
    # ---------
    # load data
    args = get_args()

    labels = load(args.labels_path)
    terms = load(args.terms_path)
    pub_med_ids, _ = read_file(args.documents_path)
    index2word = load(args.index2word_path)

    # ------
    # Encode
    params = vars(args)
    params['dropout'] = args.dropout
    params['data_size'] = len(labels)
    params['embedding_dim'] = args.embedding_dim
    params['num_epochs'] = args.num_epochs
    params['batch_size'] = args.batch_size
    params['term_size'] = args.mlp_layer_dims[-1]
    params['word_vecs_path'] = args.embedded_sentences.split('/')[1].split('.')[0]

    # get estimator
    estimator = EncodeEstimator(params)

    # todo
    out_dir = ""

    with h5py.File(args.embedded_sentences, 'r') as f:
        def sen_gen():
            for i in docs_gen(f):
                yield i[0]

        def len_gen():
            for i in docs_gen(f):
                yield i[1]

        if args.test_mode == 2:
            estimator.train(sen_gen, len_gen, labels, 1)
        else:
            estimator.train(sen_gen, len_gen, labels)

        doc_vecs, pred_labels = estimator.predict(sen_gen, len_gen)

    # ---------
    # save data

    # encoder data
    os.makedirs(out_dir)

    # write params to a txt file, except embeddings
    param_dir = out_dir + '/params.txt'
    with open(param_dir, 'w') as f:
        f.write(json.dumps(params))

    pred_lab_words = []
    for p_id, lab in zip(pub_med_ids, pred_labels):
        pred_lab = ', '.join([index2word[terms[l]] for l in lab])
        line = str(p_id) + '\t' + pred_lab
        pred_lab_words.append(line)

    save_list(out_dir + '/pred_labels.txt', pred_lab_words)
def main(input_dir):
    out_file = input_dir.replace('.txt', '_phrase_embedded.txt')

    def process_one_docs(line):
        temp = line.split("\t")
        pmc_id = temp[0]

        # clean data
        text = temp[1].strip('\n. ').lower()  # remove '\n', white spaces and the last '.'

        # remove stop words
        text = ' '.join(
            [word for word in text.split() if word not in STOP_WORDS])

        blob = TextBlob(text)
        phrases = [np for np in blob.noun_phrases if 1 <= np.count(' ') <= 2]
        new_line = embed_phrases(text, phrases) if len(phrases) > 0 else text

        # test
        # num_underscore = sum(1 for word in new_line.split() if '_' in word)
        # num_phrases = len(phrases)
        # assert num_underscore == num_phrases

        return pmc_id + '\t' + new_line + '\n'

    with open(input_dir, encoding='utf-8') as input_file:
        new_documents = Parallel(n_jobs=-1)(
            delayed(process_one_docs)(line) for line in input_file)

    save_list(out_file, new_documents)
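# Note: the snippets collected here come from different projects, and their `save_list`
# helpers are project-specific (argument order and extra arguments vary, and none of the
# implementations are shown). A minimal sketch of the plain-text variant used above as
# save_list(path, items) -- an assumption for illustration, not the original code:
def save_list(path, items):
    # write one item per line to a UTF-8 text file
    with open(path, 'w', encoding='utf-8') as f:
        for item in items:
            f.write(str(item).rstrip('\n') + '\n')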
def save_index(self, fn):
    """
    Saves a pre-computed index (or indices) so we can save our work.
    Input:
        fn - file name of pickled index.
    """
    utils.save_list([self.inverted_idx, self.postingDict, self.documents], fn)
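# In the indexing project above, `utils.save_list` persists whole Python structures
# (here three index dicts/lists) rather than lines of text; the variant used later in
# merge_chunks additionally returns file offsets for chunked reloading.
# A bare pickle-based sketch of the save side only -- an assumption for illustration:
import pickle

def save_list(obj, fn):
    # serialize the given object (e.g. [inverted_idx, postingDict, documents]) to disk
    with open(fn, 'wb') as f:
        pickle.dump(obj, f)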
def downloader( mpaa, genre, equivalent_mpaa ):
    movie = ""
    download_trailers = []
    utils.log( "Starting Trailer Downloader", xbmc.LOGNOTICE )
    genre = genre.replace( "_", " / " )
    download_trailers = _download_trailers( equivalent_mpaa, mpaa, genre, movie )
    utils.log( "Saving List of Downloaded Trailers", xbmc.LOGNOTICE )
    base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "downloaded_trailers.txt" )
    utils.save_list( base_path, download_trailers, "Downloaded Trailers" )
def main():
    separately_loaded_model = load_array_multiprocess()
    words = list(separately_loaded_model.keys())
    vecs = list(separately_loaded_model.values())
    save_list(words, r'data/array_raw/words.txt')
    np.save(r'data/array_raw/vectors.npy', vecs)
def downloader(mpaa, genre, equivalent_mpaa):
    movie = ""
    download_trailers = []
    utils.log("Starting Trailer Downloader", xbmc.LOGNOTICE)
    genre = genre.replace("_", " / ")
    download_trailers = _download_trailers(equivalent_mpaa, mpaa, genre, movie)
    utils.log("Saving List of Downloaded Trailers", xbmc.LOGNOTICE)
    base_path = os.path.join(BASE_CURRENT_SOURCE_PATH, "downloaded_trailers.txt")
    utils.save_list(base_path, download_trailers, "Downloaded Trailers")
def clean_docs(file_name):
    start = time.time()

    with open(file_name) as f:
        docs = [line for line in f]

    cleaned = Parallel(n_jobs=-1)(delayed(clean_line)(doc) for doc in docs)

    save_list('data/' + file_name.split('/')[1].split('.')[0] + '_cleaned.txt',
              cleaned)

    print("Finished cleaning data. Time: {}".format(time.time() - start))
def write_results_to_file(out_dir, pub_med_ids, labels, pred_labels, expanded):
    expanded_labels = []
    for p_id, lab, pl, ex in zip(pub_med_ids, labels, pred_labels, expanded):
        orig = ', '.join(lab)
        pred_lab = ', '.join(pl)
        e_lab = ', '.join(ex)
        line = (str(p_id) + '\tORIGINAL: ' + orig + '\tPREDICTED: ' + pred_lab
                + '\tEXPANDED: ' + e_lab)
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
def show_frame(self, cont):
    global words_list, weights
    if cont == StartPage:
        save_list(StartPage.word_length, words_list, weights)
    elif cont == FlashCards:
        words_list, weights = open_newlist(StartPage.word_length)
    else:
        print("should never occur")
    frame = self.frames[cont]
    frame.tkraise()
def main():
    pub_med_ids, documents = read_file(args.i)
    settings = vars(args)

    # ---
    # w2v
    wv_dir = get_wv(documents, settings, args.m)
    min_count = str(settings['min_count'])

    keyed_vectors = KeyedVectors.load(wv_dir)
    vocab = keyed_vectors.vocab
    index2word = keyed_vectors.index2word

    save_list('data/index2word_mc' + min_count + '.txt', index2word)
    save('data/index2word_mc' + min_count + '.pickle', index2word)

    # -----
    # tfidf
    tfidf_model_dir = 'results/models/tfidf_model_mc' + min_count
    if os.path.isfile(tfidf_model_dir):
        print("This tfidf model has already been trained.")
        return

    labels, terms_tuples, wv2terms, doc_tfidf_reps, tfidf_model = get_tfidf(
        documents, vocab, args.n)

    tfidf_model.save(tfidf_model_dir)

    # ------------
    # save to disk

    # convert to word ids
    docs = [[vocab[token].index for token in d if token in vocab]
            for d in documents]

    terms_txt = ['{}\t{}'.format(index2word[t[0]], t[1]) for t in terms_tuples]

    # get rid of tfidf value and only keep word id
    terms = [t[0] for t in terms_tuples]

    labels_txt = ['{}\t{}'.format(pub_med_id,
                                  ', '.join([index2word[terms[l]] for l in lab]))
                  for pub_med_id, lab in zip(pub_med_ids, labels)]

    doc_tfidf_reps_txt = ['{}\t{}'.format(pub_med_id,
                                          ', '.join([index2word[l] for l in lab]))
                          for pub_med_id, lab in zip(pub_med_ids, doc_tfidf_reps)]

    save_list('data/terms_mc' + min_count + '.txt', terms_txt)
    save_list('data/labels_mc' + min_count + '.txt', labels_txt)
    save_list('data/doc_tfidf_reps_mc' + min_count + '.txt', doc_tfidf_reps_txt)

    save('data/docs_word_indices_mc' + min_count + '.pickle', docs)
    save('data/labels_mc' + min_count + '.pickle', labels)
    save('data/doc_tfidf_reps_mc' + min_count + '.pickle', doc_tfidf_reps)
    save('data/wv2terms_mc' + min_count + '.pickle', wv2terms)
    save('data/terms_mc' + min_count + '.pickle', terms)
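# `get_tfidf` above is project-specific and not shown. As a rough illustration of the
# kind of step it presumably performs, here is a generic gensim sketch that computes
# per-document tf-idf weights over tokenized documents; the function name and the
# top_n cutoff are assumptions, not part of the original pipeline:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def tfidf_top_terms(tokenized_docs, top_n=10):
    dictionary = Dictionary(tokenized_docs)                 # token <-> id mapping
    bow = [dictionary.doc2bow(d) for d in tokenized_docs]   # bag-of-words corpus
    model = TfidfModel(bow)                                 # fit idf statistics
    # for each document, keep the top_n terms by tf-idf weight
    return [sorted(model[b], key=lambda x: -x[1])[:top_n] for b in bow]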
def inference_only(param_path):
    # -----
    # load data
    with open(param_path) as f:
        args = json.load(f)
    args = DotDict(args)

    out_dir = args.model_dir.replace('model', 'output')
    doc_vecs_path = os.path.join(out_dir, 'doc_vecs.npy')

    pub_med_ids, _ = read_file(args.documents_path)
    labels = load(args.labels_path)
    index2word = load(args.index2word_path)
    terms = load(args.terms_path)
    doc_vecs = np.load(doc_vecs_path)

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)

    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)
        np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)

    del doc_vecs, top_k_indices, fused_docs

    # ----------------------------
    # Save expanded labels to disk

    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]
    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
def make_docs():
    start = time.time()

    with open('raw_data/docs.txt') as docs62k:
        pcm_ids = [line.split('\t')[0] for line in docs62k]

    def find_line(line):
        if line.split('\t')[0] in pcm_ids:
            return line

    with open('raw_data/sampleval_and_SE_good_docs.txt') as raw:
        docs62k = Parallel(n_jobs=8)(delayed(find_line)(line) for line in raw)

    save_list('raw_data/raw_docs_62k.txt', docs62k)

    print("Finished making docs. Time: {}".format(time.time() - start))
def main():
    subreddits = []
    for i, offset in enumerate(range(0, 500, 100)):
        print(offset)
        if i > 0:
            sleep(2)
        source = get_url('http://redditmetrics.com/top/offset/{offset}'.format(offset=offset))
        soup = BeautifulSoup(source)
        cells = soup.find_all('td', string=lambda s: s[0:3] == '/r/')
        new_subreddits = [ cell.get_text()[3:] for cell in cells ]
        subreddits += new_subreddits
        print(new_subreddits)
    subreddits[:] = unique(subreddits)
    print('Saving these subreddits:', subreddits)
    save_list(subreddits, 'subreddits.txt')
    print('Done')
def main():
    subreddits = []
    for i, offset in enumerate(range(0, 500, 100)):
        print(offset)
        if i > 0:
            sleep(2)
        source = get_url('http://redditmetrics.com/top/offset/{offset}'.format(
            offset=offset))
        soup = BeautifulSoup(source)
        cells = soup.find_all('td', string=lambda s: s[0:3] == '/r/')
        new_subreddits = [cell.get_text()[3:] for cell in cells]
        subreddits += new_subreddits
        print(new_subreddits)
    subreddits[:] = unique(subreddits)
    print('Saving these subreddits:', subreddits)
    save_list(subreddits, 'subreddits.txt')
    print('Done')
def save_postings(self):
    o = sorted(self.posting_list, key=lambda x: x[0])
    self.locations_at_postings[str(self.counter_of_postings)] = utils.save_list(
        o, str(self.counter_of_postings), self.config.get_out_path())
    self.counter_of_postings += 1
    self.posting_dict = {}
    self.posting_list = []
def inference_from_checkpoint(
        pub_med_ids, docs, init_embed, labels, doc_tfidf_reps,
        index2word, terms, model, root_output_folder, folder, k):
    params = vars(args)
    params['embedding_dim'] = init_embed.shape[1]
    params['embeddings'] = init_embed

    # get estimator
    estimator = EncodeEstimator(params)

    doc_vecs, _ = estimator.predict(docs)

    # Get top-k indices for documents
    top_k_indices = get_docs_neighbors(doc_vecs, model, k)

    expanded_labels_dir = os.path.join(root_output_folder, folder)
    os.makedirs(expanded_labels_dir, exist_ok=True)

    top_k_indices_path = os.path.join(expanded_labels_dir, 'top_k_indices')
    save(top_k_indices_path, top_k_indices)

    expanded = get_expanded_terms(top_k_indices, doc_tfidf_reps)

    labels = [[terms[l] for l in labs] for labs in labels]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    expanded_labels_path = os.path.join(
        expanded_labels_dir, folder + '_expanded_labels_top1.txt')
    save_list(expanded_labels_path, expanded_labels)
def add_to_xml(self, event=None):
    self.btn_add.Disable()
    self.words = init_list()
    if self.in_xml():  # strange duplicate items
        return
    # if adding a word that is not yet in the db to the xml, save it to the db first.
    try:
        temp = Item.get(name=self.item.name)
    except Exception:
        temp = None
    if not temp:
        self.item.save(force_insert=True)  # only save() not save to db
    try:
        if self.words:
            self.words.insert(1, self.item.convert())
        else:
            self.words.insert(0, self.item.convert())
        save_list(self.words)
    except Exception as e:
        print(e)
def add_to_xml(self, e=None):
    self.btn_add.Disable()
    self.words = init_list()
    if self.in_xml():  # strange duplicate items
        return
    # if adding a word that is not yet in the db to the xml, save it to the db first.
    try:
        temp = Item.get(name=self.item.name)
    except Exception:
        temp = None
    if not temp:
        self.item.save(force_insert=True)  # only save() not save to db
    try:
        if self.words:
            self.words.insert(1, self.item.convert())
        else:
            self.words.insert(0, self.item.convert())
        save_list(self.words)
    except Exception as e:
        print e
def _save_trigger_list( self ):
    base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "trigger_list.txt" )
    utils.save_list( base_path, self.trigger_list, "Trigger List" )
def _save_watched_trivia_file( self ):
    base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "trivia_watched.txt" )
    #print self.watched
    utils.save_list( base_path, self.watched, "Watched Trivia" )
def main():
    # ---------
    # Load data
    args = get_args()

    docs = load(args.docs_path)
    labels = load(args.labels_path)
    terms = load(args.terms_path)

    if args.loss_fn == 'softmax_uniform' or args.loss_fn == 'softmax_skewed_labels':
        docs, labels = remove_samples_with_empty_labels(docs, labels)

    if args.test_mode != 0:
        docs = docs[:100]
        labels = labels[:100]

    zipped = list(zip(docs, labels))
    random.seed(42)
    random.shuffle(zipped)

    training_set_size = int(len(zipped) * 0.9)
    training_set = zipped[:training_set_size]
    docs_train, labels_train = [[*x] for x in zip(*training_set)]

    eval_set = zipped[training_set_size:]
    docs_eval, labels_eval = [[*x] for x in zip(*eval_set)]

    labels_lm_train = [[terms[l] for l in lab] for lab in labels_train]
    labels_lm_eval = [[terms[l] for l in lab] for lab in labels_eval]

    # get params
    params = vars(args)
    params['embeddings'] = np.load(args.word_vecs_path)
    params['embeddings_dim'] = params['embeddings'].shape[1]

    parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    if args.folder is not None:
        folder = args.folder
        params['model_dir'] = os.path.join(parent_path, 'results', 'models', folder)
        out_dir = os.path.join(parent_path, 'results', 'outputs', folder, 'direct')
    else:
        folder = '%d_%s_%s_%s_nl%s_kln%s_dp%s_ep%d_bs%d' % (
            int(time.time()), params['model'], params['loss_fn'],
            os.path.split(args.word_vecs_path)[-1], params['num_layers'],
            os.path.split(args.labels_path)[1], params['dropout'],
            params['num_epochs'], params['batch_size'])
        params['model_dir'] = os.path.join(parent_path, 'results', 'models', folder)
        out_dir = os.path.join(parent_path, 'results', 'outputs', folder)

    os.makedirs(out_dir, exist_ok=True)

    # ------
    # Encode
    estimator = EncodeEstimator(params)

    if args.folder is None:
        max_step = 1 if args.test_mode == 2 else None
        estimator.train_and_eval(
            docs_train, labels_lm_train, labels_train,
            docs_eval, labels_lm_eval, labels_eval, max_step=max_step)

    estimator.batch_size = 128  # takes less time with large batch size
    doc_vecs, pred_labels = estimator.predict(docs)

    pub_med_ids, _ = read_file(args.documents_path)
    index2word = load(args.index2word_path)

    # save predicted labels to disk
    pred_lab_words = []
    for p_id, lab in zip(pub_med_ids, pred_labels):
        pred_lab = ', '.join([index2word[terms[l]] for l in lab])
        line = str(p_id) + '\t' + pred_lab
        pred_lab_words.append(line)

    save_list(os.path.join(out_dir, 'pred_labels.txt'), pred_lab_words)

    if not args.keep_model_files:
        shutil.rmtree(params['model_dir'], ignore_errors=True)

    # write params to a txt file, except embeddings
    param_dir = os.path.join(out_dir, 'params.txt')
    if 'embeddings' in params:
        del params['embeddings']
    with open(param_dir, 'w') as f:
        f.write(json.dumps(params))
    del params

    print("Finished predicting.")
    if args.no_inference:
        exit(1)

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)
    np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)
    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)

    del doc_vecs, top_k_indices, fused_docs

    # ----------------------------
    # Save expanded labels to disk

    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]
    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
def _save_watched( self ):
    utils.save_list( self.watched_path, self.watched, "Watched Trailers" )
def close_handler(self, e=None):
    save_list(self.words)
    self.Destroy()
    if Status.running:
        self.master.btn_recite_handler()
                a == _y for a, _y in zip(
                    self._batch_input_out(x).argmax(1), y.argmax(1))
            ]) / len(y)
            print("Loss: %f, Accuracy: %f" % (loss, acc))
            self.history['epoch'].append(_ + 1)
            self.history['loss'].append(loss)
            self.history['acc'].append(acc)


if __name__ == '__main__':
    train_dataset = TrainDataSet(r'data/raw/train.tsv')
    test_dataset = TestDataSet(r'data/raw/test.tsv')

    train_feature = train_dataset.bow_feature
    train_label = train_dataset.labels

    from keras.utils import to_categorical
    train_label = to_categorical(train_label)

    test_feature = test_dataset.bow_feature

    print(train_feature.shape)
    print(train_label.shape)

    model = SoftmaxClassifier(train_feature.shape[1], train_label.shape[1])
    model.compile('sgd', 0.05)
    model.fit(train_feature, train_label, batch_size=256, epoch=30)

    from utils import save_list
    save_list(model.history['current_loss'], r'data/batch_loss.txt')
    save_list(model.history['current_acc'], r'data/batch_acc.txt')
    save_list(model.history['loss'], r'data/loss.txt')
    save_list(model.history['acc'], r'data/acc.txt')
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')
title_pp.set_cleaner(textacy_cleaner)

# process the title data
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

save_text("/tmp/train_title_raw.txt", train_title_raw[0])
save_text("/tmp/train_body_raw.txt", train_body_raw[0])
save_list("/tmp/train_title_vecs.txt", train_title_vecs[0])
save_list("/tmp/train_body_vecs.txt", train_body_vecs[0])

# Save the preprocessor.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
def refresh(self):
    global current_word
    current_word = {}  # clear current_word for fresh start
    self.update_text(self.message)
    self.controller.show_frame(StartPage)

def speak(self):
    global current_word
    if len(current_word) == 0:
        pass
    else:
        speaker = Synthesizer(voice='Alex', device='Built-in')
        speaker.text = current_word['word']
        speaker.talk()

def update_text(self, txt):
    self.text_box.configure(state='normal')
    self.text_box.delete('1.0', tk.END)
    self.text_box.insert(tk.END, txt)
    self.text_box.configure(state='disabled')


if __name__ == "__main__":
    app = Application()
    app.mainloop()
    # save the list in case the app window is simply closed
    save_list(StartPage.word_length, words_list, weights)
def _save_watched_trivia_file(self):
    base_path = os.path.join(BASE_CURRENT_SOURCE_PATH, "trivia_watched.txt")
    #print self.watched
    utils.save_list(base_path, self.watched, "Watched Trivia")
friends_set = set(get_friend_ids(api))
if exclude_followers:
    friends_set |= set(get_follower_ids(api))
friends_set.add(own_id)

for _1 in range(chain_depth):
    for _2 in range(curQ.qsize()):
        curUser = curQ.get()
        curUserFoers = get_followers(api, printing=False, screen_name=curUser)
        users_to_block = list(
            filter(
                lambda x: x.id not in friends_set and x.id not in ids_to_block,
                curUserFoers,
            ))
        if len(users_to_block) == 0:
            continue
        ids_to_block |= set(users_to_ids(users_to_block))
        for u in users_to_block:
            curQ.put(u.screen_name)

ids_to_block -= friends_set

if args.dry_run:
    print(f'Number of users to block: {len(ids_to_block)}.')
    if confirm('Save list of user-ids-to-block?', default=False):
        save_list(f'user_ids_to_block-{own_id}.list', ids_to_block)
    exit(0)

remove_users(api, ids_to_block, unblock=False)
def _save_trigger_list(self):
    base_path = os.path.join(BASE_CURRENT_SOURCE_PATH, "trigger_list.txt")
    utils.save_list(base_path, self.trigger_list, "Trigger List")
# Process the docs and save the tokens into
# a vocab file
from utils import process_docs
from utils import save_list
from utils import load_doc
from collections import Counter

vocab = Counter()
process_docs('dataset/txt_sentoken/neg', vocab)
process_docs('dataset/txt_sentoken/pos', vocab)
print("[INFO] Vocab length: {}".format(len(vocab)))

min_occurences = 2
tokens = [k for k, c in vocab.items() if c >= min_occurences]
print("[INFO] Vocab length after processing: {}".format(len(tokens)))

save_list(tokens, 'vocab.txt')

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
def main():
    # ---------
    # load data
    args = get_args()

    pub_med_ids, documents = read_file(args.documents_path)
    labels = load(args.labels_path)
    terms = load(args.terms_path)

    if args.test_mode != 0:
        documents = documents[:100]
        labels = labels[:100]

    # ------
    # Encode
    folder = str(int(time.time())) + '_doc2vec'
    model_dir = '../results/models/' + folder
    os.makedirs(model_dir, exist_ok=True)
    out_dir = '../results/outputs/' + folder
    os.makedirs(out_dir, exist_ok=True)

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(documents, vector_size=300, window=10,
                    min_count=5, workers=-1, epochs=20)
    model.save(model_dir + '/doc2vec.model')
    doc_vecs = model.docvecs.vectors_docs

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)
    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)
        np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)

    del doc_vecs, top_k_indices, fused_docs

    # ---------
    # save data

    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]
    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    index2word = load(args.index2word_path)

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    expanded_labels_dir = out_dir + '/' + out_dir.split(
        '/')[-1] + '_expanded_labels.txt'
    save_list(expanded_labels_dir, expanded_labels)
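# Usage note -- an illustration based on gensim's standard Doc2Vec API, not part of the
# original script: once saved, the model can be reloaded and used to embed unseen,
# tokenized documents. The path and tokens below are made up for the example (the real
# path includes the timestamped folder created above).
from gensim.models.doc2vec import Doc2Vec

loaded_model = Doc2Vec.load('../results/models/doc2vec.model')
unseen_vec = loaded_model.infer_vector(['gene', 'expression', 'profiling'])  # 300-dim vector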
import os

import numpy as np

from utils import save_list, load_txt_raw

"""
Script: Transfer .txt raw data to .npy data
"""

# Size: 8824330
# Dim: 200
# dict
#   word(str): vector(np.ndarray)

txt_dirs = r'data/txt_s/'

i = 1
for file in os.listdir(txt_dirs):
    fn = os.path.join(txt_dirs, file)
    p_model = load_txt_raw(fn)

    p_words = []
    p_vectors = []
    for k, v in p_model.items():
        p_words.append(k)
        p_vectors.append(v)

    save_list(p_words, r'data/array_raw/words_%d.txt' % i)

    p_vectors = np.array(p_vectors)
    np.save(r'data/array_raw/vectors_%d.npy' % i, p_vectors)

    i += 1
def merge_chunks(self):
    """
    performs a K-way merge on the posting files -> N disk accesses
    writes new posting files to the disk.
    :return:
    """
    saved_chunks = []
    chunks_indices = np.zeros(shape=(len(self.locations_at_postings)),
                              dtype=np.int32)
    chunk_length = self.postingDict_size // len(
        self.locations_at_postings) + 1

    # inserts the chunks into a chunked list
    for key in self.locations_at_postings:
        loaded, offset = utils.load_list(key, self.config.get_out_path(),
                                         self.locations_at_postings[key],
                                         chunk_length)
        saved_chunks.append(loaded)
        self.locations_at_postings[key] = offset

    building_list = []
    all_empty = True

    # loops as long as not all posting files have been exhausted.
    while all_empty:
        should_enter = -1

        # loops through as long as one of the chunks is not done.
        while should_enter == -1:
            term_to_enter = self.find_term(saved_chunks, chunks_indices)
            tuples_to_merge = []
            indexes_of_the_indexes_to_increase = []

            # find all tuples that should be merged and the indices that should be increased
            for idx, term_idx_in_chunk in enumerate(chunks_indices):
                if term_idx_in_chunk < len(saved_chunks[idx]) and \
                        saved_chunks[idx][term_idx_in_chunk][0] == term_to_enter:
                    tuples_to_merge.append(
                        saved_chunks[idx][term_idx_in_chunk])
                    indexes_of_the_indexes_to_increase.append(idx)

            merged_tuple = self.merge_terms_into_one(tuples_to_merge)
            appended_term = merged_tuple[0]
            should_append = True

            # if it is a named entity and it exists in less than 2 tweets, erase this term.
            if appended_term in self.entities_dict and self.entities_dict[
                    appended_term] < 2:
                should_append = False
                self.inverted_idx.pop(appended_term, None)

            # update terms with capital letters
            if appended_term in self.global_capitals and self.global_capitals[
                    appended_term]:
                merged_tuple = (appended_term.upper(), merged_tuple[1])
                inverted_val = self.inverted_idx[appended_term]
                self.inverted_idx.pop(appended_term, None)
                self.inverted_idx[appended_term.upper()] = inverted_val
                appended_term = merged_tuple[0]

            if appended_term in self.inverted_idx and self.inverted_idx[
                    appended_term][0] == 1:
                should_append = False
                self.inverted_idx.pop(appended_term, None)

            if should_append:
                self.accumulative_size += len(merged_tuple[1])
                building_list.append(merged_tuple)
                self.inverted_idx[merged_tuple[0]][1] = str(
                    self.counter_of_postings)

            # advance the indices whose tuples were just merged into the new posting
            for idx in indexes_of_the_indexes_to_increase:
                chunks_indices[idx] += 1

            should_enter = self.update_should_enter(
                saved_chunks, chunks_indices)

            # saving happens as soon as the size reaches the given max size of the final posting
            if self.accumulative_size >= self.max_accumulative:
                self.merged_dicts.append(str(self.counter_of_postings))
                utils.save_list(building_list, str(self.counter_of_postings),
                                self.config.get_out_path())
                self.accumulative_size = 0
                self.counter_of_postings += 1
                building_list = []

        # loads new chunks into saved_chunks at the relevant indices.
        for index in should_enter:
            loaded, offset = utils.load_list(
                str(index), self.config.get_out_path(),
                self.locations_at_postings[str(index)], chunk_length)
            saved_chunks[index] = loaded
            chunks_indices[index] = 0
            self.locations_at_postings[str(index)] = offset

        # checks whether all postings are done.
        all_empty = False
        for chunk in saved_chunks:
            if len(chunk) > 0:
                all_empty = True
                break

    # save of the last posting file.
    if len(building_list) > 0:
        self.merged_dicts.append(str(self.counter_of_postings))
        utils.save_list(building_list, str(self.counter_of_postings),
                        self.config.get_out_path())
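# The routine above is, at its core, an external K-way merge over posting chunks that
# are individually sorted by term. A minimal sketch of that core idea using only the
# standard library (ignoring the chunked reloads, entity filtering and capitalization
# handling done above); it assumes each chunk is a list of (term, postings) tuples:
import heapq
from itertools import groupby

def kway_merge(sorted_chunks):
    merged = heapq.merge(*sorted_chunks, key=lambda t: t[0])   # stream tuples in term order
    for term, group in groupby(merged, key=lambda t: t[0]):    # group identical terms
        postings = [p for _, plist in group for p in plist]    # concatenate their postings
        yield term, postings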
executor = ThreadPoolExecutor(max_workers=80)
cancelled = False
delete_failed_ids = []

def delete_tweet(tid):
    if cancelled:
        return
    try:
        print(f'deleting {tid}')
        api.DestroyFavorite(status_id=tid)
    except TwitterError as e:
        if e.message[0]['code'] == 144:
            return
        print(e)
        delete_failed_ids.append(tid)

try:
    for like in likes:
        executor.submit(delete_tweet, like['like']['tweetId'])
    executor.shutdown(wait=True)
except (KeyboardInterrupt, SystemExit):
    cancelled = True
    print('Interrupted, exiting...')

save_list('delete_failed_like_ids.list', delete_failed_ids)