Example No. 1
 def sort(self, e=None):
     self.words = init_list()
     self.btn_sort.Disable()
     self.words.sort(key=lambda x: x.score)
     save_list(self.words)
     time.sleep(1)
     self.btn_sort.Enable()
Example No. 2
 def sort(self, e=None):
     self.words = init_list()
     self.btn_sort.Disable()
     self.words.sort(key=lambda x: x.score)
     save_list(self.words)
     time.sleep(1)
     self.btn_sort.Enable()
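In Examples 1 and 2 (and in 19, 20, 25 and 32 below), save_list is called with only the word list itself, so the output location is evidently fixed inside the helper. A minimal sketch of what such a helper could look like, assuming a hard-coded target file and a plain line-per-item serialization (both assumptions; the real helper is not shown on this page):

# Hypothetical sketch only: the real save_list/init_list pair used by these
# wxPython snippets is not shown here. Assumes a fixed target path and that
# str(item) is an acceptable stand-in for the real serialization.
WORDS_PATH = "words.txt"  # assumed location

def save_list(words):
    with open(WORDS_PATH, "w", encoding="utf-8") as f:
        for item in words:
            f.write(str(item) + "\n")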
Example No. 3
def main():
    # ---------
    # load data
    args = get_args()
    labels = load(args.labels_path)
    terms = load(args.terms_path)
    pub_med_ids, _ = read_file(args.documents_path)
    index2word = load(args.index2word_path)

    # ------
    # Encode
    params = vars(args)
    params['dropout'] = args.dropout
    params['data_size'] = len(labels)
    params['embedding_dim'] = args.embedding_dim
    params['num_epochs'] = args.num_epochs
    params['batch_size'] = args.batch_size
    params['term_size'] = args.mlp_layer_dims[-1]
    params['word_vecs_path'] = args.embedded_sentences.split('/')[1].split('.')[0]

    # get estimator
    estimator = EncodeEstimator(params)

    # todo
    out_dir = ""

    with h5py.File(args.embedded_sentences, 'r') as f:
        def sen_gen():
            for i in docs_gen(f):
                yield i[0]

        def len_gen():
            for i in docs_gen(f):
                yield i[1]

        if args.test_mode == 2:
            estimator.train(sen_gen, len_gen, labels, 1)
        else:
            estimator.train(sen_gen, len_gen, labels)

        doc_vecs, pred_labels = estimator.predict(sen_gen, len_gen)

    # ---------
    # save data
    # encoder data
    os.makedirs(out_dir)

    # write params to a txt file, except embeddings
    param_dir = out_dir + '/params.txt'
    with open(param_dir, 'w') as f:
        f.write(json.dumps(params))

    pred_lab_words = []
    for p_id, lab in zip(pub_med_ids, pred_labels):
        pred_lab = ', '.join([index2word[terms[l]]
                              for l in lab])
        line = str(p_id) + '\t' + pred_lab
        pred_lab_words.append(line)

    save_list(out_dir + '/pred_labels.txt', pred_lab_words)
Example No. 4
def main(input_dir):
    out_file = input_dir.replace('.txt', '_phrase_embedded.txt')

    def process_one_docs(line):
        temp = line.split("\t")
        pmc_id = temp[0]

        # clean data
        text = temp[1].strip('\n. ').lower()  # remove '\n', white spaces and the last '.'

        # remove stop words
        text = ' '.join(
            [word for word in text.split()
             if word not in STOP_WORDS])

        blob = TextBlob(text)
        phrases = [np for np in blob.noun_phrases if 1 <= np.count(' ') <= 2]
        new_line = embed_phrases(text, phrases) if len(phrases) > 0 else text

        # test
        # num_underscore = sum(1 for word in new_line.split() if '_' in word)
        # num_phrases = len(phrases)
        # assert num_underscore == num_phrases

        return pmc_id + '\t' + new_line + '\n'

    with open(input_dir, encoding='utf-8') as input_file:
        new_documents = Parallel(n_jobs=-1)(
            delayed(process_one_docs)(line) for line in input_file)

    save_list(out_file, new_documents)
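Examples 3 and 4 use the two-argument form save_list(path, lines), which most of the remaining examples also follow (some projects, e.g. Examples 7, 15, 16, 26 and 34, reverse the order and pass the list first). A minimal sketch of the path-first variant, assuming it simply writes one string per line (an assumption, not the project's actual helper):

# Sketch of the save_list(path, lines) pattern used in Examples 3-4 and most
# later snippets; the real implementation is not shown, so this only captures
# the assumed behavior of writing one item per line.
def save_list(path, lines):
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(str(line) + '\n')

# e.g., as in Example 3: save_list(out_dir + '/pred_labels.txt', pred_lab_words)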
Example No. 5
 def save_index(self, fn):
     """
     Saves a pre-computed index (or indices) so we can save our work.
     Input:
           fn - file name of pickled index.
     """
     utils.save_list([self.inverted_idx, self.postingDict, self.documents],
                     fn)
Example No. 6
def downloader( mpaa, genre, equivalent_mpaa ):
    movie = ""
    download_trailers = []
    utils.log( "Starting Trailer Downloader", xbmc.LOGNOTICE )
    genre = genre.replace( "_", " / " )
    download_trailers = _download_trailers( equivalent_mpaa, mpaa, genre, movie )
    utils.log( "Saving List of Downloaded Trailers", xbmc.LOGNOTICE )
    base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "downloaded_trailers.txt" )
    utils.save_list( base_path, download_trailers, "Downloaded Trailers" )
Example No. 7
def main():
    
    separately_loaded_model = load_array_multiprocess()

    words = list(separately_loaded_model.keys())
    vecs = list(separately_loaded_model.values())

    save_list(words, r'data/array_raw/words.txt')
    np.save(r'data/array_raw/vectors.npy', vecs)
Example No. 8
def downloader(mpaa, genre, equivalent_mpaa):
    movie = ""
    download_trailers = []
    utils.log("Starting Trailer Downloader", xbmc.LOGNOTICE)
    genre = genre.replace("_", " / ")
    download_trailers = _download_trailers(equivalent_mpaa, mpaa, genre, movie)
    utils.log("Saving List of Downloaded Trailers", xbmc.LOGNOTICE)
    base_path = os.path.join(BASE_CURRENT_SOURCE_PATH,
                             "downloaded_trailers.txt")
    utils.save_list(base_path, download_trailers, "Downloaded Trailers")
Example No. 9
def clean_docs(file_name):
    start = time.time()
    with open(file_name) as f:
        docs = [line for line in f]

    cleaned = Parallel(n_jobs=-1)(delayed(clean_line)(doc) for doc in docs)

    save_list('data/' + file_name.split('/')[1].split('.')[0] + '_cleaned.txt', cleaned)

    print("Finished cleaning data. Time: {}".format(time.time() - start))
Example No. 10
def write_results_to_file(out_dir, pub_med_ids, labels, pred_labels, expanded):
    expanded_labels = []
    for p_id, lab, pl, ex in zip(pub_med_ids, labels, pred_labels, expanded):
        orig = ', '.join(lab)
        pred_lab = ', '.join(pl)
        e_lab = ', '.join(ex)
        line = str(p_id) + '\tORIGINAL: ' + orig + '\tPREDICTED: ' + pred_lab + '\tEXPANDED: ' + e_lab
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
Example No. 11
    def show_frame(self, cont):

        global words_list, weights

        if cont == StartPage:
            save_list(StartPage.word_length, words_list, weights)

        elif cont == FlashCards:
            words_list, weights = open_newlist(StartPage.word_length)
        else:
            print("should never occur")

        frame = self.frames[cont]
        frame.tkraise()
Example No. 12
def main():
    pub_med_ids, documents = read_file(args.i)
    settings = vars(args)

    # ---
    # w2v
    wv_dir = get_wv(documents, settings, args.m)

    min_count = str(settings['min_count'])

    keyed_vectors = KeyedVectors.load(wv_dir)
    vocab = keyed_vectors.vocab
    index2word = keyed_vectors.index2word

    save_list('data/index2word_mc' + min_count + '.txt', index2word)
    save('data/index2word_mc' + min_count + '.pickle', index2word)

    # -----
    # tfidf
    tfidf_model_dir = 'results/models/tfidf_model_mc' + min_count
    if os.path.isfile(tfidf_model_dir):
        print("This tfidf model has already been trained.")
        return
    labels, terms_tuples, wv2terms, doc_tfidf_reps, tfidf_model = get_tfidf(
        documents, vocab, args.n)

    tfidf_model.save(tfidf_model_dir)

    # ------------
    # save to disk

    # convert to word ids
    docs = [[vocab[token].index for token in d if token in vocab]
            for d in documents]

    terms_txt = ['{}\t{}'.format(index2word[t[0]], t[1])
                 for t in terms_tuples]

    # get rid of tfidf value and only keep word id
    terms = [t[0] for t in terms_tuples]

    labels_txt = ['{}\t{}'.format(pub_med_id,
                                  ', '.join([index2word[terms[l]]
                                             for l in lab]))
                  for pub_med_id, lab in zip(pub_med_ids, labels)]

    doc_tfidf_reps_txt = ['{}\t{}'.format(pub_med_id,
                                          ', '.join([index2word[l]
                                                     for l in lab]))
                          for pub_med_id, lab in zip(pub_med_ids, doc_tfidf_reps)]

    save_list('data/terms_mc' + min_count + '.txt', terms_txt)
    save_list('data/labels_mc' + min_count + '.txt', labels_txt)
    save_list('data/doc_tfidf_reps_mc' + min_count + '.txt', doc_tfidf_reps_txt)

    save('data/docs_word_indices_mc' + min_count + '.pickle', docs)
    save('data/labels_mc' + min_count + '.pickle', labels)
    save('data/doc_tfidf_reps_mc' + min_count + '.pickle', doc_tfidf_reps)
    save('data/wv2terms_mc' + min_count + '.pickle', wv2terms)
    save('data/terms_mc' + min_count + '.pickle', terms)
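Example 12 saves each artifact twice: save_list writes a human-readable .txt copy, while a companion save helper persists the same Python objects for the downstream scripts that call load (e.g. Example 13). A sketch of what that save/load pair might look like, assuming plain pickle files (the names match the calls on this page, but the implementation is assumed):

import pickle

# Assumed pickle-based companions to save_list; not the project's actual code.
def save(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load(path):
    with open(path, 'rb') as f:
        return pickle.load(f)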
Example No. 13
def inference_only(param_path):
    # -----
    # load data
    with open(param_path) as f:
        args = json.load(f)

    args = DotDict(args)
    out_dir = args.model_dir.replace('model', 'output')
    doc_vecs_path = os.path.join(out_dir, 'doc_vecs.npy')

    pub_med_ids, _ = read_file(args.documents_path)
    labels = load(args.labels_path)
    index2word = load(args.index2word_path)
    terms = load(args.terms_path)
    doc_vecs = np.load(doc_vecs_path)

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)
    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)
        np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)
    del doc_vecs, top_k_indices, fused_docs

    # ----------------------------
    # Save expanded labels to disk
    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]

    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
Example No. 14
def make_docs():
    start = time.time()
    with open('raw_data/docs.txt') as docs62k:
        pcm_ids = [line.split('\t')[0] for line in docs62k]

    def find_line(line):
        if line.split('\t')[0] in pcm_ids:
            return line

    with open('raw_data/sampleval_and_SE_good_docs.txt') as raw:
        docs62k = Parallel(n_jobs=8)(delayed(find_line)(line) for line in raw)

    save_list('raw_data/raw_docs_62k.txt', docs62k)

    print("Finished making docs. Time: {}".format(time.time() - start))
Example No. 15
def main():
    subreddits = []
    for i, offset in enumerate(range(0, 500, 100)): 
        print(offset)
        if i > 0: sleep(2)
        source = get_url('http://redditmetrics.com/top/offset/{offset}'.format(offset=offset))
        soup = BeautifulSoup(source)
        cells = soup.find_all('td', string=lambda s: s[0:3] == '/r/')
        new_subreddits = [ cell.get_text()[3:] for cell in cells ]
        subreddits += new_subreddits
        print(new_subreddits)

    subreddits[:] = unique(subreddits)
    print('Saving these subreddits:', subreddits)
    save_list(subreddits, 'subreddits.txt')
    print('Done')
Example No. 16
def main():
    subreddits = []
    for i, offset in enumerate(range(0, 500, 100)):
        print(offset)
        if i > 0: sleep(2)
        source = get_url('http://redditmetrics.com/top/offset/{offset}'.format(
            offset=offset))
        soup = BeautifulSoup(source)
        cells = soup.find_all('td', string=lambda s: s[0:3] == '/r/')
        new_subreddits = [cell.get_text()[3:] for cell in cells]
        subreddits += new_subreddits
        print(new_subreddits)

    subreddits[:] = unique(subreddits)
    print('Saving these subreddits:', subreddits)
    save_list(subreddits, 'subreddits.txt')
    print('Done')
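Examples 15 and 16 de-duplicate the scraped subreddit names with a unique helper while preserving the ranking order implied by the offsets. One way such a helper could be written (assumed; its real implementation is not shown):

# Order-preserving de-duplication, as the unique() call above appears to need.
def unique(items):
    return list(dict.fromkeys(items))

# unique(['pics', 'funny', 'pics']) -> ['pics', 'funny']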
Example No. 17
 def save_postings(self):
     o = sorted(self.posting_list, key=lambda x: x[0])
     self.locations_at_postings[str(
         self.counter_of_postings)] = utils.save_list(
             o, str(self.counter_of_postings), self.config.get_out_path())
     self.counter_of_postings += 1
     self.posting_dict = {}
     self.posting_list = []
Example No. 18
def inference_from_checkpoint(
        pub_med_ids,
        docs,
        init_embed,
        labels,
        doc_tfidf_reps,
        index2word,
        terms,
        model,
        root_output_folder,
        folder,
        k):
    params = vars(args)
    params['embedding_dim'] = init_embed.shape[1]
    params['embeddings'] = init_embed

    # get estimator
    estimator = EncodeEstimator(params)

    doc_vecs, _ = estimator.predict(docs)
    
    # Get top-k indices for documents
    top_k_indices = get_docs_neighbors(doc_vecs, model, k)
    
    expanded_labels_dir = os.path.join(root_output_folder, folder)
    os.makedirs(expanded_labels_dir, exist_ok=True)

    top_k_indices_path = os.path.join(expanded_labels_dir, 'top_k_indices')
    save(top_k_indices_path, top_k_indices)

    expanded = get_expanded_terms(top_k_indices, doc_tfidf_reps)

    labels = [[terms[l] for l in labs] for labs in labels]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    expanded_labels_path = os.path.join(expanded_labels_dir, folder + '_expanded_labels_top1.txt')
    save_list(expanded_labels_path, expanded_labels)
Example No. 19
 def add_to_xml(self, event=None):
     self.btn_add.Disable()
     self.words = init_list()
     if self.in_xml():  # strange duplicate items
         return
     # if add a word that is not in db now to xml, save it to db.
     try:
         temp = Item.get(name=self.item.name)
     except Exception:
         temp = None
     if not temp:
         self.item.save(force_insert=True)  # only save() not save to db
     try:
         if self.words:
             self.words.insert(1, self.item.convert())
         else:
             self.words.insert(0, self.item.convert())
         save_list(self.words)
     except Exception as e:
         print(e)
Example No. 20
 def add_to_xml(self, e=None):
     self.btn_add.Disable()
     self.words = init_list()
     if self.in_xml():  # strange duplicate items
         return
     # if add a word that is not in db now to xml, save it to db.
     try:
         temp = Item.get(name=self.item.name)
     except:
         temp = None
     if not temp:
         self.item.save(force_insert=True)  # only save() not save to db
     try:
         if self.words:
             self.words.insert(1, self.item.convert())
         else:
             self.words.insert(0, self.item.convert())
         save_list(self.words)
     except Exception as e:
         print(e)
Example No. 21
 def _save_trigger_list( self ):
     base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "trigger_list.txt" )
     utils.save_list( base_path, self.trigger_list, "Trigger List" )
Example No. 22
 def _save_watched_trivia_file( self ):
     base_path = os.path.join( BASE_CURRENT_SOURCE_PATH, "trivia_watched.txt" )
     #print self.watched
     utils.save_list( base_path, self.watched, "Watched Trivia" )
Example No. 23
def main():
    # ---------
    # Load data
    args = get_args()
    docs = load(args.docs_path)
    labels = load(args.labels_path)
    terms = load(args.terms_path)

    if args.loss_fn == 'softmax_uniform' or args.loss_fn == 'softmax_skewed_labels':
        docs, labels = remove_samples_with_empty_labels(docs, labels)

    if args.test_mode != 0:
        docs = docs[:100]
        labels = labels[:100]

    zipped = list(zip(docs, labels))
    random.seed(42)
    random.shuffle(zipped)
    training_set_size = int(len(zipped)*0.9)
    training_set = zipped[:training_set_size]
    docs_train, labels_train = [[*x] for x in zip(*training_set)]
    eval_set = zipped[training_set_size:]
    docs_eval, labels_eval = [[*x] for x in zip(*eval_set)]
    labels_lm_train = [[terms[l] for l in lab] for lab in labels_train]
    labels_lm_eval = [[terms[l] for l in lab] for lab in labels_eval]

    # get params
    params = vars(args)
    params['embeddings'] = np.load(args.word_vecs_path)
    params['embeddings_dim'] = params['embeddings'].shape[1]

    parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

    if args.folder is not None:
        folder = args.folder
        params['model_dir'] = os.path.join(parent_path, 'results', 'models', folder)
        out_dir = os.path.join(parent_path, 'results', 'outputs', folder, 'direct')
    else:
        folder = '%d_%s_%s_%s_nl%s_kln%s_dp%s_ep%d_bs%d' % (
            int(time.time()), params['model'], params['loss_fn'],
            os.path.split(args.word_vecs_path)[-1],
            params['num_layers'], os.path.split(args.labels_path)[1],
            params['dropout'], params['num_epochs'], params['batch_size'])

        params['model_dir'] = os.path.join(parent_path, 'results', 'models', folder)
        out_dir = os.path.join(parent_path, 'results', 'outputs', folder)
    os.makedirs(out_dir, exist_ok=True)

    # ------
    # Encode
    estimator = EncodeEstimator(params)

    if args.folder is None:
        max_step = 1 if args.test_mode == 2 else None
        estimator.train_and_eval(
            docs_train, labels_lm_train, labels_train,
            docs_eval, labels_lm_eval, labels_eval, max_step=max_step)

    estimator.batch_size = 128  # takes less time with large batch size
    doc_vecs, pred_labels = estimator.predict(docs)

    pub_med_ids, _ = read_file(args.documents_path)
    index2word = load(args.index2word_path)

    # save predicted labels to disk
    pred_lab_words = []
    for p_id, lab in zip(pub_med_ids, pred_labels):
        pred_lab = ', '.join([index2word[terms[l]] for l in lab])
        line = str(p_id) + '\t' + pred_lab
        pred_lab_words.append(line)

    save_list(os.path.join(out_dir, 'pred_labels.txt'), pred_lab_words)

    if not args.keep_model_files:
        shutil.rmtree(params['model_dir'], ignore_errors=True)

    # write params to a txt file, except embeddings
    param_dir = os.path.join(out_dir, 'params.txt')
    if 'embeddings' in params:
        del params['embeddings']
    with open(param_dir, 'w') as f:
        f.write(json.dumps(params))
    del params

    print("Finished predicting.")
    if args.no_inference:
        exit(1)

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)
    np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)
    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)
    del doc_vecs, top_k_indices, fused_docs

    # ----------------------------
    # Save expanded labels to disk
    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]

    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    fname = os.path.split(out_dir)[-1] + '_expanded_labels.txt'
    expanded_labels_dir = os.path.join(out_dir, fname)
    save_list(expanded_labels_dir, expanded_labels)
Example No. 24
 def _save_watched( self ):
     utils.save_list( self.watched_path, self.watched, "Watched Trailers" )
Example No. 25
 def close_handler(self, e=None):
     save_list(self.words)
     self.Destroy()
     if Status.running:
         self.master.btn_recite_handler()
Example No. 26
                a == _y for a, _y in zip(
                    self._batch_input_out(x).argmax(1), y.argmax(1))
            ]) / len(y)

            print("Loss: %f, Accuracy: %f" % (loss, acc))
            self.history['epoch'].append(_ + 1)
            self.history['loss'].append(loss)
            self.history['acc'].append(acc)


if __name__ == '__main__':
    train_dataset = TrainDataSet(r'data/raw/train.tsv')
    test_dataset = TestDataSet(r'data/raw/test.tsv')

    train_feature = train_dataset.bow_feature
    train_label = train_dataset.labels
    from keras.utils import to_categorical
    train_label = to_categorical(train_label)
    test_feature = test_dataset.bow_feature
    print(train_feature.shape)
    print(train_label.shape)

    model = SoftmaxClassifier(train_feature.shape[1], train_label.shape[1])
    model.compile('sgd', 0.05)
    model.fit(train_feature, train_label, batch_size=256, epoch=30)
    from utils import save_list
    save_list(model.history['current_loss'], r'data/batch_loss.txt')
    save_list(model.history['current_acc'], r'data/batch_acc.txt')
    save_list(model.history['loss'], r'data/loss.txt')
    save_list(model.history['acc'], r'data/acc.txt')
Example No. 27
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True,
                     keep_n=4500,
                     padding_maxlen=12,
                     padding='post')
title_pp.set_cleaner(textacy_cleaner)

# process the title data
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

save_text("/tmp/train_title_raw.txt", train_title_raw[0])
save_text("/tmp/train_body_raw.txt", train_body_raw[0])
save_list("/tmp/train_title_vecs.txt", train_title_vecs[0])
save_list("/tmp/train_body_vecs.txt", train_body_vecs[0])

# Save the preprocessor.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
Example No. 28
    def refresh(self):
        global current_word
        current_word = {}  # clear current_word for fresh start
        self.update_text(self.message)
        self.controller.show_frame(StartPage)

    def speak(self):
        global current_word
        if len(current_word) == 0:
            pass
        else:
            speaker = Synthesizer(voice='Alex', device='Built-in')
            speaker.text = current_word['word']
            speaker.talk()

    def update_text(self, txt):

        self.text_box.configure(state='normal')
        self.text_box.delete('1.0', tk.END)
        self.text_box.insert(tk.END, txt)
        self.text_box.configure(state='disabled')


if __name__ == "__main__":

    app = Application()
    app.mainloop()
    # save the list in case the app window is simply closed
    save_list(StartPage.word_length, words_list, weights)
Example No. 29
 def _save_watched_trivia_file(self):
     base_path = os.path.join(BASE_CURRENT_SOURCE_PATH,
                              "trivia_watched.txt")
     #print self.watched
     utils.save_list(base_path, self.watched, "Watched Trivia")
Example No. 30
    friends_set = set(get_friend_ids(api))
    if exclude_followers:
        friends_set |= set(get_follower_ids(api))
    friends_set.add(own_id)

    for _1 in range(chain_depth):
        for _2 in range(curQ.qsize()):
            curUser = curQ.get()
            curUserFoers = get_followers(api,
                                         printing=False,
                                         screen_name=curUser)
            users_to_block = list(
                filter(
                    lambda x: x.id not in friends_set and x.id not in
                    ids_to_block,
                    curUserFoers,
                ))
            if len(users_to_block) == 0:
                continue
            ids_to_block |= set(users_to_ids(users_to_block))
            for u in users_to_block:
                curQ.put(u.screen_name)

    ids_to_block -= friends_set
    if args.dry_run:
        print(f'Number of users to block: {len(ids_to_block)}.')
        if confirm('Save list of user-ids-to-block?', default=False):
            save_list(f'user_ids_to_block-{own_id}.list', ids_to_block)
        exit(0)
    remove_users(api, ids_to_block, unblock=False)
Example No. 31
 def _save_trigger_list(self):
     base_path = os.path.join(BASE_CURRENT_SOURCE_PATH, "trigger_list.txt")
     utils.save_list(base_path, self.trigger_list, "Trigger List")
Example No. 32
 def close_handler(self, e=None):
     save_list(self.words)
     self.Destroy()
     if Status.running:
         self.master.btn_recite_handler()
Example No. 33
 def _save_watched( self ):
     utils.save_list( self.watched_path, self.watched, "Watched Trailers" )
Example No. 34
# Process the docs and save the tokens into
# a vocab file
from utils import process_docs
from utils import save_list
from utils import load_doc
from collections import Counter

vocab = Counter()
process_docs('dataset/txt_sentoken/neg', vocab)
process_docs('dataset/txt_sentoken/pos', vocab)
print("[INFO] Vocab length: {}".format(len(vocab)))

min_occurences = 2
tokens = [k for k, c in vocab.items() if c >= min_occurences]
print("[INFO] Vocab length after processing: {}".format(len(tokens)))
save_list(tokens, 'vocab.txt')

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
Example No. 35
def main():
    # ---------
    # load data
    args = get_args()
    pub_med_ids, documents = read_file(args.documents_path)
    labels = load(args.labels_path)
    terms = load(args.terms_path)

    if args.test_mode != 0:
        documents = documents[:100]
        labels = labels[:100]

    # ------
    # Encode
    folder = str(int(time.time())) + '_doc2vec'
    model_dir = '../results/models/' + folder
    os.makedirs(model_dir, exist_ok=True)
    out_dir = '../results/outputs/' + folder
    os.makedirs(out_dir, exist_ok=True)

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(documents,
                    vector_size=300,
                    window=10,
                    min_count=5,
                    workers=-1,
                    epochs=20)
    model.save(model_dir + '/doc2vec.model')
    doc_vecs = model.docvecs.vectors_docs

    # ---------
    # Inference
    doc_tfidf_reps = labels
    if len(args.doc_tfidf_reps_path) > 0:
        doc_tfidf_reps = load(args.doc_tfidf_reps_path)

    fused_docs, expanded, top_k_indices = inference.main(
        doc_vecs, doc_tfidf_reps, args.k, args.fuse_doc_type)

    save(os.path.join(out_dir, 'top_k_indices'), top_k_indices)
    if args.keep_model_files:
        np.save(os.path.join(out_dir, 'fused_docs'), fused_docs)
        np.save(os.path.join(out_dir, 'doc_vecs'), doc_vecs)
    del doc_vecs, top_k_indices, fused_docs

    # ---------
    # save data

    # convert to word ids
    labels = [[terms[l] for l in lab] for lab in labels]

    if len(args.doc_tfidf_reps_path) == 0:
        expanded = [[terms[l] for l in lab] for lab in expanded]

    index2word = load(args.index2word_path)
    expanded_labels = []
    for p_id, l, ex in zip(pub_med_ids, labels, expanded):
        e_words = ', '.join([index2word[e] for e in ex])
        original = ', '.join([index2word[i] for i in l])
        line = str(p_id) + '\tORIGINAL: ' + original + '\tEXPANDED: ' + e_words
        expanded_labels.append(line)

    expanded_labels_dir = out_dir + '/' + out_dir.split(
        '/')[-1] + '_expanded_labels.txt'
    save_list(expanded_labels_dir, expanded_labels)
Example No. 36
import os

import numpy as np
from utils import save_list, load_txt_raw
"""
Script: Transfer .txt raw data to .npy data

"""
# Size: 8824330
# Dim: 200

# dict
# word(str): vector(np.ndarray)

txt_dirs = r'data/txt_s/'
i = 1
for file in os.listdir(txt_dirs):
    fn = os.path.join(txt_dirs, file)
    p_model = load_txt_raw(fn)
    p_words = []
    p_vectors = []

    for k, v in p_model.items():
        p_words.append(k)
        p_vectors.append(v)
    save_list(p_words, r'data/array_raw/words_%d.txt' % i)
    p_vectors = np.array(p_vectors)
    np.save(r'data/array_raw/vectors_%d.npy' % i, p_vectors)
    i += 1
Example No. 37
    def merge_chunks(self):
        """
        performs a K-way merge on the posting files -> N disk accesses
        writes new posting files to the disk.
        :return:
        """
        saved_chunks = []
        chunks_indices = np.zeros(shape=(len(self.locations_at_postings)),
                                  dtype=np.int32)
        chunk_length = self.postingDict_size // len(
            self.locations_at_postings) + 1
        #   inserts the chunks into a chunked list
        for key in self.locations_at_postings:
            loaded, offset = utils.load_list(key, self.config.get_out_path(),
                                             self.locations_at_postings[key],
                                             chunk_length)
            saved_chunks.append(loaded)
            self.locations_at_postings[key] = offset

        building_list = []
        all_empty = True

        # loop until every posting file has been fully consumed.
        while all_empty:
            should_enter = -1

            # loops through as long as one of the chunks is not done.
            while should_enter == -1:
                term_to_enter = self.find_term(saved_chunks, chunks_indices)
                tuples_to_merge = []
                indexes_of_the_indexes_to_increase = []

                # collect the tuples to merge for this term and note which chunk indices to advance
                for idx, term_idx_in_chunk in enumerate(chunks_indices):
                    if term_idx_in_chunk < len(saved_chunks[idx]) and \
                            saved_chunks[idx][term_idx_in_chunk][0] == term_to_enter:
                        tuples_to_merge.append(
                            saved_chunks[idx][term_idx_in_chunk])
                        indexes_of_the_indexes_to_increase.append(idx)

                merged_tuple = self.merge_terms_into_one(tuples_to_merge)
                appended_term = merged_tuple[0]
                should_append = True
                # if it is a named entity and it exists in less than 2 tweets, erase this term.
                if appended_term in self.entities_dict and self.entities_dict[
                        appended_term] < 2:
                    should_append = False
                    self.inverted_idx.pop(appended_term, None)
                # update terms with capital letters
                if appended_term in self.global_capitals and self.global_capitals[
                        appended_term]:
                    merged_tuple = (appended_term.upper(), merged_tuple[1])
                    inverted_val = self.inverted_idx[appended_term]
                    self.inverted_idx.pop(appended_term, None)
                    self.inverted_idx[appended_term.upper()] = inverted_val
                appended_term = merged_tuple[0]
                if appended_term in self.inverted_idx and self.inverted_idx[
                        appended_term][0] == 1:
                    should_append = False
                    self.inverted_idx.pop(appended_term, None)
                if should_append:
                    self.accumulative_size += len(merged_tuple[1])
                    building_list.append(merged_tuple)
                    self.inverted_idx[merged_tuple[0]][1] = str(
                        self.counter_of_postings)

                # advance the index of every chunk whose tuple was merged into the new posting
                for idx in indexes_of_the_indexes_to_increase:
                    chunks_indices[idx] += 1

                should_enter = self.update_should_enter(
                    saved_chunks, chunks_indices)

                # saving happens as soon as the size reaches given max size of the final posting
                if self.accumulative_size >= self.max_accumulative:
                    self.merged_dicts.append(str(self.counter_of_postings))
                    utils.save_list(building_list,
                                    str(self.counter_of_postings),
                                    self.config.get_out_path())
                    self.accumulative_size = 0
                    self.counter_of_postings += 1
                    building_list = []
            # loads new chunks into the save_chunks list in the relevant indices.
            for index in should_enter:
                loaded, offset = utils.load_list(
                    str(index), self.config.get_out_path(),
                    self.locations_at_postings[str(index)], chunk_length)
                saved_chunks[index] = loaded
                chunks_indices[index] = 0
                self.locations_at_postings[str(index)] = offset

            # checks whether all postings are done.
            all_empty = False
            for chunk in saved_chunks:
                if len(chunk) > 0:
                    all_empty = True
                    break

        # save of the last posting file.
        if len(building_list) > 0:
            self.merged_dicts.append(str(self.counter_of_postings))
            utils.save_list(building_list, str(self.counter_of_postings),
                            self.config.get_out_path())
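Example 37 hand-rolls a K-way merge over sorted posting chunks so that each chunk can be streamed from disk in fixed-size pieces. When the chunks fit in memory, the same merge-by-term step can be expressed more compactly with heapq and groupby; the sketch below only illustrates that core step and deliberately omits the chunk reloading, entity filtering and capitalization handling done above:

import heapq
from itertools import groupby

# Compact illustration of the merge step, assuming each chunk is an in-memory
# list of (term, postings) tuples already sorted by term (as the sort in
# Example 17's save_postings suggests). Not a drop-in replacement for merge_chunks.
def merge_chunks_simple(chunks):
    merged = []
    stream = heapq.merge(*chunks, key=lambda t: t[0])
    for term, group in groupby(stream, key=lambda t: t[0]):
        postings = []
        for _, chunk_postings in group:
            postings.extend(chunk_postings)
        merged.append((term, postings))
    return merged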
Example No. 38
executor = ThreadPoolExecutor(max_workers=80)

cancelled = False

delete_failed_ids = []


def delete_tweet(tid):
    if cancelled:
        return
    try:
        print(f'deleting {tid}')
        api.DestroyFavorite(status_id=tid)
    except TwitterError as e:
        if e.message[0]['code'] == 144:
            return
        print(e)
        delete_failed_ids.append(tid)


try:
    for like in likes:
        executor.submit(delete_tweet, like['like']['tweetId'])
    executor.shutdown(wait=True)
except (KeyboardInterrupt, SystemExit):
    cancelled = True
    print('Interrupted, exiting...')

save_list('delete_failed_like_ids.list', delete_failed_ids)