Example #1
def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]

    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens

    rare_tokens, selected_tokens = get_rare_tokens(all_tokens,
                                                   args.min_freq,
                                                   args.max_tokens,
                                                   return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [
            filter_tokens(entry_tokens, set(rare_tokens))
            for entry_tokens in doc_tokens
        ]

    gu = GloVeUtility(args.path_to_glove)

    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower,
                                              args.ngram_upper),
                                 vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform(
        [" ".join(entry_tokens) for entry_tokens in doc_tokens])

    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)

    cooccur_ar = csr_mat.toarray()

    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar,
                                   vocab=selected_tokens,
                                   initial_embedding_dict=gu.vector_dict)

    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)

    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(
        len(embeddings_dict)))

    if args.save_new_only:
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
Example #2
    def glove_finetuned_embeddings(self):
        helper._print_header('Getting fine-tuned GloVe embeddings')
        self.glove_download_pretrained_model()
        sentences = self.get_enron_sentences()
        vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path,
                                    self.build_vocab, sentences)
        # idx2word = {i: word for word, i in word2idx.items()}
        print(len(vocab))
        cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path,
                                      self.build_cooccur,
                                      vocab,
                                      sentences,
                                      type='numpy')
        print(np.shape(cooccur))
        pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
        helper._print_subheader('Starting Mittens model...')
        mittens_model = Mittens(n=self.dimensions,
                                max_iter=1000,
                                display_progress=1,
                                log_dir=FLAGS.glove_dir + 'mittens/')
        finetuned_embeddings = mittens_model.fit(
            cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
        print(finetuned_embeddings)

        return 'test', 'test', 'test'
Example #3
def train_mittens(coocc_ar, oov_vocabs, pre_glove, emb_dim=cfg['embeddings']['emb_dim'], max_iter=200,
                  glove_oov_save_path=None, dataset_dir=dataset_dir,
                  embedding_file=cfg["embeddings"]["embedding_file"],
                  dataset_name=cfg['data']['train'] + cfg['data']['test']):
    """

    :param coocc_ar:
    :param oov_vocabs:
    :param pre_glove:
    :param emb_dim:
    :param max_iter:
    :param glove_oov_save_path:
    :param dataset_dir:
    :param embedding_file:
    :param dataset_name:
    :return:
    """
    mittens_model = Mittens(n=emb_dim, max_iter=max_iter)

    new_embeddings = mittens_model.fit(
        coocc_ar,
        vocab=oov_vocabs,
        initial_embedding_dict=pre_glove)

    newglove = dict(zip(oov_vocabs, new_embeddings))
    if glove_oov_save_path is None:
        glove_oov_save_path = join(dataset_dir, embedding_file + dataset_name +
                                   '_oov.pkl')
    f = open(glove_oov_save_path, "wb")
    pickle.dump(newglove, f)
    f.close()

    return newglove
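A minimal usage sketch for train_mittens above; the token list, toy co-occurrence counts, GloVe path and save path are illustrative placeholders, and glove2dict is assumed to be a loader like the one in Example #10 that returns a {word: vector} dict:

oov_vocabs = ["newterm", "otherterm"]                    # illustrative OOV tokens
coocc_ar = np.ones((len(oov_vocabs), len(oov_vocabs)))   # toy co-occurrence counts
pre_glove = glove2dict("glove.6B.300d.txt")              # pre-trained vectors as {word: np.array}
oov_vectors = train_mittens(coocc_ar, oov_vocabs, pre_glove,
                            emb_dim=300, max_iter=200,
                            glove_oov_save_path="oov_glove.pkl")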
Example #4
def create_monthly_glove_models(begin_month=None):
    model = Mittens(n=300, max_iter=1000)
    vocab, embedding = glove2dict(TRUNCATED_GLOVE_EMBEDDING)
    months = arrow.Arrow.span_range('month', arrow.get(START_MONTH), arrow.get(END_MONTH))
    for begin, end in months:
        print("Training mittens model for {}".format(begin.format("YYYY-MM")))
        print("  loading cooccurrence matrix")
        coo_matrix = np.load(get_month_cooccurrence_matrix_filepath(begin.year, begin.month))
        print("  training")
        embedding = model.fit(coo_matrix, vocab=vocab, initial_embedding_dict=embedding)
        print("  saving")
        np.save(get_month_glove_embedding_filepath(begin.year, begin.month), embedding)
Example #5
def loadGloVe(co_occur, dim_embed, vocab, is_fine):
    if is_fine:
        glove_original = glove2dict(glove_filename)

        mittens_model = Mittens(n=dim_embed, max_iter=5000)

        embeddings = mittens_model.fit(np.asarray(co_occur),
                                       vocab=vocab,
                                       initial_embedding_dict=glove_original)
    else:
        glove_model = GloVe(n=dim_embed, max_iter=5000)
        embeddings = glove_model.fit(np.asarray(co_occur))

    return embeddings
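A possible call of loadGloVe above, assuming glove_filename points at a GloVe file whose dimension matches dim_embed, and that co_occur and vocab come from a co-occurrence builder like the ones in the other examples:

embeddings = loadGloVe(co_occur, dim_embed=300, vocab=vocab, is_fine=True)
embedding_dict = dict(zip(vocab, embeddings))   # pair each vocab token with its fine-tuned vector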
Example #6
def glove_embedding(filename, vocab_file, cooccurence_file, domain):
    gv = Glove()
    out_dir = './preprocessed_data/' + domain
    if vocab_file and cooccurence_file:
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(
            './pretrained_embeddings/glove.6B/glove.6B.300d.txt')
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(
            cooccurence,
            vocab=vocab,
            initial_embedding_dict=original_embedding)
        fin = open(out_dir + '/fine_tuned_glove_300', 'wb')
        pickle.dump(new_embeddings, fin)
        fin.close()
        logger.info('Fine tuning complete')
    else:
        logger.info('Load english data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = out_dir + '/vocab.pkl'
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences,
                                                      len(vocab))
        cooccurrence_file = out_dir + '/cooccurrence.pkl'
        outfile = open(cooccurrence_file, 'wb')
        pickle.dump(cooccurrences, outfile)
        outfile.close()
        logger.info("Cooccurrence list fetch complete (%i pairs).\n",
                    cooccurrences.shape[0])
Example #7
def batch_finetune(finetune_glove, batch_word, dimension):
    oov = [token for token in batch_word if token not in finetune_glove.keys()]

    en_doc = [' '.join(batch_word)]

    corp_vocab = list(set(oov))
    cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
    X = cv.fit_transform(en_doc)
    Xc = (X.T * X)
    Xc.setdiag(0)
    coocc_ar = Xc.toarray()

    mittens_model = Mittens(n=dimension, max_iter=1800)
    new_embeddings = mittens_model.fit(
      coocc_ar,
      vocab=corp_vocab,
      initial_embedding_dict=finetune_glove)

    newglove = dict(zip(corp_vocab, new_embeddings))
    finetune_glove.update(newglove)
    return finetune_glove
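A sketch of how batch_finetune above might be driven over several batches; batches is an illustrative iterable of token lists, and glove2dict is assumed to be a {word: vector} loader as in Example #10:

finetune_glove = glove2dict("glove.6B.300d.txt")    # start from the pre-trained vectors
for batch_word in batches:                          # each batch_word is a list of tokens
    finetune_glove = batch_finetune(finetune_glove, batch_word, dimension=300)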
Example #8
    def train_and_save_finetuned_embeddings(self):
        sentences = self.get_enron_sentences()
        vocab = self.build_vocab(sentences)
        if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
            # idx2word = {i: word for word, i in word2idx.items()}
            cooccur = self.build_cooccur(vocab, sentences)
            pretrained_embeddings = self.glove2dict(
                directories.GLOVE_EMBEDDING_FILE_PATH)
            helper._print(
                f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set'
            )
            helper._print_subheader('Building model...')
            mittens_dir = directories.GLOVE_DIR + 'mittens/'
            if not os.path.isdir(mittens_dir):
                os.makedirs(mittens_dir)
            mittens_model = Mittens(n=self.dimensions,
                                    xmax=100,
                                    max_iter=10000,
                                    display_progress=10,
                                    learning_rate=0.05,
                                    alpha=0.75,
                                    tol=1e-4,
                                    log_dir=mittens_dir,
                                    mittens=0.1)
            helper._print_subheader('Training Mittens model...')
            finetuned_embeddings = mittens_model.fit(
                cooccur,
                vocab=vocab,
                initial_embedding_dict=pretrained_embeddings)
            print()
            helper._print_subheader(
                'Done training finetuned embeddings! Merging with pre-trained embeddings...'
            )
            resulting_embeddings = pretrained_embeddings
            for word, weights in zip(vocab.keys(), finetuned_embeddings):
                resulting_embeddings[word] = weights
            self.dict2glove(resulting_embeddings,
                            directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
            return vocab, cooccur, resulting_embeddings
        return vocab, None, None
Example #9
def glove_embedding(filename, vocab_file, cooccurence_file, lang):
    gv = Glove()
    if vocab_file and cooccurence_file:
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(config.glove_pretrained_emb[lang])
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(cooccurence, vocab=vocab,
                                           initial_embedding_dict=original_embedding)
        fin = open(config.glove_fine_tuned_emb[lang], 'wb')
        pickle.dump(new_embeddings, fin)
        fin.close()
        logger.info('Fine tuning complete')
    else:
        if lang == 'de':
            logger.info('Load german data')
        elif lang == 'en':
            logger.info('Load english data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = config.glove_fine_tuned_vocab[lang]
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences, len(vocab))
        cooccurrence_file = config.glove_fine_tuned_cooccurance[lang]
        #outfile = open(cooccurrence_file, 'wb')
        joblib.dump(cooccurrences, cooccurrence_file)
        #outfile.close()
        logger.info("Cooccurrence list fetch complete (%i pairs).\n", cooccurrences.shape[0])
Example #10
print("Creating co-occurance matrix")

co_matrix = np.zeros((5000, 5000))
for word1, word2 in co_dict.keys():
    co_matrix[top_5k[word1], top_5k[word2]] = co_dict[(word1, word2)]


def glove2dict(glove_filename):
    with open(glove_filename) as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {
            line[0]: np.array(list(map(float, line[1:])))
            for line in reader
        }
    return embed


print("Training GloVe")

original_embeddings = glove2dict("glove.6B/glove.6B.200d.txt")
vocab_array = vocab.keys()
mittens_model = Mittens(n=200, max_iter=2000)
new_embeddings = mittens_model.fit(co_matrix,
                                   vocab=top_5k.keys(),
                                   initial_embedding_dict=original_embeddings)

np.save('GloVe_wine_5k.npy', new_embeddings)

print("Done")
Example #11
# corp_vocab = list(set(oov) - set(oov_rare))
#corp_vocab = get_freqw(all_texts_tokenized_clean, 10000)
#pickle.dump(corp_vocab, open("vocab_clpsych_10000.pkl", "wb+"))
corp_vocab = pickle.load(open("all_vocab_clpsych_erisk_stop_40000.pkl", "rb"))
original_glove = {k: v for k, v in pre_glove.items() if k in corp_vocab}
pickle.dump(original_glove,
            open("original_glove_clpsych_erisk_stop_40000.pkl", "wb+"))

# Train with mittens
print("Computing cooccurrence matrix...")
#cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
#X = cv.fit_transform([all_texts])
#Xc = (X.T * X)
#Xc.setdiag(0)
#coocc_ar = Xc.toarray()
#pickle.dump(coocc_ar, open("coocc_mat_clpsych_erisk_stop_40000.pkl", "wb+"), protocol=4)
coocc_ar = pickle.load(open("coocc_mat_clpsych_erisk_stop_40000.pkl", "rb"))
#coocc_ar = pickle.load(open("coocc_mat_clpsych_oov2.pkl", "rb"))

print("Training with mittens...")
mittens_model = Mittens(n=100, max_iter=1000, mittens=0.2)
new_embeddings = mittens_model.fit(coocc_ar,
                                   vocab=corp_vocab,
                                   initial_embedding_dict=pre_glove)

print("Serializing embeddings...")
newglove = dict(zip(corp_vocab, new_embeddings))
f = open("finetuned_glove_clpsych_erisk_stop_40000.pkl", "wb")
pickle.dump(newglove, f)
f.close()
Example #12
def fine_tune_glove(ID,
                    train_type,
                    doc_name="data/fine_tune_docs/pro_from_collection",
                    glove_file="glove.6B.50d.txt",
                    iteration=2000,
                    glove_dim=50,
                    restrict=0,
                    normal=True,
                    stop_word_list='english'):
    """
    The wrapper function for fine tuning GloVe
    ID: identifier for the experiment
    train_type: one of "pro", "con", "all"
    doc_name: doc_name for the training reviews
    glove_file: public glove file to use
    iteration: how many iteration to train
    glove_dim: dimension for glove embedding
    restrict: restrict the number of documents to be read, reads all if restrict is 0
    normal: whether to normalize the coocurrence matrix or not

    return: nothing, saves embedding in three files
    """
    assert (train_type in ["pro", "con", "all"])
    #read sentences
    print("reading training file")
    docs = read_doc(doc_name, restrict=restrict)
    #create co-occurrence matrix
    if stop_word_list != 'english':
        stop_word_file = "data/fine_tune_docs/" + train_type + "_stop_words"
        stop_word_list = read_stop_word(stop_word_file)
    coocur_model = Cooccurrence(ngram_range=(1, 1),
                                stop_words=stop_word_list,
                                normalize=normal)
    Xc = coocur_model.fit_transform(docs)  # co-occurrence matrix
    Xc = np.squeeze(np.asarray(Xc.todense()))
    print(Xc.shape)
    #read public GloVe embedding
    print("reading glove original embedding")
    original_embedding = simple_glove2dict(glove_file)
    #create vocab
    print("creating vocabulary")
    vocab = create_word_list(coocur_model.vocabulary_)
    print("vocab_size:", len(vocab))
    #prepare for fine tune
    mittens_model = Mittens(n=glove_dim, max_iter=iteration)

    #fine tune GloVe!
    print("training started")
    new_embeddings = mittens_model.fit(
        Xc, vocab=vocab, initial_embedding_dict=original_embedding)

    print("training finished")
    #storing it in a way that can be used at https://projector.tensorflow.org/
    with open(
            "result/" + ID + "_" + train_type + "_" + str(iteration) +
            "_embedding.tsv", "w") as f:
        for array in new_embeddings:
            for number in array:
                f.write(str(number) + "\t")
            f.write("\n")

    with open(
            "result/" + ID + "_" + train_type + "_" + str(iteration) +
            "_vocab.tsv", "w") as f2:
        for word in vocab:
            f2.write(word + "\n")

    #storing it in a way for the common glove readers
    #this should be ready to be read by simple_glove2dict above
    # and glove2dict function in /utils/vec_function
    with open(
            "result/" + ID + "_" + train_type + "_" + str(iteration) +
            "_word2vectorGloVe." + str(glove_dim) + "d.txt", "w") as f3:
        for index, word in enumerate(vocab):
            f3.write(word + " ")
            for number in new_embeddings[index]:
                f3.write(str(number) + " ")
            f3.write("\n")
    print("file written")
pragmatic_vocab = intersection(pragmatic_list, vocab_total)
union_vocab = union(vocab_top_5000, pragmatic_vocab)
pd.DataFrame(union_vocab).to_csv('union_vocab.csv')

cooccurence_matrix = get_cooccurence_matrix(vocab_path, text_data_path,
                                            stock_list_path)
df = pd.DataFrame(cooccurence_matrix)
df.to_csv('coocur_union.csv')

vocab = pd.read_csv('union_vocab.csv')
vocabulary = dict(zip(vocab.iloc[:, 1], range(0, len(vocab))))
vocab = vocabulary.keys()
cooccurrence = pd.read_csv('coocur_union.csv').iloc[:, 1:].to_numpy()

mittens_model = Mittens(n=300, max_iter=1000)
original_embedding = glove2dict('../../data/glove.6B.300d.txt')
new_embeddings = mittens_model.fit(cooccurrence,
                                   vocab=vocab,
                                   initial_embedding_dict=original_embedding)

print("MITTENS TRAINED")
filename_train = "../../data/train_pk.json"
filename_test = "../../data/test_pk.json"
filename_val = "../../data/val_pk.json"

maxlen = 0
maxlen = max(maxlen, get_max_len(filename_train))
maxlen = max(maxlen, get_max_len(filename_val))
maxlen = max(maxlen, get_max_len(filename_test))