def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''
        generator function to read in docs from the file,
        and substitute and remove substrings
        '''
        for doc in corpus:
            yield au_tu.remove_substrings(au_tu.clean_tokens(
                doc,
                tokens=to_replace_dict,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case,
            ),
                                          to_remove_list=to_remove_list,
                                          whole_words_only=whole_words_only,
                                          ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(
                ' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''
        generator function to use spaCy to parse docs, clean docs,
        tokenize named entities, and yield documents
        '''
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''
        helper function to eliminate tokens that are
        pure punctuation or whitespace or digits or only 1 character
        '''
        return (
            token.is_punct or token.is_space or token.is_digit
            or token.text == "'s" or token.lemma_ == '-PRON-' or
            # token.lemma_ == 'say' or
            # token.lemma_ == 'tell' or
            # token.lemma_ == 'be' or
            len(token.text) <= 1)

    def line_doc(filename):
        '''
        generator function to read in docs from the file,
        un-escape the original line breaks in the text,
        and do additional cleaning
        '''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (\w), whitespace (\s), single quotes, underscores
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace(
                    '\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''
        generator function to use spaCy to parse docs,
        lemmatize the text, and yield sentences
        '''
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            for sent in parsed_doc.sents:
                yield ' '.join([
                    token.lemma_ for token in sent
                    if not punct_space_more(token)
                ])

    if verbose:
        logger.info(f'Working on text from: {col}')

    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()

    df_phrased = df.loc[df[col].notnull(),
                        ['tfa_master_uid', 'app_year', col]].copy()

    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w',
                     encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x),
                                           axis=1).tolist():
            # write the doc as a line in the new file
            # escape newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(
            f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}"
        )

    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences

    if overwrite_interim:
        if verbose:
            logger.info(
                f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}"
            )
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(
                    filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
            if verbose:
                logger.info('Done.')
        unigram_sentences = LineSentence(
            filepath_dict['unigram_sentences_filepath'])

        if verbose:
            logger.info('Unigram examples:')
            for unigram_sentence in it.islice(unigram_sentences, 10, 20):
                logger.info(u' '.join(unigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding bigram phrases')
        # create the bigram model
        bigram = Phrases(unigram_sentences,
                         min_count=phrase_min_count,
                         threshold=phrase_threshold,
                         max_vocab_size=phrase_max_vocab_size,
                         progress_per=phrase_progress_per,
                         scoring=phrase_scoring,
                         common_terms=phrase_common_terms)
        bigram_model = Phraser(bigram)
        bigram_model.save(filepath_dict['bigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}"
            )
        # save bigram sentences
        with codecs.open(filepath_dict['bigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for unigram_sentence in unigram_sentences:
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')

        bigram_sentences = LineSentence(
            filepath_dict['bigram_sentences_filepath'])
        if verbose:
            logger.info('Bigram examples:')
            for bigram_sentence in it.islice(bigram_sentences, 10, 20):
                logger.info(u' '.join(bigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding trigram phrases')
        # create the trigram model
        trigram = Phrases(bigram_sentences,
                          min_count=phrase_min_count,
                          threshold=phrase_threshold,
                          max_vocab_size=phrase_max_vocab_size,
                          progress_per=phrase_progress_per,
                          scoring=phrase_scoring,
                          common_terms=phrase_common_terms)
        trigram_model = Phraser(trigram)
        trigram_model.save(filepath_dict['trigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}"
            )
        # create trigram sentences
        with codecs.open(filepath_dict['trigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')

        trigram_sentences = LineSentence(
            filepath_dict['trigram_sentences_filepath'])
        if verbose:
            logger.info('Trigram examples:')
            for trigram_sentence in it.islice(trigram_sentences, 10, 20):
                logger.info(u' '.join(trigram_sentence))
                logger.info('=' * 30)

    if verbose:
        logger.info(
            f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}"
        )
    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            # removing punctuation and whitespace
            unigram_doc = [
                token.lemma_ for token in parsed_doc
                if not punct_space_more(token)
            ]

            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]

            # remove any remaining stopwords
            trigram_doc = [
                term for term in trigram_doc
                if term not in nlp.Defaults.stop_words
            ]

            # extended stop words (could be defined once outside the loop)
            stop_words_extended = [
                'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
                'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
                'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want',
                'seem', 'run', 'need', 'even', 'right', 'line', 'also',
                'may', 'take', 'come'
            ]
            trigram_doc = [
                term for term in trigram_doc if term not in stop_words_extended
            ]

            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])

    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError(
            'Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(
            filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
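A minimal usage sketch for `nlp_preprocess` above. The dictionary keys are the ones the function actually reads; the concrete paths, dataframe, and column name are placeholders, and the function also assumes module-level settings (to_replace_dict, to_remove_list, whole_words_only, ignore_case, nlp_batch_size, nlp_n_threads, the phrase_* options, and logger) are already defined.

# Hypothetical call; the paths are placeholders, and df must contain
# 'tfa_master_uid', 'app_year' and the text column passed as col.
filepath_dict = {
    'doc_txt_filepath': 'interim/docs.txt',
    'unigram_sentences_filepath': 'interim/unigram_sentences.txt',
    'bigram_model_filepath': 'interim/bigram_model',
    'bigram_sentences_filepath': 'interim/bigram_sentences.txt',
    'trigram_model_filepath': 'interim/trigram_model',
    'trigram_sentences_filepath': 'interim/trigram_sentences.txt',
    'trigram_docs_filepath': 'interim/trigram_docs.txt',
    'filepath_out': 'processed/nlp_processed.csv',
}

df_processed = nlp_preprocess(filepath_dict=filepath_dict,
                              col='application_text',
                              df=df_raw,
                              verbose=True,
                              overwrite_interim=True)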
Example #2
# -*- coding: utf-8 -*-
# train_word2vec_model.py is used to train the model
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]
    model = Word2Vec(LineSentence(inp),
                     size=400,
                     window=5,
                     min_count=5,
                     workers=multiprocessing.cpu_count())
    # trim unneeded model memory = use(much) less RAM
    #model.init_sims(replace=True)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #3
# -*- coding: utf-8 -*-

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import time

# train the model
t0 = int(time.time())
sentences = LineSentence('wiki.zh.word.text')
# size: dimensionality of the word vectors
# window: maximum distance between the current and context words; default 5, and a value in [5, 10] is recommended for typical corpora
# sg: 0 selects the CBOW model, 1 selects Skip-Gram; default 0
# hs: 0 selects negative sampling, 1 selects hierarchical softmax; default 0
# negative: number of negative samples when negative sampling is used; default 5, recommended in [3, 10]
# min_count: minimum word frequency for a word to get a vector, filtering out very rare words; default 5
# iter: maximum number of SGD epochs; default 5
# alpha: initial SGD learning rate; default 0.025
# min_alpha: floor of the decaying learning rate; the per-epoch rate is derived from iter, alpha and min_alpha
model = Word2Vec(sentences, size=128, window=5, min_count=5, workers=4)
print('Training took %d s' % (int(time.time()) - t0))

# save the model
model.save('gensim_128')
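The comments above describe the pre-4.0 gensim keyword names. A couple of them were renamed in gensim 4.0 (size -> vector_size, iter -> epochs); a minimal sketch of the equivalent call, assuming gensim 4.x:

# Equivalent training call under gensim >= 4.0 (assumption: only the renamed
# keywords differ; window/min_count/sg/hs/negative keep their names).
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('wiki.zh.word.text')
model = Word2Vec(sentences, vector_size=128, window=5, min_count=5, workers=4)
model.save('gensim_128')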
Example #4
# line = f.readline()
# lastline=""
# while line:
#     try:
#         if lastline!=line:
#             wf.write(line)
#             lastline = line
#         line = f.readline()
#     except UnicodeDecodeError:
#         print('error')
#         f.seek(2,1)
# f.close()
# wf.close()

if mode_append == False:
    sentences = LineSentence(words_file)
    model = Word2Vec(sentences, size=150, hs=1, window=5)
    model.save(model_file)
    print(model.most_similar('天才'))
else:
    # i=0
    model = Word2Vec.load(model_file)
    print('load finish')
    model.build_vocab(corpus_file=words_file, update=True)
    # print(model2.corpus_total_words)
    # model2.build_vocab(corpus_file='./data/zh.seg.txt', update=True)
    print('build finish')
    model.train(corpus_file=words_file,
                total_examples=model.corpus_count,
                epochs=model.iter,
                total_words=model.corpus_total_words)
Example #5
import imp
import sys
import codecs
import logging
import re

import pandas as pd
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

imp.reload(sys)  # Python 2-era leftover; has no effect on Python 3

origin_file = "../create_dataset/data/clotho_csv_files/clotho_captions_development.csv"
output_file = "./target_development.txt"
rd = '[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、::~@#¥%……&*()《》<>]+'

print("prepare data")
ori = codecs.open(origin_file, 'r', encoding='utf-8')
out = codecs.open(output_file, 'w', encoding='utf-8')
print("read files")

text = pd.read_csv(origin_file, index_col=None)[[
    'caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5'
]].values.tolist()

out_text = [re.sub(rd, ' ', t.lower()) for caption in text for t in caption]
out_text = '\n'.join(out_text)
out.writelines(out_text)

ori.close()
out.close()

print("Generated word vector")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = LineSentence(output_file)
model = Word2Vec(sentences, size=192, min_count=1, iter=1000, workers=8)
model.train(sentences, total_examples=model.corpus_count, epochs=1000)
model.save("./w2v_192.mod")
print("Done")
Example #6
# -*- coding: utf-8 -*-

from gensim import models
from gensim.models.word2vec import LineSentence
import codecs
import multiprocessing
import logging
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

word = '../data/raw_data/word.csv'
article = '../data/raw_data/article.csv'

file_word = codecs.open(word, 'r', 'utf-8')
model = models.Word2Vec(LineSentence(file_word),
                        sg=0,
                        size=192,
                        window=5,
                        min_count=5,
                        workers=multiprocessing.cpu_count())
#model.save('../data/model/word.bin')
model.wv.save_word2vec_format('../data/model/word/word.txt')

file_article = codecs.open(article, 'r', 'utf-8')
model = models.Word2Vec(LineSentence(file_article),
                        sg=0,
                        size=192,
                        window=5,
                        min_count=5,
                        workers=multiprocessing.cpu_count())
Example #7
#coding: utf8
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences_file = '/home/parallels/dev/ai-nlp/lesson-five/cut_words_new'
output_model_file = '/home/parallels/dev/ai-nlp/lesson-five/word_to_vec_model_new'

#model = Word2Vec(LineSentence(sentences_file), window=5, workers=multiprocessing.cpu_count())
model = Word2Vec(LineSentence(sentences_file),
                 size=200,
                 window=5,
                 min_count=5,
                 workers=4)
model.save(output_model_file)
Example #8
def train_w2v():
    with open('./data/reduced_zhwiki.txt', 'r', encoding='utf8') as f:
        # use gensim's Word2Vec class to build the word vectors
        model = Word2Vec(LineSentence(f), sg=0, size=192, window=5,
                         min_count=5, workers=4)
        model.save('./data/zhwiki_news.word2vec')
config_pattern = "size{}window{}sg{}min_count{}negative{}iter{}"
config_str = config_pattern.format(args.size, args.window, args.sg,
                                   args.min_count, args.negative, args.iter)
outputfile1 = outputpath + config_str + ".model"
outputfile2 = outputpath + config_str + ".vector"
############### end of config #################

logging.basicConfig(filename=config_str + '.log',
                    filemode='w',
                    format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger = logging.getLogger()
logger.info("running train process in custom: %s" % args.train)

model = Word2Vec(
    LineSentence(inpputfile),
    size=args.size,
    window=args.window,
    min_count=args.min_count,  # with 0.35 billion corpus, #3000 can retain 9228 unique words
    workers=args.workers,  # multiprocessing.cpu_count()
    #sample=args.sample,
    sg=args.sg,
    #hs=args.hs,
    negative=args.negative,  # follow tensorflow's word2vec_optimized.py num_neg_samples 25
    iter=args.iter)

# trim unneeded model memory = use(much) less RAM
# model.init_sims(replace=True)
model.save(outputfile1)
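The block above pulls its hyperparameters from an `args` namespace and an `inpputfile`/`outputpath` pair that are not shown. A hypothetical argparse setup that would satisfy the attribute accesses (all flag names and defaults are assumptions inferred from the code, not the original script's interface):

# Hypothetical parser matching the attributes used above (args.size,
# args.window, args.sg, args.min_count, args.negative, args.iter,
# args.workers, args.train); inpputfile and outputpath are still assumed
# to be defined elsewhere.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train', default='corpus.txt')
parser.add_argument('--size', type=int, default=200)
parser.add_argument('--window', type=int, default=5)
parser.add_argument('--sg', type=int, default=1)
parser.add_argument('--min_count', type=int, default=5)
parser.add_argument('--negative', type=int, default=5)
parser.add_argument('--iter', type=int, default=5)
parser.add_argument('--workers', type=int, default=4)
args = parser.parse_args()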
def stool_simulator(total_epoch, special_epoch_count, restricted_vocab_name):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-stool-iter' + str(total_epoch) + '-first' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0
    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)

    # special starting epochs (final notIn)
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/' +
                                         str(restricted_vocab_name) + '.txt')
    restricted_type = 1
    params = {
        'alpha': lr,
        'min_alpha': alphas[special_epoch_count],
        'size': 200,
        'window': 5,
        'iter': special_epoch_count,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('special epochs half', special_epoch_count)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-half')

    # special starting epochs (final in)
    print('special epochs entire', special_epoch_count)
    gs_model.restricted_type = 2
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                   start_alpha=lr, end_alpha=alphas[special_epoch_count])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-entire')

    # original ending epochs
    print('roof epochs')
    gs_model.restricted_type = 0
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=total_epoch-special_epoch_count,
                   start_alpha=alphas[special_epoch_count], end_alpha=alphas[-1])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(total_epoch))

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
def iteration_simulator(total_epoch, special_epoch_count, restricted_vocab_name, jumps):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-original-iter' + str(total_epoch) + '-last' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '-jump'+''.join(str(x) for x in jumps)+'.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0

    # epoch 0
    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)
    min_alpha = alphas[1]
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/' +
                                         str(restricted_vocab_name) + '.txt')
    restricted_type = 0
    params = {
        'alpha': lr,
        'min_alpha': min_alpha,
        'size': 200,
        'window': 5,
        'iter': 0,  # TODO NOW
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('cur_epoch', 0)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch0')
    gs_model.epochs = 1  # TODO NOW

    # # epoch 0.5
    # gs_model.restricted_type = 2
    # gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
    #                start_alpha=lr, end_alpha=min_alpha)
    # df.loc[1] = evaluate(gs_model.wv, 'X-iter0.5')

    # epoch 1+
    # gs_model.restricted_type = 0
    for cur_epoch in range(1, total_epoch-special_epoch_count):
        print('cur_epoch', cur_epoch)
        start_alpha = alphas[cur_epoch]
        end_alpha = alphas[cur_epoch+1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch'+str(cur_epoch))

    # # save common base model
    # write_to_pickle(gs_model, xlsx_path.split('.xlsx')[0]+'-base')

    for special_epoch in range(total_epoch-special_epoch_count, total_epoch):
        print('special epoch', special_epoch)
        start_alpha = alphas[special_epoch]
        end_alpha = alphas[special_epoch+1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)
        # final special epochs 0.5
        gs_model.restricted_type = 1
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch'+str(special_epoch)+'-half')

        # final special epochs final
        if special_epoch not in jumps:
            gs_model.restricted_type = 2
            gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                           start_alpha=start_alpha, end_alpha=end_alpha)
            line_number_in_xlsx += 1
            df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch)+'-entire')

    # # baseline (final original word2vec epochs)
    # gs_model_base = read_pickle(xlsx_path.split('.xlsx')[0] + '-base')
    # gs_model_base.restricted_type = 0
    # for baseline_epoch in range(total_epoch - special_epoch_count, total_epoch):
    #     print('baseline epoch', baseline_epoch)
    #     start_alpha = alphas[baseline_epoch]
    #     end_alpha = alphas[baseline_epoch + 1]
    #     print('start_alpha', start_alpha)
    #     print('end_alpha', end_alpha)
    #     gs_model_base.train(LineSentence(corpus_file), total_examples=gs_model_base.corpus_count, epochs=gs_model_base.iter,
    #                         start_alpha=start_alpha, end_alpha=end_alpha)
    #     line_number_in_xlsx += 1
    #     df.loc[line_number_in_xlsx] = evaluate(gs_model_base.wv, 'epoch' + str(baseline_epoch)+'-baseline')

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
    # MTURK-771
    'MTURK771_Pearson correlation', 'Pearson pvalue',
    'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
    # questions-words
    'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
])

for i in range(5):
    params = {
        'alpha': 0.025,
        'min_alpha': 0.0001,
        'size': 200,
        'window': 5,
        'iter': 5,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': None,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': 0  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[i] = evaluate(gs_model.wv, str(i))

writer = pd.ExcelWriter(xlsx_path)
df.to_excel(writer, 'Sheet1')
writer.save()
Example #13
def word2vec_training(text_file):
    sentences = LineSentence(text_file)
    model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=16)
    model.wv.save("merge_with_unk.kv")
    # model.wv.save_word2vec_format("merge_with_unk_vector.txt", binary=False)
    return model
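A short follow-up sketch for reusing the vectors saved above without retraining; the query token is a placeholder:

# Reload only the word vectors written by word2vec_training() and query them.
from gensim.models import KeyedVectors

wv = KeyedVectors.load("merge_with_unk.kv")
print(wv.most_similar("unk", topn=5))  # "unk" is a placeholder query token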
Example #14
 def _train_phrase_detection_model(self, input_filepath, output_filepath):
     sentences = LineSentence(input_filepath)
     model = Phraser(Phrases(sentences))
     self._save_sentences(sentences, model, output_filepath)
     return model
Example #15
def build_dataset(train_data_path, test_data_path):
    '''
    Load and preprocess the data.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, and the merged data
    '''
    # 1. load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. drop rows with missing values
    train_df.dropna(subset=['Report'], inplace=True)

    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. multi-process batch preprocessing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)

    # 6. save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    # 7. train the word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=8,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. split data and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'],
        train_df['Report'],
        test_size=0.002,  # 8W*0.002
    )

    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)

    # 9. add start/end tokens, fill unknown words with oov, and pad to length
    # use the vocab produced by the gensim training
    vocab = wv_model.wv.vocab

    # process training-set X
    # pick a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # process test-set X
    # pick a suitable maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # process training-set Y
    # pick a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. save the padded / oov-processed data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    #
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. retrain the word vectors
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # save the word2vec model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. rebuild the vocab
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index2word)
    }

    # save the dictionaries
    save_dict(save_vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 14. convert the datasets from words to indices, e.g. [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231, ...]
    # vocab = Vocab()

    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))

    # 15. convert to numpy arrays
    # turn the index lists into a matrix: [32800, 403, 986, 246, 231] --> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
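A quick sanity check one could drop in just before the return above, assuming a pre-4.0 gensim where wv.index2word and wv.vectors share the same row order:

    # Hypothetical check: the embedding_matrix row addressed via the rebuilt
    # vocab should equal the trained vector for that word.
    some_word = wv_model.wv.index2word[0]
    assert np.allclose(embedding_matrix[vocab[some_word]],
                       wv_model.wv[some_word])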
Example #16
    executor = Parallel(n_jobs=n_jobs,
                        backend="multiprocessing",
                        prefer="processes")
    do = delayed(partial(tokenize_sentence_corpus, corpus_out_path))
    tasks = (do(i, batch) for i, batch in enumerate(partitions))

    executor(tasks)


# process_texts(documents_path, year='2020', court='01', corpus_out_path=unigram_sentences_path, batch_size=8, n_jobs=2,
#               debug=True)

stop_words = get_custom_stop_words()

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

counters = sorted(counters.items(),
                  key=lambda key_value: key_value[1],
                  reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(LineSentence(unigram_sentences_path),
Example #17
def train_model(corpus_path, outpath):
    logger.info('Training model...')
    model = Word2Vec(LineSentence(corpus_path), workers=cpu_count())
    model.save(outpath)
Example #18
# -*-coding=utf-8-*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import codecs

filename = 'html.txt'
sentences = LineSentence(filename)
model = Word2Vec(sentences, size=128, window=5, min_count=5, workers=4)
model.save('word_embedding_128')

items = model.most_similar('中国')
for item in items:
    print(item[0], item[1])

print(model.similarity('男人', '女人'))

filename = 'wikizhword.text'
f = codecs.open(filename, 'r', encoding='utf-8')
line = 20
for _ in range(line):
    print(f.readline())
# sentences = LineSentence(f)
# model = Word2Vec(sentences,size=128,window=5,min_count=5,workers=4)
# model.save('word_embedding_128')
#
# #model=Word2Evc.load('word_embedding_128')
# items = model.most_similar('中国')
# for item in items:
# 	print(item[0],item[1])
#
#
Example #19
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import jieba
import os
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

os.chdir('_normal')
lines = []
for file in os.listdir():
    lines += LineSentence(file)

model = Word2Vec(lines, size=100)  # train the model (default sg=0, i.e. CBOW; window defaults to 5)

os.chdir('..')

model.save('save.model')
Example #20
import os
import codecs
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pandas as pd

import settings

trigram_sentences = LineSentence(
    os.path.join(settings.DATA_PATH, 'trigram_sentences.txt'))
word2vec_filepath = os.path.join(settings.DATA_PATH, 'word2vec_model')

if 0 == 1:
    text2vec = Word2Vec(trigram_sentences,
                        size=100,
                        window=5,
                        min_count=20,
                        sg=1,
                        workers=4)
    text2vec.save(word2vec_filepath)
    for i in range(1, 12):
        text2vec.train(trigram_sentences)
        text2vec.save(word2vec_filepath)

text2vec = Word2Vec.load(word2vec_filepath)
text2vec.init_sims()

print('{} training epochs so far.'.format(text2vec.train_count))
print('{:,} terms in the text2vec vocabulary.'.format(len(text2vec.vocab)))
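The `if 0 == 1:` block above relies on the old `train(sentences)` call with no extra arguments; newer gensim requires explicit bookkeeping. A minimal sketch of the same extra-passes idea, assuming a gensim release where total_examples and epochs are mandatory:

# Re-run the 11 additional passes from the loop above with explicit keywords.
text2vec = Word2Vec.load(word2vec_filepath)
text2vec.train(trigram_sentences,
               total_examples=text2vec.corpus_count,
               epochs=11)
text2vec.save(word2vec_filepath)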
Example #21
import logging
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

path = '/Users/chaofeng/Documents/GitHub/advancedTM_Red/data/'
trigram_filename = path + 'wikidata-20200213-truthy-BETA.trigrams.bz2'
#trigram_filename = 'wikidata-20200213-truthy-BETA.trigrams.bz2'
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

sentences = LineSentence(trigram_filename)

filename = path + 'wikidata_cbow300_iter15'
w2v = Word2Vec(sentences, size=300, window=1, min_count=20, workers=20, iter=1)
w2v.save(filename)
Example #22
import os
import sys
root_path = "/home/ubuntu/answerbot-tool/src"
sys.path.append(root_path)
from gensim.models.word2vec import Word2Vec, LineSentence
from utils.time_utils import get_current_time

corpus_fpath = '../_1_preprocessing/corpus.txt'

print 'start time : ', get_current_time()
sentences = LineSentence(corpus_fpath)
print "begin training..."

# size is the dimensionality of the feature vectors.
# window is the maximum distance between the current and predicted word within a sentence.
# min_count = ignore all words with total frequency lower than this.
# workers = use this many worker threads to train the model (=faster training with multicore machines).

model = Word2Vec(sentences,
                 size=200,
                 window=5,
                 min_count=0,
                 workers=4,
                 iter=100)

model.save('model')
print 'end time : ', get_current_time()
Example #23
# -*- coding: utf-8 -*-
"""
 Usage: ./train.py <path to processed wiki> <path to output file>

 Adapted from http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim
"""

import logging
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp1 = sys.argv[1:3]

    # NOTE: it doesn't shuffle data between passes, which might degrade performance
    model = Word2Vec(LineSentence(inp), size=300, negative=5, workers=5)

    model.save_word2vec_format(outp1, binary=False)
Example #24
#coding:utf-8
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
import logging

inFile = 'corpus.txt'
outFile = 'output_demoModel.out'
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

model = word2vec.Word2Vec(LineSentence(inFile),
                          size=100,
                          window=3,
                          min_count=1)
print model.wv[u'理']
Example #25
File: w2v_rnn.py  Project: yangzhip/2
# read in the review text
all = pd.read_csv('Train_Data.csv', encoding='utf-8')
comment = all[all['text'].notnull()]  # keep only non-empty reviews
comment = comment['text']
comment = np.array(comment)
for i in range(len(comment)):
    comment[i] = r.sub('', str(comment[i]))
comment = pd.DataFrame(comment)
comment['words'] = comment[0]
del comment[0]
comment['mark'] = all['negative']
comment = comment[3000:]
comment['words'] = comment['words'].apply(cw)  # tokenize the reviews
with open('result.txt', 'w', encoding="utf-8") as f2:
    f2.write(str(comment['words']))
sentences = LineSentence("D:/PythonCode/yang/test/result.txt")
path = get_tmpfile("D:/PythonCode/yang/test/w2v_model.bin")  # create a temporary file path
model = Word2Vec(sentences, hs=1, min_count=1, window=10, size=100)
# save and reload the model
model.save(path)
model = Word2Vec.load("D:/PythonCode/yang/test/w2v_model.bin")
# d2v_train = pd.concat([pn['words'], comment['words']], ignore_index = True)
#
# w = []  # gather all words together
# for i in d2v_train:
#     w.extend(i)
#
# dict = pd.DataFrame(pd.Series(w).value_counts())  # count word occurrences
# del w,d2v_train
# dict['id']=list(range(1,len(dict)+1))
#
Example #26
print("Creating the index ...")
# save to a directory
os.mkdir(documentsDict)
for f in documents:
    out_file=open(join(documentsDict,f), 'w')
    out_file.write(str(documents[f]))
    out_file.close()

out_file=open(doc_freqTerm, 'w')
out_file.write(str(doc_freq))
out_file.close()
print("corpus processed successfully \n")

# 2. training
print("Training ...")
sentences = LineSentence(corpusAsSentences)
model = Word2Vec(sentences, size=dimConcept, window=win, min_count=minc, workers=4)  # build the vocabulary and train the model
model.save_word2vec_format(index+'/word2vec'+str(dimConcept)+'_win'+str(win)+'_min'+str(minc)+'.txt', fvocab=None, binary=False)
print("vocabulary ok \n")

# 3. represent documents as vectors
print("Documents to vectors ...")
os.mkdir(matDoc)
for f in listdir(collection):  # read the whole collection here
    doc=documents[f]
    tdoc=0  # document length, computed below:
    tdoc=sum(doc.values())
    vec_doc=numpy.zeros(dimConcept)
    mat_doc={}
    for word in doc:
        if(word in model.vocab):
Example #27
with open("news_data_8.txt", "w") as text_file:
    text_file.write(kalimat_panjang)

import multiprocessing
import logging
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

namaFileInput = "news_data_8.txt"
namaFileOutput = "hoax_en_case"

model = Word2Vec(LineSentence(namaFileInput),
                 size=400,
                 window=5,
                 min_count=5,
                 workers=multiprocessing.cpu_count())

# trim unneeded model memory = use (much) less RAM
model.init_sims(replace=True)
model.save(namaFileOutput)
Example #28
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import remove_stopwords

# EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
csv_file = EXP_HOME + "/stackoverflow/eclipse/eclipse-qa.csv"

CUSTOM_FILTERS = [
    lambda x: x.lower(), strip_multiple_whitespaces, strip_punctuation,
    remove_stopwords, strip_non_alphanum
]
sentences = LineSentence(open(csv_file, 'r'),
                         max_sentence_length=100000,
                         limit=None)
pre_processed = list()
for sentence in sentences:
    # print(' '.join(sentence))
    temp = ' '.join(sentence)
    pp_sentence = preprocess_string(temp, CUSTOM_FILTERS)
    # print(pp_sentence)
    pre_processed.append(' '.join(pp_sentence))

# saving the pre-processed to the file
myFile = open(pp_raw_code, 'w')
for line in pre_processed:
    myFile.write("%s\n" % line)

print("Corpus preprocessed successfully!")
Example #29
def dict_to_text():
    data=read_post_data()
    text={}
    for i,t in data.items():
        text[i]=split_word(t)
    return text

if __name__ == '__main__':
    # input file: each article has been converted to a single line of text, with punctuation etc. removed
    # Chinese data is processed much like English, in the same two steps, but needs extra preprocessing first: traditional-to-simplified conversion, word segmentation, removal of non-utf-8 characters, etc.
    inp = r'f:\python\data\jiebatext.txt'  # input file
    outp1 = r"F:\python\data\text.model"  # word2vec model in gensim's default format
    outp2 = r"F:\python\data\text.vector"  # model in the original C word2vec vector format

    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
            workers=multiprocessing.cpu_count())
#    parameter notes:
#1. sg=1 uses the skip-gram algorithm, which is more sensitive to low-frequency words; the default sg=0 is CBOW.
#2. size is the dimensionality of the output vectors; too small and collisions hurt quality, too large and training gets slow and memory-hungry; 100-200 is typical.
#3. window is the maximum distance between the current and target word within a sentence; window=3 looks at 3-b words before and b words after the target (b random in [0, 3]).
#4. min_count filters the vocabulary; words with frequency below min_count are ignored; default 5.
#5. negative and sample can be tuned from the training results; sample downsamples high-frequency words to the given threshold, default 1e-3.
#6. hs=1 uses hierarchical softmax; with the default hs=0 and a non-zero negative, negative sampling is used.
#7. workers controls training parallelism; it only helps when the Cython extensions are available, otherwise only a single core is used.

# word2vec comes in two flavours: the CBOW and Skip-gram models.
# CBOW discards the order of the context words and uses their average.
# Skip-gram is the opposite: it takes a single focus word as input and is trained to predict its context.

    # trim unneeded model memory = use(much) less RAM
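The example is cut off after this comment; a plausible completion, assuming it mirrors Example #2 and writes to the two output paths defined above:

    # Assumed completion: save the model in gensim's format (outp1) and in
    # the original C word2vec text format (outp2).
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)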
Example #30
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
model = Word2Vec(
    LineSentence('data/simple.reg.txt'),
    size=400,
    window=5,
    min_count=5,
    workers=multiprocessing.cpu_count() - 2,
)

outp1 = 'data/simple.zh.text.model'
outp2 = 'data/simple.zh.text.vector'
model.save(outp1)
model.wv.save_word2vec_format(outp2)