Example #1
    def get_trigrams(self):
        """
        Builds bigram and trigram phrase models from the unigram sentences.
        Writes the phrased text for each model to a separate file.

        """
        unigram_sentences = LineSentence(self.unigram_sentences_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(self.bigram_model_filepath)
        bigram_model = Phrases.load(self.bigram_model_filepath)
        with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for unigram_sentence in unigram_sentences:
                # apply the bigram model to merge detected word pairs into single tokens
                bigram_sent = " ".join(bigram_model[unigram_sentence])
                f.write(bigram_sent + '\n')
        bigram_sentences = LineSentence(self.bigram_sentences_filepath)
        trigram_model = Phrases(bigram_sentences)
        trigram_model.save(self.trigram_model_filepath)
        trigram_model = Phrases.load(self.trigram_model_filepath)
        with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = " ".join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')
        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
            for parsed_article in self.line_article("../data/article_texts"):
                unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                                   if not self.punct_space(token)]
                bigram_article = bigram_model[unigram_article]
                trigram_article = trigram_model[bigram_article]
                trigram_article = [term for term in trigram_article
                                    if term not in STOP_WORDS]
                trigram_article = " ".join(trigram_article)
                f.write(trigram_article + '\n')
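
# A compact, self-contained sketch (toy data and assumed parameter values, not part
# of the original class) of the same unigram -> bigram -> trigram chaining used in
# get_trigrams above: one Phrases pass merges frequent word pairs, and a second pass
# over the already-merged sentences can join those pairs into longer phrases.
from gensim.models import Phrases

toy_sents = [["new", "york", "stock", "exchange"]] * 10
toy_bigrams = Phrases(toy_sents, min_count=5, threshold=0.1)
toy_trigrams = Phrases(toy_bigrams[toy_sents], min_count=5, threshold=0.1)
# pairs whose score clears the threshold come back joined with '_'
print(toy_trigrams[toy_bigrams[["new", "york", "stock", "exchange"]]])
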
def train_bigram(unigram_txt_filepath, bigram_model_filepath, savebigram,
                 bigram_txt_filepath):
    print('reading unigram text file.....')

    unigram_txt = LineSentence(unigram_txt_filepath)
    print('training bigram model.....')

    bigram_model = Phrases(unigram_txt)
    print('saving bigram model.....')

    bigram_model.save(bigram_model_filepath)

    # load the finished model from disk
    # bigram_model = Phrases.load(bigram_model_filepath)

    if savebigram:
        print('saving bigram processed text file....')

        with codecs.open(bigram_txt_filepath, 'w', encoding='utf_8') as f:
            i = 0
            for unigram_sentence in tqdm(unigram_txt):

                bigram_sentence = u' '.join(bigram_model[unigram_sentence])

                f.write(bigram_sentence + '\n')

                i = i + 1

                if (i % 10000 == 0):
                    print('Bigram Processed ' + str(i) + ' articles')
def _phrase_detection_(fpath=fpathroot + fpathappend,
                       passes=2,
                       returnmodels=True,
                       threshold=10.):
    """
    This function does phrase modeling. The user specifies the number of passes;
    each pass detects longer phrases. After pass n, phrases of up to 2**n tokens
    become detectable.

    Returns the list of models by default. Also saves models and intermediary
    phrased sentences for each pass.
    """
    generpath = fpath + '_sent_gram_0.txt'
    ngram = list()
    for it in range(passes):
        gen = LineSentence(generpath)
        gram = Phrases(gen, threshold=threshold)
        ngram.append(gram)
        modelpath = fpath + 'phrase_model_gram_' + str(it + 1)
        generpath = fpath + 'sent_gram_' + str(it + 1) + '.txt'
        gram.save(modelpath)
        # Write sentence gram
        with codecs.open(generpath, 'w', encoding='utf_8') as f:
            for sent in gen:
                new_sent = u' '.join(gram[sent])
                f.write(new_sent + '\n')

    if returnmodels:
        return ngram
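
# A minimal, hypothetical usage sketch for _phrase_detection_ above (the path prefix
# below is an assumption): it expects a whitespace-tokenized sentence file already
# saved at fpath + '_sent_gram_0.txt', runs the requested number of passes, and
# returns one Phrases model per pass while writing each pass's re-phrased sentences
# to disk.
gram_models = _phrase_detection_(fpath='data/example', passes=2, threshold=10.)
bigram_model, fourgram_model = gram_models  # pass 1 -> up to bigrams, pass 2 -> up to 4-grams
print(bigram_model[['new', 'york', 'stock', 'exchange']])
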
def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python built-in string punctuation mark
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(
        ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later section
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH,
                        texts=TEXTS,
                        parser=NLP,
                        stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text instead of loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
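
# A minimal sketch of the swap described in the comments at the top of main():
# TEXTS can be any iterable of raw document strings; the hypothetical list below
# would be preprocessed, phrased, and fed to Word2Vec in exactly the same way.
CUSTOM_TEXTS = [
    "gensim detects common phrases such as new york from plain text",
    "word2vec then learns a vector for each merged phrase token",
]
# e.g. replace the fetch_20newsgroups call with:
# TEXTS = CUSTOM_TEXTS
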
Example #5
File: phrases.py  Project: afcarl/factory
def train_phrases(paths, out='data/bigram_model.phrases', **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    max_vocab_size = 40000000

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

    print('Saving...')
    bigram.save(out)

    print('Some examples:')
    docs = [
        ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'],
        ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
        ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
        ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
        ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
    ]
    for r in bigram[docs]:
        print(r)
Example #6
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus, min_count=CONFIG["bigram_phrase_min_count"], threshold=CONFIG["bigram_phrase_threshold"], progress_per=CONFIG["bigram_phrase_progress_per"], delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus], min_count=CONFIG["trigram_phrase_min_count"], threshold=CONFIG["trigram_phrase_threshold"], delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
    trigram_transformer = Phraser(trigram_phrases)
    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")
    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")
    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus], size=CONFIG['vector_size'], window=CONFIG['window_size'],
                                   min_count=CONFIG['min_count'], workers=CONFIG['worker_count'], sg=CONFIG['sg'],
                                   negative=CONFIG['negative_size'], alpha=CONFIG['alpha'], min_alpha = CONFIG['min_alpha'],
                                   iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
Example #7
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    in_dir, model_out = sys.argv[1:]
    sentences = Corpus(in_dir)
    phrases = Phrases(sentences)
    phrases.save(model_out)
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [
        utils.tokenize(line)
        for line, label in zip(open('./data/%s.csv' % corpus_filename),
                               open('./data/corpus-labels.csv'))
        if category in label
    ]
    print 'First token', tokens[1]

    category_filename = corpus_filename.replace('corpus', 'category')

    #Each category gets its own dictionary and its own corpus, but uses the same bigram model
    #that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(
                utils.tokenize(line)
                for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                       open('./data/corpus-labels.csv'))
                if category in label)
            Phrases.save(bigram, './data/%s/bigram.bin' % category)
        else:
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    #Make the dictionary, a collection of statistics about all tokens in the corpus
    #This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out tokens that are too rare or appear in too many documents
    dictionary.filter_extremes(
        no_above=no_above, no_below=no_below)  #no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' %
                               (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
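
# A tiny, self-contained illustration (toy tokens, not the arXiv data) of the
# Dictionary used above: it is the word -> integer id lookup table, and doc2bow
# turns a token list into sparse (token_id, count) pairs using that table.
from gensim import corpora

toy_tokens = [['dark', 'matter', 'halo'], ['dark', 'energy', 'survey']]
toy_dictionary = corpora.Dictionary(toy_tokens)
print(toy_dictionary.token2id)                       # e.g. {'dark': 0, 'halo': 1, ...}
print(toy_dictionary.doc2bow(['dark', 'dark', 'halo']))  # [(id_of_dark, 2), (id_of_halo, 1)]
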
Example #9
def get_bigram_model():
    model_exists = os.path.exists(bigram_model_filepath)
    if model_exists:
        bigram_model = Phrases.load(bigram_model_filepath)
    else:
        unigram_sentences = get_unigram_sentences()
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    return bigram_model
def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset = 'train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python built-in string punctuation mark
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later section
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts = TEXTS, parser = NLP, stopwords = STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text instead of loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers = cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
Example #11
def generateBigrams(sentences):
    bigram_transformer = Phrases(sentences, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=3)

    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(sentences):
        fd.write(u'{0}   {1}\n'.format(phrase, score))
    fd.close()

    return bigram_transformer
Example #12
def learnMultiword(ret):
    print("Learning multiword expressions")
    bigram = Phrases(ret)
    bigram.save("phrase_all.model")

    print("Sanity checking multiword expressions")
    test = "i like donald trump and hate muslims , go hillary , i like jesus , jesus , against , abortion "
    sent = test.split(" ")
    print(bigram[sent])
    return bigram[ret]
Example #14
def train_phrase_model(cleaned_filepath, bigram_model_filepath,
                       run_or_load_flag):

    if run_or_load_flag:
        unigram_sentences = LineSentence(cleaned_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    else:
        bigram_model = Phrases.load(bigram_model_filepath)

    return bigram_model
Example #15
def extract_phrases(doc_words, save=False):
    logging.info("Extracting phrases...")
    global bigram
    global trigram
    bigram = Phrases(doc_words, threshold=5, min_count=5)
    trigram = Phrases(bigram[doc_words], threshold=3, min_count=3)
    if save:
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[doc_words]]
def bigram():
    # One time use, prepare bigram model
    unigram_sentences = LineSentence('../Dataset/unigram_sentences_all.txt')
    bigram_model_filepath = '../Models/bigram_model_all'
    bigram_sentences_filepath = '../Dataset/bigram_sentences_all.txt'
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save('../Models/bigram_model_all')
    with codecs.open('../Dataset/bigram_sentences_all.txt',
                     'w',
                     encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')
def trigram():
    # One time use, prepare trigram model
    bigram_sentences = LineSentence('../Dataset/bigram_sentences_all.txt')
    trigram_model_filepath = '../Models/trigram_model_all'
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
    trigram_sentences_filepath = '../Dataset/trigram_sentences_all.txt'
    trigram_reviews_filepath = '../Dataset/trigram_transformed_reviews_all.txt'
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(trigram_sentences_filepath)
Example #18
def generate_trigram(input_file_path, bigram_save_path, trigram_save_path, final_file_path):
    unigram_sentence  = LineSentence(input_file_path)
    bigram = Phrases(unigram_sentence)
    bigram.save(bigram_save_path)

    # temporarily store the bigram sentences so they can be re-read to build the trigram model
    temp_file_path = os.path.join(Config.temp_directory,  "temp_bigram.txt")
    __gram_to_text(unigram_sentence, bigram, temp_file_path)

    bigram_sentences = LineSentence(temp_file_path)
    trigram = Phrases(bigram_sentences)
    trigram.save(trigram_save_path)
    __gram_to_text(bigram_sentences, trigram, final_file_path)
Example #19
def trainPhrasesModel(tweets):
    """
    Train phrases model, experimental, not used
    :param tweets: list of tokenised tweets
    :return:
    """
    print("Learning multiword expressions")
    bigram = Phrases(tweets)
    bigram.save("../out/phrase_all.model")

    print("Sanity checking multiword expressions")
    test = "i like donald trump , go hillary clinton , i like jesus , jesus , legalisation abortion "
    sent = test.split(" ")
    print(bigram[sent])
    return bigram[tweets]
 def generate_bigrams_trigrams(self):
     unigram_sentences = LineSentence(self.unigram_sentences_filepath)
     bigram_model = Phrases(unigram_sentences)
     bigram_model.save(self.bigram_model_filepath)
     f = open(self.bigram_sentences_filepath, 'w')
     for unigram_sentence in unigram_sentences:
         bigram_sentence = u' '.join(bigram_model[unigram_sentence])
         f.write(bigram_sentence + '\n')
     f.close()
     bigram_sentences = LineSentence(self.bigram_sentences_filepath)
     trigram_model = Phrases(bigram_sentences)
     trigram_model.save(self.trigram_model_filepath)
     f = open(self.trigram_sentences_filepath, 'w')
     for bigram_sentence in bigram_sentences:
         trigram_sentence = u' '.join(trigram_model[bigram_sentence])
         f.write(trigram_sentence + '\n')
Example #21
def phrase_detect_train(sentances,min_count,threshold,common_terms,phrase_model_save_path = None):
    """
    input:
        sentances: list of tokenized sentences
    """
    print('Transforming sentences into phrases .........\n')
    bi_phrases = Phrases(sentances, min_count=min_count, threshold=threshold,common_terms=common_terms)
    bigram_transformer = Phraser(bi_phrases)
    if phrase_model_save_path is not None:
        bi_phrases.save(phrase_model_save_path)
        bigram_transformer.save(phrase_model_save_path+'_transformer')

    sentances = list(bigram_transformer[sentances]) 
    ## if you want to check the phrases list
    pharses_list = list(bigram_transformer.phrasegrams)
    print('Phrase model training done.')
    return sentances
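
# Hypothetical usage of phrase_detect_train above: the input is a list of
# already-tokenized sentences, and common_terms lets Phrases bridge function
# words inside a phrase. All values below are assumptions for illustration.
toy_sentances = [
    ['new', 'york', 'is', 'huge'],
    ['people', 'love', 'new', 'york'],
] * 10  # repeated so the 'new york' pair clears min_count
phrased = phrase_detect_train(toy_sentances, min_count=5, threshold=0.1,
                              common_terms=frozenset(['of', 'the', 'a']),
                              phrase_model_save_path=None)
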
Example #22
def generate_bow(corpus_filename, use_bigrams, no_above, no_below):

    tokens = [
        utils.tokenize(line)
        for line in open('./data/%s.csv' % corpus_filename)
    ]
    print 'First token', tokens[1]

    if use_bigrams:

        if not os.path.exists('./data/bigram.bin'):
            print "data/bigram.bin doesn't exist. Generating and saving bigram model. This could take a while."
            bigram = Phrases(
                utils.tokenize(line)
                for line in open('./data/%s.csv' % corpus_filename))
            bigram.save('./data/bigram.bin')

        bigram = Phrases.load('./data/bigram.bin')
        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    #Make the dictionary, a collection of statistics about all tokens in the corpus
    #This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out tokens that are too rare or appear in too many documents
    dictionary.filter_extremes(
        no_above=no_above, no_below=no_below)  #no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s.dict' % corpus_filename)

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for token in tokens:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(token)

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s.mm' % corpus_filename,
                               arxiv_bow)  # store to disk, for later use
Example #23
def train():
    sents = LineSentence(args.sents)

    bi_path = os.path.join(args.save_dir, bi_model)
    print 'Bigram: ', bi_path
    if os.path.exists(bi_path):
        bigram = Phrases.load(bi_path)
    else:
        bigram = Phrases(sents,
                         min_count=args.min_count,
                         threshold=args.bi_threshold)
        bigram.save(bi_path)

    tri_path = os.path.join(args.save_dir, bi_model + '_' + tri_model)
    print 'Trigram: ', tri_path
    trigram = Phrases(bigram[sents],
                      min_count=args.min_count,
                      threshold=args.tri_threshold)
    trigram.save(tri_path)
def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                if re.search(r'\d+', phrase):  # remove digits
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
Example #25
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    defaults = {
        'max_vocab_size': 40000000,
        'threshold': 8.
    }
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
Example #26
def train_phrases(paths,
                  out='data/bigram_model.phrases',
                  tokenizer=word_tokenize,
                  **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    defaults = {'max_vocab_size': 40000000, 'threshold': 8.}
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer),
                     **kwargs)

    print('Saving...')
    bigram.save(out)
Example #27
# Personal information section: name, phone number, address, LinkedIn, GitHub


#Eric’s section (name, email, phone, university, major, gpa) Done



#import path & variables
resume_path = r"/Users/yiyangzhou/Desktop/Yiyang (Eric) Zhou Resume 2017 Fall.txt"

resume_file = open(resume_path).read()
resume_file2 = open(resume_path).read()
resume_file2 = resume_file2.lower()
#change path
major_df = pandas.read_excel('majors.xlsx')
major_df.columns
major_file = major_df['Majors'].values
major_lower = [item.lower() for item in major_file]
tokenizer = RegexpTokenizer(r'\w+')
resume_token = tokenizer.tokenize(resume_file)
resume_token2 = tokenizer.tokenize(resume_file2)
major_distinct = []
dictionary = {'Name': 5}
regular_expression = re.compile(r"/BA|BS|Bachelor of Science|Bachelor of Arts|BBA |B/A|Bachelor of Business Administration/", re.IGNORECASE)
bach_major_result = re.search(regular_expression, resume_file)
regular_expression_two = re.compile(r"minor|Minor", re.IGNORECASE)
minor_result = re.search(regular_expression_two, resume_file)
regular_expression_three = re.compile(r"Master|master", re.IGNORECASE)
master_major_result = re.search(regular_expression_three, resume_file)
regular_expression_four = re.compile(r"university", re.IGNORECASE)
university_major_result = re.search(regular_expression_four, resume_file)
updated_majors1 = []
indexes_majors1 = []
updated_majors2 = []
indexes_majors2 = []
updated_majors3 = []
indexes_majors3 = []
updated_majors4 = []
indexes_majors4 = []
majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4

university_df1 = pandas.read_excel('China_University.xlsx')
university_df2 = pandas.read_excel('India_University.xlsx')
university_df3 = pandas.read_excel('US_University.xlsx')
university_file1 = university_df1['Universities'].values
university_file2 = university_df2['Universities'].values
university_file3 = university_df3['Universities'].values
university_lower1 = [item.lower() for item in university_file1]
university_lower2 = [item.lower() for item in university_file2]
university_lower3 = [item.lower() for item in university_file3]
university_combined = university_lower1 + university_lower2 + university_lower3


#extract name finished
def extract_first_name(resume):
    name = resume.split('\n', 1)[0]
    first_name = name.split(' ', 1)[0]
    return (first_name)
    print (first_name)

def extract_last_name(resume):
    name = resume.split('\n', 1)[0]
    last_name = name.split(' ', 1)[-1]
    return (last_name)
    print (last_name)

def extract_name(resume):
    name = extract_first_name(resume_file) + extract_last_name(resume_file)
    print (name)

#extract email finished
def extract_email(resume):
    regular_expression = re.compile(r"(\w+[.|\w])*@(\w+[.])*\w+", re.IGNORECASE)
    result = re.search(regular_expression, resume)
    if result:
        result = result.group()
    print (result)

#extract phone number finished

def check_phone_number1(resume):
    resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',))
    result = re.findall(r"\d{10}", resume2)
    result = ''.join(result)
    return (result)

def check_phone_number2(resume):
    resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',))
    result = re.findall(r"\d{11}", resume2)
    result = ''.join(result)
    result = result[1:11]
    return (result)

def extract_phone_number(resume):
    try:
        return check_phone_number1(resume)
        print (check_phone_number1(resume))
    except:
        return check_phone_number2(resume)
        print (check_phone_number2(resume))


def personel_information(resume):
    print(extract_name(resume))
    print(extract_email(resume))
    print(extract_phone_number(resume))

#execution of extracting name, email, phone number
personel_information(resume_file)



#major, University, gpa
def get_bigrams(input):
    n = 2
    result = []
    bigrams = ngrams(input, n)
    for grams in bigrams:
        x = "%s %s" % grams
        result.append(x)
    return (result)
    print (result)


def get_threegrams(input):
    n = 3
    result = []
    threegrams = ngrams(input, n)
    for grams in threegrams:
        x = "%s %s %s" % grams
        result.append(x)
    return (result)
    print (result)

def get_fourgrams(input):
    n = 4
    result = []
    fourgrams = ngrams(input, n)
    for grams in fourgrams:
        x = "%s %s %s %s" % grams
        result.append(x)
    return (result)
    print (result)

def get_fivegrams(input):
    n = 5
    result = []
    fivegrams = ngrams(input, n)
    for grams in fivegrams:
        x = "%s %s %s %s %s" % grams
        result.append(x)
    return (result)
    print (result)

def get_sixgrams(input):
    n = 6
    result = []
    sixgrams = ngrams(input, n)
    for grams in sixgrams:
        x = "%s %s %s %s %s %s" % grams
        result.append(x)
    return (result)
    print (result)

def get_majors(a,b):
    majors=[]
    for x in a:
        if x in b:
            majors.append(x)
    return (majors)
    print (majors)

def get_majors2(a,b):
    unigram_major = get_majors(a, b)
    bigram_major = get_majors(get_bigrams(a), b)
    threegram_major = get_majors(get_threegrams(a), b)
    combined_majors_list = unigram_major + bigram_major + threegram_major
    for i in combined_majors_list:
        if i not in major_distinct:
            major_distinct.append(i)
    print (major_distinct)

def get_majors_index(major_distinct):
    for i, element in enumerate(major_distinct):
        x = resume_file2.find(element)
        dictionary[element] = x
    del dictionary['Name']
    print(dictionary)

def get_bach_index(bach_major_result):
    if bach_major_result:
        bach_major_result = bach_major_result.group()
    print (bach_major_result)
    if bach_major_result is not None:
        bach_major_index = resume_file.find(bach_major_result)
    return(bach_major_index)
    print(bach_major_index)

def get_minor_index(minor_result):
    if minor_result:
        minor_result = minor_result.group()
    print (minor_result)
    if minor_result is not None:
        minor_index = resume_file.find(minor_result)
    return(minor_index)
    print(minor_index)

def get_master_index(master_major_result):
    if master_major_result:
        master_major_result = master_major_result.group()
    print (master_major_result)
    if master_major_result is not None:
        master_major_index = resume_file.find(master_major_result)
    return(master_major_index)
    print(master_major_index)

def get_university_index(university_major_result):
    if university_major_result:
        university_major_result = university_major_result.group()
    print (university_major_result)
    if university_major_result is not None:
        university_major_index = resume_file.find(university_major_result)
    return(university_major_index)
    print(university_major_index)

def get_bach_major(dictionary):
    bach_major_index = get_bach_index(bach_major_result)
    upper_bound = bach_major_index +100
    for k, v in dictionary.items():
        if (bach_major_index < v < upper_bound):
            updated_majors1.append(k)
            indexes_majors1.append(v)
    print(updated_majors1)
    print(indexes_majors1)


def get_master_major(dictionary):
    master_major_index = get_master_index(master_major_result)
    upper_bound = master_major_index +100
    for k, v in dictionary.items():
        if (master_major_index < v < upper_bound):
            updated_majors2.append(k)
            indexes_majors2.append(v)
    print(updated_majors2)
    print(indexes_majors2)

def get_minor(dictionary):
    minor_index = get_minor_index(minor_result)
    upper_bound = minor_index +100
    for k, v in dictionary.items():
        if (minor_index < v < upper_bound):
            updated_majors3.append(k)
            indexes_majors3.append(v)
    print(updated_majors3)
    print(indexes_majors3)

def get_university_major(dictionary):
    university_major_index = get_university_index(university_major_result)
    upper_bound = university_major_index +100
    for k, v in dictionary.items():
        if (university_major_index < v < upper_bound):
            updated_majors4.append(k)
            indexes_majors4.append(v)
    print(updated_majors4)
    print(indexes_majors4)



def extract_major(majors_minors_all):
    majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4
    majors_minors_final_list = list(dedupe(majors_minors_all))
    return (majors_minors_final_list)
    print (majors_minors_final_list)


#execution of extracting majors:

get_majors(resume_token2, major_lower)
get_majors2(resume_token2, major_lower)
get_majors_index(major_distinct)
get_bach_index(bach_major_result)
get_minor_index(minor_result)
get_master_index(master_major_result)
get_university_index(university_major_result)
get_bach_major(dictionary)
get_master_major(dictionary)
get_minor(dictionary)
get_university_major(dictionary)
print (extract_major(majors_minors_all))

#extract University:
def get_university(a,b):
    resume_university=[]
    for x in a:
        if x in b:
            resume_university.append(x)
    return (resume_university)
    print (resume_university)

def extract_university(resume_token_lower,university_combined):
    unigram_university = get_university(resume_token_lower, university_combined)
    bigram_university = get_university(get_bigrams(resume_token_lower), university_combined)
    threegram_university = get_university(get_threegrams(resume_token_lower), university_combined)
    fourgram_university = get_university(get_fourgrams(resume_token_lower), university_combined)
    fivegram_university = get_university(get_fivegrams(resume_token_lower), university_combined)
    sixgram_university = get_university(get_sixgrams(resume_token_lower), university_combined)
    combined_university_extraction = set(bigram_university + threegram_university + fourgram_university + fivegram_university + sixgram_university)
    print (combined_university_extraction)

#execution of extracting university:
extract_university(resume_token2,university_combined)

#extract GPA:
def extract_GPA(resume):
    result = re.search(r'(GPA|gpa): ?\d\.\d{1,}', resume)
    if result:
        result = result.group(0)
    return (result)
    print (result)

#execution of extracting GPA:
extract_GPA(resume_file)


#HENRY

#Extracting Address

import re
import usaddress

#extract the address
def extract_address (text):
    text = text.replace('\n', ' ')
    regex = re.compile(r"[0-9]+ .*[.,-]? .*[.,-]? ([A-Z]{2}|\w+)[.,-]? [0-9]{5}(-[0-9]{4})?")
    result = re.search(regex, text)
    if result:
        result = result.group()
    return result

#Parse the address components
def parse_address(result):
    address = usaddress.tag(result)
    return address

#2. Extracting Company

import codecs
import os
import pandas as pd
from fuzzywuzzy.process import dedupe
import spacy
from nltk.corpus import stopwords


filename = 'BrandonThomasResume.txt'
#Open file
def open_file(filename):
    resume = open(filename, 'r', errors='ignore').read()
    return resume
resume = open_file(filename)

#Read the Work_Experience_List
data = pd.read_excel("Work Experience.xlsx", header=0)
experience_list = list(data['Example'])

#Find the experience header
def find_exp_header (resume):
    exp_header_list=[]
    for word in experience_list:
        if resume.find(word) != -1:
            exp_header_list.append(word)

    #remove duplicates of experience header
    exp_header = list(dedupe(exp_header_list))
    return exp_header

exp_header = find_exp_header(resume)
exp_header = (exp_header[0], resume.find(exp_header[0]))

#Find next section header
def find_next_section (resume):
    #Find all capitalized words
    next_section_upper = re.findall(r'([A-Z]{3,}( [A-Z]+)?( [A-Z]+)?( [A-Z]+)?)',
                                   resume[(exp_header[1] + len(exp_header[0])+ 1):])
    next_section_upper = list((itertools.chain.from_iterable(next_section_upper)))

    #Find all words with the first letter capitalized
    next_section_lower = re.findall(r'([A-Z]{1}\w+( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?)',
                                    resume[(exp_header[1] + len(exp_header[0])+ 1):])
    next_section_lower = list((itertools.chain.from_iterable(next_section_lower)))

    #Combine into a list
    next_section_list = next_section_upper + next_section_lower

    #if one of the items matches items in section list, that item is the next section header
    next_section=()
    for item in next_section_list:
        if item in section_list and (resume[resume.find(item)+len(item)]=='\n' or resume[resume.find(item)-1]=='\n'):
            next_section = (item, resume.find(item))
            break
    return next_section

next_section = find_next_section(resume)

# Get the section of Work_Experience
def get_workexp_section(resume):
    if next_section:
        workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):next_section[1]])
    else:
        workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):])
    return workexp_section

workexp_section = get_workexp_section(resume)
workexp_section = workexp_section.split('\n')

#Remove the detail and get the experience information
def get_exp_info(work_exp):
    company_info=[]
    temp_str=''
    for i, sent in enumerate(work_exp):
        if sent != '':
            #Everything before the bullet will be put into one sentence, for one company
            if not sent.startswith(('•','', u'\uf095', '§', '§')):
                temp_str += sent + ' '
            else:
                if not work_exp[i-1].startswith(('•','', u'\uf095', '§', '§')):
                    company_info.append(temp_str)
                    temp_str=''
    return company_info

company_info = get_exp_info(workexp_section)

#Print the company info
for i, company in enumerate(company_info):
    company = company.replace('\t', '')
    print('\nCompany {}:'.format(i+1), company)

nlp = spacy.load('en')

#Parse company info components
def extract_exp_info(company_info, filename):
    count = 0
    print(filename)
    for i, sent in enumerate(company_info):
        sent = sent.replace('\t', '')
        parsed_sent = nlp(sent)
        print('\nCompany {}'.format(i+1))

        company=''
        location=''
        time=''
        role=''
        for i ,token in enumerate(parsed_sent):
            if token.ent_type_ =='ORG':
                company += ' ' + str(token)
            elif token.ent_type_ =='GPE':
                location += ' ' + str(token)
            elif token.ent_type_ =='DATE' or token.ent_type_ =='TIME':
                time += ' ' + str(token)
            elif token.ent_type_ =='':
                if str(token).isalpha() and str(token) not in stopwords.words('english'):
                    role += ' ' + str(token)

        print('Company: {}'.format(company))
        print('Location: {}'.format(location))
        print('Time: {}'.format(time))
        print('Role: {}'.format(role))

extract_exp_info(company_info, filename)


#3. Extract Skills (Just Skills)

import nltk
import pandas as pd
import os
import codecs
from gensim.models import Phrases
import re

#Read the Skill_List.xlsx
data = pd.read_excel("Skills.xlsx", header=0)
skill_list = list(data['Skill Names'])
skill_list = set(skill_list)
skill_list= [skill.lower() for skill in skill_list]

filename ='all_text1.txt'
trained_resume_path = os.path.join('Trained Resumes', filename)

resume_text = open(trained_resume_path, 'r', encoding='utf_8').read()
special_characters = ['!','#', '$', '%','&','*','-', '/', '=','?',
                      '^','.','_','`', '{', '|', '}','~', "'", ',', '(',')', ':', '•', '§' ]

# Processing text
def resume_processing (resume_text):
    #tokenize sentences
    resume_sents = nltk.sent_tokenize(resume_text)

    #tokenize words
    resume_words = [nltk.word_tokenize(sent) for sent in resume_sents]

    #remove stopwords and special characters
    processed_resume=[]
    for sentence in resume_words:
        sent = [w.lower() for w in sentence
                          if w.lower() not in stopwords.words('english') and w.lower() not in special_characters]
        processed_resume.append(sent)

    return processed_resume

unigram_resume = resume_processing(resume_text)

#Create bigram model
bigram_model_path = 'bigram_model'

bigram_model = Phrases(unigram_resume)
bigram_model.save(bigram_model_path)

# Create bigram words
def create_bigram (unigram_resume):
    bigram_model = Phrases.load(bigram_model_path)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume

bigram_resume = create_bigram(unigram_resume)

#Create trigram model
trigram_model_path = 'trigram_model'

trigram_model = Phrases(bigram_resume)
trigram_model.save(trigram_model_path)

# Create trigram words
def create_trigram (bigram_resume):
    trigram_model = Phrases.load(trigram_model_path)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume

trigram_resume = create_trigram(bigram_resume)

#Normalize bigram/trigram words
def normalize_words (trigram_resume):
    for sentence in trigram_resume:
        for i, word in enumerate(sentence):
            if len(re.findall(r'\w+\_\w+', word))!= 0:
                sentence[i] = re.sub('_', ' ', word)
    return trigram_resume

normalized_resume = normalize_words(trigram_resume)

#label skills in the resume
def labeled_word (sentence):
    labels=[]
    for word in sentence:
        if word in skill_list:
            labels.append((word, 'skill'))
        else:
            labels.append((word, 'not skill'))
    return labels

labeled_words=[labeled_word(sentence) for sentence in normalized_resume]

#Get 25 similar words based on word2vec model
def similar_prob(word):
    count = 0
    terms = get_related_terms(word,25)
    for w in terms:
        if skill_series.isin([w]).any():
            count+=1
    return count/25

#Check if the word is in skill clusters, based on KMeans algorithm
def in_skill_cluster(word):
    if word in skills:
        return True
    return False

#extract features of skills
def extract_features (sentence, i):
    features={}
    #first feature: evaluate if that word is in skill list
    features["({})in_skill_list".format(sentence[i])]= (sentence[i] in skill_list)

    if sentence[i] in res2vec.wv.vocab:
        features["probality_of_similar_words_skills"] = similar_prob(sentence[i])
        features["in_skill_cluster"] = in_skill_cluster(sentence[i])

    #if the word is at the beginning of the sentence, return <Start> for prev_word
    if i==0 and len(sentence)-1 != 0:
        features["prev_word_in_skill_list"]= '<Start>'
        features["next_word_in_skill_list"]= (sentence[i+1] in skill_list)

    #if the word is at the end of the sentence, return <End> for next_word
    elif i == len(sentence)-1 and  i != 0:
        features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list)
        features["next_word_in_skill_list"]= '<End>'

    #if the sentence has only 1 word, return False for both prev_word and next_word
    elif i==0 and len(sentence)-1 == 0:
        features["prev_word_in_skill_list"]= False
        features["next_word_in_skill_list"]= False
    else:
        features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list)
        features["next_word_in_skill_list"]= (sentence[i+1] in skill_list)
    return features

featuresets=[]
for labeled_sent in labeled_words:
    unlabeled_sent = [word[0] for word in labeled_sent]
    for i, (w, label) in enumerate(labeled_sent):
        featuresets.append((extract_features(unlabeled_sent, i), label))

#Save the features in a file
featuresets_file = 'features_file.txt'
file = open(featuresets_file, 'w', encoding='utf_8')
file.write('\n'.join('%s %s' % item for item in featuresets ))

size = int(len(featuresets)*0.1)
train_set = featuresets[size:]
test_set = featuresets[:size]

#Train the data with NaiveBayes model
classifier = nltk.NaiveBayesClassifier.train(train_set)

#Evaluate the accuracy
nltk.classify.accuracy(classifier, test_set)

#Extract the skills
def extract_skills(normalized_test_res, resume_number, filename):
    skills =[]
    for sent in normalized_test_res:
        for (i,_) in enumerate(sent):
            if classifier.classify(extract_features(sent, i))=='skill':
                skills.append(sent[i])
                extracted_skills = set(skills)
    print('\nResume {}:{} ({} skills)\n'.format(resume_number+1,filename, len(extracted_skills)), extracted_skills)


#VAIBHAV

#Import Statements
import csv
import re

#Email Address (Finished)
def check_email(string_to_search):
    regular_expression = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,3}", re.IGNORECASE)
    result = re.search(regular_expression, string_to_search)
    if result:
        result = result.group()
    return result
    #except:
     #   result=0
      #  return result

#LinkedIn Address(Finished)
def check_linkedin(string_to_search):
    regular_expression1 = re.compile(r"https://"
                                    r"[A-Z]{2,3}"
                                    r".linkedin.com/in/"
                                    r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
    result = re.search(regular_expression1, string_to_search)
    try:
        result = result.group()
        return result
    except:
        regular_expression1 = re.compile(r"[A-Z]{2,3}"
                                        r".linkedin.com/in/"
                                        r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
        result = re.search(regular_expression1, string_to_search)
        try:
            result=result.group()
            return result
        except:
            regular_expression1 = re.compile(r"[A-Z]{2,3}"
                                        r".linkedin.com/"
                                        r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
            result = re.search(regular_expression1, string_to_search)
            try:
                result=result.group()
                return result
            except:
                return None

#GitHub Address (Finished)
def check_GitHub(string_to_search):
    regular_expression = re.compile(r"https://github.com/"
                                    r"[-_A-Z0-9]{5,30}", re.IGNORECASE)
    result = re.search(regular_expression, string_to_search)
    try:
        result = result.group()
        return result
    except:
        return None

#Contact Number (Finished)
def check_phone_number(string_to_search):
    try:
        regular_expression = re.compile(r"\(?"  # open parenthesis
                                        r"(\d{3})?"  # area code
                                        r"\)?"  # close parenthesis
                                        r"[\s\.-]{0,2}?"  # area code, phone separator
                                        r"(\d{3})"  # 3 digit exchange
                                        r"[\s\.-]{0,2}"  # separator bbetween 3 digit exchange, 4 digit local
                                        r"(\d{4})",  # 4 digit local
                                        re.IGNORECASE)
        result = re.search(regular_expression, string_to_search)
        if result:
            result = result.groups()
            result = "-".join(result)
        return result
    except:
        return None

def main():
#    with open('Resume_Test.txt', 'r',encoding="utf8") as myfile:
    with open('Resume_Test.txt', 'r') as myfile:
        data=myfile.read().replace('\n',' **** ')
    result=check_email(data)
    result_L=check_linkedin(data)
    result_P=check_phone_number(data)
    result_G=check_GitHub(data)
    print("Email Address:",result)
    print("Contact Number:",result_P)
    print("Linkedin Profile:",result_L)
    print("GitHub Profile:",result_G)
    #print(data)
main()


#Ashish (LinkedIn Profiles and Every other URL in the file)

# import all headers
import re
import os

# function to extract all URLs
# implemented using regex
def extract_URLs(parsedResume):
    parsedResume = parsedResume.replace('\n', ' ')
    regex = re.compile(r'(?:(?:https?|ftp|file)://|www\.|ftp\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.IGNORECASE)
    result = re.findall(regex, parsedResume)
    #if result:
        #result = result.group()
    return result

# function to extract LinkedIN Profile
# implemented using regex
def extract_linkedin(parsedResume):
    parsedResume = parsedResume.replace('\n', ' ')
    regex = re.compile(r"https://www.linkedin.com/in/([a-zA-Z]|[0-9]|[-])+/?")
    result = re.search(regex, parsedResume)
    if result:
        result = result.group()
    return result

# TESTING
# path where all resumes are located
test_resume_path = '/Users/Ashish/Desktop/Internship/Personal/Test Resumes'
counter = 0

print("URLs in Test Resumes")
for filename in os.listdir(test_resume_path):
    # print(filename)
    if '.txt' in filename:
        counter = counter + 1
        resume_path= os.path.join('Test Resumes', filename)
        test_resume = open(resume_path, 'r').read()

        print("Resume ", (counter), ":")
        print("All URLs => ", extract_URLs(test_resume))
        print("LinkedIn Profiles => ", extract_linkedin(test_resume))
Example #28
    ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
    ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
    ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
    ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
]


# Change to use less memory. Default is 40m.
max_vocab_size = 40000000

# Train up to trigrams.
print('Training bigrams...')
bigram = Phrases(doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

print('Saving...')
bigram.save('bigram_model.phrases')

print('Training trigrams...')
trigram = Phrases(bigram[doc_stream(paths, n)], max_vocab_size=max_vocab_size, threshold=10.)

print('Saving...')
trigram.save('trigram_model.phrases')
print('Done.')


#print('Loading bigrams...')
#bigram = Phrases.load('bigram_model.phrases')

#print('Loading trigrams...')
#trigram = Phrases.load('trigram_model.phrases')
Example #29
import os
import codecs
import itertools as it
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import settings

lemmatized = LineSentence(os.path.join(settings.DATA_PATH, 'lemmatized.txt'))
bigram_model_filepath = os.path.join(settings.DATA_PATH, 'bigram_model')

if 0 == 1:
    bigram_model = Phrases(lemmatized)
    bigram_model.save(bigram_model_filepath)

bigram_model = Phrases.load(bigram_model_filepath)
bigram_sentences_filepath = os.path.join(settings.DATA_PATH,
                                         'bigram_sentences.txt')

if 0 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized:
            bigram_sentence = ' '.join(bigram_model[sentence])
            f.write(bigram_sentence + '\n')

bigram_sentences = LineSentence(bigram_sentences_filepath)
trigram_model_filepath = os.path.join(settings.DATA_PATH,'trigram_model')

if 0 == 1:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
Example #30
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass

f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

print 'BIGRAMS'
bigram = Phrases(next_note(tokenizer), delimiter='')
bigram.save('bigrams.pk')

print 'TRIGRAMS'
trigram = Phrases(bigram[next_note(tokenizer)], delimiter='')
trigram.save('trigrams.pk')

print '4GRAMS'
ngram = Phrases(trigram[next_note(tokenizer)], delimiter='')
ngram.save('ngrams.pk')

Example #31
def link_twoWords(file):
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
    bigram_model = Phrases.load(bigram_model_filepath)
Example #32
#%%

########################
### bigram and trigram##
########################
data_path = os.path.join('data','tokenized_sentances.p')
bigram_model_path = os.path.join('data','bigram_model')
trigram_model_path = os.path.join('data','trigram_model')

#%%
## first train a bigram and trigram model on a large dataset and save them
sentances = pickle.load(open(data_path,'rb'))
#%%
print('Training bigram model .........\n')
bi_phrases = Phrases(sentances, min_count=5, threshold=15)
bi_phrases.save(bigram_model_path)
bigram_transformer = Phraser(bi_phrases)
bigram_transformer.save(os.path.join('data','bigram_transformer'))
sentances = list(bigram_transformer[sentances]) 
tri_phrases = Phrases(sentances, min_count=5, threshold=10)
tri_phrases.save(trigram_model_path)
trigram_transformer = Phraser(tri_phrases)
trigram_transformer.save(os.path.join('data','trigram_transformer'))
#sentances = list(trigram_transformer[sentances])

## if you want to check the phrases list
pharses_list = list(tri_phrases.vocab.keys())

#print('Dump to Pickle')
#pickle.dump(sentances,open(out_path, "wb"))
Example #33
# build unigram sentence corpus (to train the bigram model later)

if 1 == 0:
    with codecs.open(unigram_sentences_path, 'w', encoding='utf-8') as f:
        for sentence in lemmatized_sentence_corpus(replaced_documents):
            f.write(sentence + '\n')

    from gensim.models import Phrases
    from gensim.models.word2vec import LineSentence

    # load unigram sentence corpus
    unigram_sentences = LineSentence(unigram_sentences_path)

    # training bigram_model
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_path)


def lemmatized_document(list_of_documents):
    """
    generator function to use spaCy to parse documents,
    lemmatize them, and yield, one per document
    """

    for parsed_review in nlp.pipe(list_of_documents,
                                  batch_size=10000,
                                  n_threads=4):
        yield [
            token.lemma_ for token in parsed_review
            if not punct_stop_space(token)
        ]
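
# Hypothetical usage of the lemmatized_document generator above; it assumes the
# spaCy model `nlp` and the punct_stop_space filter are defined elsewhere in
# this script, as in the surrounding example.
# for lemmas in lemmatized_document(['The cats are sleeping.', 'Dogs bark loudly.']):
#     print(lemmas)   # e.g. ['cat', 'sleep'] depending on the filter
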
Example #34
def phrases():
    p = Phrases(sentences=process_corpus('/Users/valeriyischenko/local/projects/lingua_hack/Text'))
    p.save('../wiki/text_phrase_model_p3')
Example #35
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass


f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

print 'BIGRAMS'
bigram = Phrases(next_note(tokenizer), delimiter='')
bigram.save('bigrams.pk')

print 'TRIGRAMS'
trigram = Phrases(bigram[next_note(tokenizer)], delimiter='')
trigram.save('trigrams.pk')

print '4GRAMS'
ngram = Phrases(trigram[next_note(tokenizer)], delimiter='')
ngram.save('ngrams.pk')
Example #36
def tokenize_stem_stop(text):
    # tokenize by word and drop stopwords; non-alphanumeric tokens are filtered below
    tokens = [word for word in nltk.word_tokenize(text) if word not in stopwords]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z0-9]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def makesent(path):
    result = []
    for line in open(path):
        result.append(tokenize_stem_stop(line.rsplit(' ', 1)[0].lower()))
    return result

sentences = makesent('/Users/yjiang/Documents/pythonWorkspace/freqCounter/data/agg_self_in_200.txt')
print(sentences)
bigram = Phrases(sentences, min_count=1, threshold=1)
bi_sent = bigram[sentences]
trigram = Phrases(bigram[sentences], min_count=1, threshold=1)

#==============================================================================
# for phrase, score in trigram.export_phrases(bi_sent):
#     print(u'{0}   {1}'.format(phrase, score))
#==============================================================================

bigram.save('model/bigram')
trigram.save('model/trigram')