def get_trigrams(self):
    """
    Builds unigram, bigram, and trigram models respectively.
    Writes the text of each model to a separate file.
    """
    unigram_sentences = LineSentence(self.unigram_sentences_filepath)
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(self.bigram_model_filepath)
    bigram_model = Phrases.load(self.bigram_model_filepath)
    with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for unigram_sentence in unigram_sentences:
            # bigram_model[sentence] rejoins detected collocations with "_",
            # e.g. ["new", "york"] -> ["new_york"]
            bigram_sent = " ".join(bigram_model[unigram_sentence])
            f.write(bigram_sent + '\n')
    bigram_sentences = LineSentence(self.bigram_sentences_filepath)
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(self.trigram_model_filepath)
    trigram_model = Phrases.load(self.trigram_model_filepath)
    with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = " ".join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(self.trigram_sentences_filepath)
    with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
        for parsed_article in self.line_article("../data/article_texts"):
            unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                               if not self.punct_space(token)]
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]
            trigram_article = [term for term in trigram_article
                               if term not in STOP_WORDS]
            trigram_article = " ".join(trigram_article)
            f.write(trigram_article + '\n')
def train_bigram(unigram_txt_filepath, bigram_model_filepath, savebigram, bigram_txt_filepath):
    print('reading unigram text file.....')
    unigram_txt = LineSentence(unigram_txt_filepath)

    print('training bigram model.....')
    bigram_model = Phrases(unigram_txt)

    print('saving bigram model.....')
    bigram_model.save(bigram_model_filepath)
    # load the finished model from disk
    # bigram_model = Phrases.load(bigram_model_filepath)

    if savebigram:
        print('saving bigram processed text file....')
        with codecs.open(bigram_txt_filepath, 'w', encoding='utf_8') as f:
            i = 0
            for unigram_sentence in tqdm(unigram_txt):
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')
                i = i + 1
                if (i % 10000 == 0):
                    print('Bigram Processed ' + str(i) + ' articles')
def _phrase_detection_(fpath=fpathroot + fpathappend, passes=2, returnmodels=True, threshold=10.):
    """
    This function does phrase modeling. The user specifies the number of passes.
    Each pass detects longer phrases; after n passes the longest detectable
    phrase is 2**n tokens, since each pass can join two phrases found in the
    previous pass.

    Returns the list of models by default. Also saves the models and the
    intermediary phrased sentences for each pass.
    """
    generpath = fpath + '_sent_gram_0.txt'
    ngram = list()
    for it in range(passes):
        gen = LineSentence(generpath)
        gram = Phrases(gen, threshold=threshold)
        ngram.append(gram)
        modelpath = fpath + 'phrase_model_gram_' + str(it + 1)
        generpath = fpath + 'sent_gram_' + str(it + 1) + '.txt'
        gram.save(modelpath)
        # Write sentence gram
        with codecs.open(generpath, 'w', encoding='utf_8') as f:
            for sent in gen:
                new_sent = u' '.join(gram[sent])
                f.write(new_sent + '\n')
    if returnmodels:
        return ngram
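
# A minimal usage sketch for _phrase_detection_ above, assuming the input file
# <fpath>_sent_gram_0.txt already exists with one whitespace-tokenized sentence
# per line. The paths and the sample sentence are hypothetical, not from the
# original source.
fpathroot = 'data/'
fpathappend = 'reviews'

models = _phrase_detection_(fpath=fpathroot + fpathappend, passes=2, threshold=10.)

# Apply the passes in order to phrase up a new tokenized sentence.
sent = ['new', 'york', 'city', 'council']
for gram in models:
    sent = gram[sent]
print(sent)  # e.g. ['new_york_city_council'] if the training corpus supports it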
def main():
    # -------------------------------------------------------------------------------
    # Parameters
    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built-in to spacy, we can always
    # expand this set for the problem that we are working on,
    # here we include python built-in string punctuation marks
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later sections
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')
    # -------------------------------------------------------------------------------

    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts=TEXTS, parser=NLP, stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
def train_phrases(paths, out='data/bigram_model.phrases', **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    max_vocab_size = 40000000

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

    print('Saving...')
    bigram.save(out)

    print('Some examples:')
    docs = [
        ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'],
        ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
        ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
        ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
        ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
    ]
    for r in bigram[docs]:
        print(r)
def trigrams(corpus, output_prefix):
    print("----- Trigrams -----")
    if os.path.exists(output_prefix + "_trigram_phrases"):
        trigram_phrases = Phrases.load(output_prefix + "_trigram_phrases")
        print("Loaded trigram phrases")
    else:
        bigram_phrases = Phrases(corpus,
                                 min_count=CONFIG["bigram_phrase_min_count"],
                                 threshold=CONFIG["bigram_phrase_threshold"],
                                 progress_per=CONFIG["bigram_phrase_progress_per"],
                                 delimiter=CONFIG["bigram_phrase_delimiter"])
        trigram_phrases = Phrases(bigram_phrases[corpus],
                                  min_count=CONFIG["trigram_phrase_min_count"],
                                  threshold=CONFIG["trigram_phrase_threshold"],
                                  delimiter=CONFIG["trigram_phrase_delimiter"])
        trigram_phrases.save(output_prefix + "_trigram_phrases")
    trigram_transformer = Phraser(trigram_phrases)

    dct = Dictionary(trigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_trigram")

    print("Training tf-idf from trigrams")
    bow_corpus = [dct.doc2bow(line) for line in trigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_trigram")

    print("Training word2vec model with trigram")
    start_time = time()
    trigram_model = gensim.models.Word2Vec(trigram_transformer[corpus],
                                           size=CONFIG['vector_size'],
                                           window=CONFIG['window_size'],
                                           min_count=CONFIG['min_count'],
                                           workers=CONFIG['worker_count'],
                                           sg=CONFIG['sg'],
                                           negative=CONFIG['negative_size'],
                                           alpha=CONFIG['alpha'],
                                           min_alpha=CONFIG['min_alpha'],
                                           iter=CONFIG['train_epoch'])
    trigram_model.save(output_prefix + "_trigram")
    print("Time :", format_time(time() - start_time))
    return trigram_model
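
# The function above reads all hyperparameters from a module-level CONFIG dict
# that is not shown. A hedged sketch of such a dict: the keys are inferred from
# the calls above, the values are illustrative guesses only (gensim 3.x style,
# matching the size/iter keyword arguments used above).
CONFIG = {
    "bigram_phrase_min_count": 5,
    "bigram_phrase_threshold": 10.0,
    "bigram_phrase_progress_per": 10000,
    "bigram_phrase_delimiter": b"_",   # gensim 3.x Phrases expects a bytes delimiter
    "trigram_phrase_min_count": 5,
    "trigram_phrase_threshold": 10.0,
    "trigram_phrase_delimiter": b"_",
    "vector_size": 100,
    "window_size": 5,
    "min_count": 5,
    "worker_count": 4,
    "sg": 1,
    "negative_size": 5,
    "alpha": 0.025,
    "min_alpha": 0.0001,
    "train_epoch": 5,
}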
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    in_dir, model_out = sys.argv[1:]
    sentences = Corpus(in_dir)
    phrases = Phrases(sentences)
    phrases.save(model_out)
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [
        utils.tokenize(line)
        for line, label in zip(open('./data/%s.csv' % corpus_filename),
                               open('./data/corpus-labels.csv'))
        if category in label
    ]
    print 'First token', tokens[1]

    category_filename = corpus_filename.replace('corpus', 'category')

    # Each category gets its own dictionary and its own corpus, but uses the same bigram model
    # that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(
                utils.tokenize(line)
                for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                       open('./data/corpus-labels.csv'))
                if category in label)
            Phrases.save(bigram, './data/%s/bigram.bin' % category)
        else:
            # fixed: the original passed the literal './data/%s/bigram.bin' without
            # interpolating the category
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    # Make the dictionary, a collection of statistics about all tokens in the corpus
    # This is the mapping from words to their ids. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # words that appear only once
    dictionary.filter_extremes(no_above, no_below)  # no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' % (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
def get_bigram_model():
    model_exists = os.path.exists(bigram_model_filepath)
    if model_exists:
        bigram_model = Phrases.load(bigram_model_filepath)
    else:
        unigram_sentences = get_unigram_sentences()
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    return bigram_model
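
# A short usage sketch for the helper above. Wrapping the trained Phrases model
# in a Phraser is optional but makes repeated transformations faster and lighter
# in memory; the sample sentence is illustrative, not from the original source.
from gensim.models.phrases import Phraser

bigram_model = get_bigram_model()
bigram_phraser = Phraser(bigram_model)  # read-only, faster view of the model

tokens = ['the', 'new', 'york', 'times', 'reported', 'the', 'story']
print(bigram_phraser[tokens])  # e.g. ['the', 'new_york', 'times', ...] if that bigram was learned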
def generateBigrams(sentences):
    bigram_transformer = Phrases(sentences, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=3)
    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(sentences):
        fd.write(u'{0} {1}\n'.format(phrase, score))  # one phrase/score pair per line
    fd.close()
    return bigram_transformer
def learnMultiword(ret):
    print("Learning multiword expressions")
    bigram = Phrases(ret)
    bigram.save("phrase_all.model")

    print("Sanity checking multiword expressions")
    test = "i like donald trump and hate muslims , go hillary , i like jesus , jesus , against , abortion "
    sent = test.split(" ")
    print(bigram[sent])
    return bigram[ret]
def train_phrase_model(cleaned_filepath, bigram_model_filepath, run_or_load_flag):
    if run_or_load_flag:
        unigram_sentences = LineSentence(cleaned_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    else:
        bigram_model = Phrases.load(bigram_model_filepath)
    return bigram_model
def extract_phrases(doc_words, save=False):
    logging.info("Extracting phrases...")
    global bigram
    global trigram
    bigram = Phrases(doc_words, threshold=5, min_count=5)
    trigram = Phrases(bigram[doc_words], threshold=3, min_count=3)
    if save:
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")
    return trigram[bigram[doc_words]]
def bigram():
    # One time use, prepare bigram model
    unigram_sentences = LineSentence('../Dataset/unigram_sentences_all.txt')
    bigram_model_filepath = '../Models/bigram_model_all'
    bigram_sentences_filepath = '../Dataset/bigram_sentences_all.txt'
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')
def trigram():
    # One time use, prepare trigram model
    bigram_sentences = LineSentence('../Dataset/bigram_sentences_all.txt')
    trigram_model_filepath = '../Models/trigram_model_all'
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
    trigram_sentences_filepath = '../Dataset/trigram_sentences_all.txt'
    trigram_reviews_filepath = '../Dataset/trigram_transformed_reviews_all.txt'
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(trigram_sentences_filepath)
def generate_trigram(input_file_path, bigram_save_path, trigram_save_path, final_file_path):
    unigram_sentence = LineSentence(input_file_path)
    bigram = Phrases(unigram_sentence)
    bigram.save(bigram_save_path)

    # temporarily store the bigram sentences so the trigram model can be trained on them
    temp_file_path = os.path.join(Config.temp_directory, "temp_bigram.txt")
    __gram_to_text(unigram_sentence, bigram, temp_file_path)

    bigram_sentences = LineSentence(temp_file_path)
    trigram = Phrases(bigram_sentences)
    trigram.save(trigram_save_path)
    __gram_to_text(bigram_sentences, trigram, final_file_path)
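
# The snippet above relies on a private __gram_to_text helper that is not shown.
# A minimal sketch of what such a helper would need to do, assuming one phrased,
# space-joined sentence per output line (the exact original implementation may differ).
import codecs

def __gram_to_text(sentences, phrase_model, output_path):
    # Apply the phrase model to each tokenized sentence and write the result,
    # one sentence per line, so LineSentence can re-read the file.
    with codecs.open(output_path, 'w', encoding='utf_8') as f:
        for sentence in sentences:
            f.write(u' '.join(phrase_model[sentence]) + '\n')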
def trainPhrasesModel(tweets):
    """
    Train phrases model, experimental, not used
    :param tweets: list of tokenised tweets
    :return:
    """
    print("Learning multiword expressions")
    bigram = Phrases(tweets)
    bigram.save("../out/phrase_all.model")

    print("Sanity checking multiword expressions")
    test = "i like donald trump , go hillary clinton , i like jesus , jesus , legalisation abortion "
    sent = test.split(" ")
    print(bigram[sent])
    return bigram[tweets]
def generate_bigrams_trigrams(self):
    unigram_sentences = LineSentence(self.unigram_sentences_filepath)
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(self.bigram_model_filepath)
    f = open(self.bigram_sentences_filepath, 'w')
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')
    f.close()
    bigram_sentences = LineSentence(self.bigram_sentences_filepath)
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(self.trigram_model_filepath)
    f = open(self.trigram_sentences_filepath, 'w')
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')
    f.close()  # the original left the trigram file handle open
def phrase_detect_train(sentances, min_count, threshold, common_terms, phrase_model_save_path=None):
    """
    input:
        sentances: tokenized sentences
    """
    print('Transforming sentences with the phrase model .........\n')
    bi_phrases = Phrases(sentances, min_count=min_count, threshold=threshold, common_terms=common_terms)
    bigram_transformer = Phraser(bi_phrases)
    if phrase_model_save_path is not None:
        bi_phrases.save(phrase_model_save_path)
        bigram_transformer.save(phrase_model_save_path + '_transformer')
    sentances = list(bigram_transformer[sentances])
    # if you want to check the detected phrases
    pharses_list = list(bigram_transformer.phrasegrams)
    print('Phrase model training done.')
    return sentances
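
# A usage sketch for phrase_detect_train above, assuming gensim 3.x (where
# Phrases still accepts common_terms). The toy corpus and parameter values are
# illustrative only; with real data you would use larger min_count/threshold.
tokenized = [
    ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'],
    ['she', 'moved', 'to', 'new', 'york', 'last', 'year'],
]

phrased = phrase_detect_train(
    tokenized,
    min_count=1,
    threshold=0.5,
    common_terms=frozenset(['the', 'a', 'of', 'to']),
    # phrase_model_save_path='model/phrases',  # optional; directory must already exist
)
print(phrased)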
def generate_bow(corpus_filename, use_bigrams, no_above, no_below):
    tokens = [
        utils.tokenize(line)
        for line in open('./data/%s.csv' % corpus_filename)
    ]
    print 'First token', tokens[1]

    if use_bigrams:
        if not os.path.exists('./data/bigram.bin'):
            print "data/bigram.bin doesn't exist. Generating and saving bigram model. This could take a while."
            bigram = Phrases(
                tokenize(line)
                for line in open('./data/%s.csv' % corpus_filename))
            bigram.save('./data/bigram.bin')
        bigram = Phrases.load('./data/bigram.bin')
        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    # Make the dictionary, a collection of statistics about all tokens in the corpus
    # This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # words that appear only once
    dictionary.filter_extremes(no_above, no_below)  # no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    # store the dictionary, for future reference
    dictionary.save('./data/%s.dict' % corpus_filename)

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for token in tokens:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(token)

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s.mm' % corpus_filename, arxiv_bow)  # store to disk, for later use
def train():
    sents = LineSentence(args.sents)

    bi_path = os.path.join(args.save_dir, bi_model)
    print 'Bigram: ', bi_path
    if os.path.exists(bi_path):
        bigram = Phrases.load(bi_path)
    else:
        bigram = Phrases(sents, min_count=args.min_count, threshold=args.bi_threshold)
        bigram.save(bi_path)

    tri_path = os.path.join(args.save_dir, bi_model + '_' + tri_model)
    print 'Trigram: ', tri_path
    trigram = Phrases(bigram[sents], min_count=args.min_count, threshold=args.tri_threshold)
    trigram.save(tri_path)
def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                # skip phrases containing digits (bytes pattern, since export_phrases
                # yields bytes phrases in this gensim version)
                if re.search(br'\d+', phrase):
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")
    return trigram[bigram[reviews_docs]]
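
# Illustrative call for extract_phrases above: it learns collocations from
# sentence-level token lists (reviews_sents) but transforms document-level token
# lists (reviews_docs). The toy data is made up, and with min_count=5 such a
# tiny corpus would not actually produce merged phrases; this only shows the
# calling convention. Assumes the imports used by the function (logging, re,
# gensim) are already in place.
reviews_sents = [
    ['battery', 'life', 'is', 'great'],
    ['the', 'battery', 'life', 'could', 'be', 'better'],
]
reviews_docs = [
    ['battery', 'life', 'is', 'great', 'the', 'battery', 'life', 'could', 'be', 'better'],
]

phrased_docs = extract_phrases(reviews_sents, reviews_docs, save=False)
for doc in phrased_docs:
    print(doc)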
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # Note: dict.update() returns None, so build the defaults first and then
    # merge the caller's kwargs into them (the original assigned the None).
    defaults = {'max_vocab_size': 40000000, 'threshold': 8.}
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
def personel_information(): #name,phone number, address, linkedin, github #Eric’s section (name, email, phone, university, major, gpa) Done #import path & variables resume_path = r"/Users/yiyangzhou/Desktop/Yiyang (Eric) Zhou Resume 2017 Fall.txt" resume_file = open(resume_path).read() resume_file2 = open(resume_path).read() resume_file2 = resume_file2.lower() #change path major_df = pandas.read_excel('majors.xlsx') major_df.columns major_file = major_df['Majors'].values major_lower = [item.lower() for item in major_file] tokenizer = RegexpTokenizer(r'\w+') resume_token = tokenizer.tokenize(resume_file) resume_token2 = tokenizer.tokenize(resume_file2) major_distinct = [] dictionary = {'Name': 5} regular_expression = re.compile(r"/BA|BS|Bachelor of Science|Bachelor of Arts|BBA |B/A|Bachelor of Business Administration/", re.IGNORECASE) bach_major_result = re.search(regular_expression, resume_file) regular_expression_two = re.compile(r"minor|Minor", re.IGNORECASE) minor_result = re.search(regular_expression_two, resume_file) regular_expression_three = re.compile(r"Master|master", re.IGNORECASE) master_major_result = re.search(regular_expression_three, resume_file) regular_expression_four = re.compile(r"university", re.IGNORECASE) university_major_result = re.search(regular_expression_four, resume_file) updated_majors1 = [] indexes_majors1 = [] updated_majors2 = [] indexes_majors2 = [] updated_majors3 = [] indexes_majors3 = [] updated_majors4 = [] indexes_majors4 = [] majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4 university_df1 = pandas.read_excel('China_University.xlsx') university_df2 = pandas.read_excel('India_University.xlsx') university_df3 = pandas.read_excel('US_University.xlsx') university_file1 = university_df1['Universities'].values university_file2 = university_df2['Universities'].values university_file3 = university_df3['Universities'].values university_lower1 = [item.lower() for item in university_file1] university_lower2 = [item.lower() for item in university_file2] university_lower3 = [item.lower() for item in university_file3] university_combined = university_lower1 + university_lower2 + university_lower3 #extract name finished def extract_first_name(resume): name = resume.split('\n', 1)[0] first_name = name.split(' ', 1)[0] return (first_name) print (first_name) def extract_last_name(resume): name = resume.split('\n', 1)[0] last_name = name.split(' ', 1)[-1] return (last_name) print (last_name) def extract_name(resume): name = extract_first_name(resume_file) + extract_last_name(resume_file) print (name) #extract email finished def extract_email(resume): regular_expression = re.compile(r"(\w+[.|\w])*@(\w+[.])*\w+", re.IGNORECASE) result = re.search(regular_expression, resume) if result: result = result.group() print (result) #extract phone number finished def check_phone_number1(resume): resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',)) result = re.findall(r"\d{10}", resume2) result = ''.join(result) return (result) def check_phone_number2(resume): resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',)) result = re.findall(r"\d{11}", resume2) result = ''.join(result) result = result[1:11] return (result) def extract_phone_number(resume): try: return check_phone_number1(resume) print (check_phone_number1(resume)) except: return check_phone_number2(resume) print (check_phone_number2(resume)) def personel_information(resume): print(extract_name(resume)) print(extract_email(resume)) 
print(extract_phone_number(resume)) #execution of extracting name, email, phone number personel_information(resume_file) #major, University, gpa def get_bigrams(input): n = 2 result = [] bigrams = ngrams(input, n) for grams in bigrams: x = "%s %s" % grams result.append(x) return (result) print (result) def get_threegrams(input): n = 3 result = [] threegrams = ngrams(input, n) for grams in threegrams: x = "%s %s %s" % grams result.append(x) return (result) print (result) def get_fourgrams(input): n = 4 result = [] fourgrams = ngrams(input, n) for grams in fourgrams: x = "%s %s %s %s" % grams result.append(x) return (result) print (result) def get_fivegrams(input): n = 5 result = [] fivegrams = ngrams(input, n) for grams in fivegrams: x = "%s %s %s %s %s" % grams result.append(x) return (result) print (result) def get_sixgrams(input): n = 6 result = [] sixgrams = ngrams(input, n) for grams in sixgrams: x = "%s %s %s %s %s %s" % grams result.append(x) return (result) print (result) def get_majors(a,b): majors=[] for x in a: if x in b: majors.append(x) return (majors) print (majors) def get_majors2(a,b): unigram_major = get_majors(a, b) bigram_major = get_majors(get_bigrams(a), b) threegram_major = get_majors(get_threegrams(a), b) combined_majors_list = unigram_major + bigram_major + threegram_major for i in combined_majors_list: if i not in major_distinct: major_distinct.append(i) print (major_distinct) def get_majors_index(major_distinct): for i, element in enumerate(major_distinct): x = resume_file2.find(element) dictionary[element] = x del dictionary['Name'] print(dictionary) def get_bach_index(bach_major_result): if bach_major_result: bach_major_result = bach_major_result.group() print (bach_major_result) if bach_major_result is not None: bach_major_index = resume_file.find(bach_major_result) return(bach_major_index) print(bach_major_index) def get_minor_index(minor_result): if minor_result: minor_result = minor_result.group() print (minor_result) if minor_result is not None: minor_index = resume_file.find(minor_result) return(minor_index) print(minor_index) def get_master_index(master_major_result): if master_major_result: master_major_result = master_major_result.group() print (master_major_result) if master_major_result is not None: master_major_index = resume_file.find(master_major_result) return(master_major_index) print(master_major_index) def get_university_index(university_major_result): if university_major_result: university_major_result = university_major_result.group() print (university_major_result) if university_major_result is not None: university_major_index = resume_file.find(university_major_result) return(university_major_index) print(university_major_index) def get_bach_major(dictionary): bach_major_index = get_bach_index(bach_major_result) upper_bound = bach_major_index +100 for k, v in dictionary.items(): if (bach_major_index < v < upper_bound): updated_majors1.append(k) indexes_majors1.append(v) print(updated_majors1) print(indexes_majors1) def get_master_major(dictionary): master_major_index = get_master_index(master_major_result) upper_bound = master_major_index +100 for k, v in dictionary.items(): if (master_major_index < v < upper_bound): updated_majors2.append(k) indexes_majors2.append(v) print(updated_majors2) print(indexes_majors2) def get_minor(dictionary): minor_index = get_minor_index(minor_result) upper_bound = minor_index +100 for k, v in dictionary.items(): if (minor_index < v < upper_bound): updated_majors3.append(k) indexes_majors3.append(v) 
print(updated_majors3) print(indexes_majors3) def get_university_major(dictionary): university_major_index = get_university_index(university_major_result) upper_bound = university_major_index +100 for k, v in dictionary.items(): if (university_major_index < v < upper_bound): updated_majors4.append(k) indexes_majors4.append(v) print(updated_majors4) print(indexes_majors4) def extract_major(majors_minors_all): majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4 majors_minors_final_list = list(dedupe(majors_minors_all)) return (majors_minors_final_list) print (majors_minors_final_list) #execution of extracting majors: get_majors(resume_token2, major_lower) get_majors2(resume_token2, major_lower) get_majors_index(major_distinct) get_bach_index(bach_major_result) get_minor_index(minor_result) get_master_index(master_major_result) get_university_index(university_major_result) get_bach_major(dictionary) get_master_major(dictionary) get_minor(dictionary) get_university_major(dictionary) print (extract_major(majors_minors_all)) #extract University: def get_university(a,b): resume_university=[] for x in a: if x in b: resume_university.append(x) return (resume_university) print (resume_university) def extract_university(resume_token_lower,university_combined): unigram_university = get_university(resume_token_lower, university_combined) bigram_university = get_university(get_bigrams(resume_token_lower), university_combined) threegram_university = get_university(get_threegrams(resume_token_lower), university_combined) fourgram_university = get_university(get_fourgrams(resume_token_lower), university_combined) fivegram_university = get_university(get_fivegrams(resume_token_lower), university_combined) sixgram_university = get_university(get_sixgrams(resume_token_lower), university_combined) combined_university_extraction = set(bigram_university + threegram_university + fourgram_university + fivegram_university + sixgram_university) print (combined_university_extraction) #execution of extracting university: extract_university(resume_token2,university_combined) #extract GPA: def extract_GPA(resume): result = re.search(r'(GPA|gpa): ?\d.\d{1,}',resume) if result: result = result.group(0) return (result) print (result) #execution of extracting GPA: extract_GPA(resume_file) #HENRY #Extracting Address import re import usaddress #extract the address def extract_address (text): text = text.replace('\n', ' ') regex = re.compile(r"[0-9]+ .*[.,-]? .*[.,-]? ([A-Z]{2}|\w+)[.,-]? [0-9]{5}(-[0-9]{4})?") result = re.search(regex, text) if result: result = result.group() return result #Parse the address components def parse_address(result): address = usaddress.tag(result) return address #2. 
Extracting Company import codecs import os import pandas as pd from fuzzywuzzy.process import dedupe import spacy from nltk.corpus import stopwords filename = 'BrandonThomasResume.txt' #Open file def open_file(filename): resume = open(filename, 'r', errors='ignore').read() return resume resume = open_file(filename) #Read the Work_Experience_List data = pd.read_excel("Work Experience.xlsx", header=0) experience_list = list(data['Example']) #Find the experience header def find_exp_header (resume): exp_header_list=[] for word in experience_list: if resume.find(word) != -1: exp_header_list.append(word) #remove duplicates of experience header exp_header = list(dedupe(exp_header_list)) return exp_header exp_header = find_exp_header(resume) exp_header = (exp_header[0], resume.find(exp_header[0])) #Find next section header def find_next_section (resume): #Find all capitalized words next_section_upper = re.findall(r'([A-Z]{3,}( [A-Z]+)?( [A-Z]+)?( [A-Z]+)?)', resume[(exp_header[1] + len(exp_header[0])+ 1):]) next_section_upper = list((itertools.chain.from_iterable(next_section_upper))) #Find all words with the first letter capitalized next_section_lower = re.findall(r'([A-Z]{1}\w+( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?)', resume[(exp_header[1] + len(exp_header[0])+ 1):]) next_section_lower = list((itertools.chain.from_iterable(next_section_lower))) #Combine into a list next_section_list = next_section_upper + next_section_lower #if one of the items matches items in section list, that item is the next section header next_section=() for item in next_section_list: if item in section_list and (resume[resume.find(item)+len(item)]=='\n' or resume[resume.find(item)-1]=='\n'): next_section = (item, resume.find(item)) break return next_section next_section = find_next_section(resume) # Get the section of Work_Experience def get_workexp_section(resume): if next_section: workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):next_section[1]]) else: workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):]) return workexp_section workexp_section = get_workexp_section(resume) workexp_section = workexp_section.split('\n') #Remove the detail and get the experience information def get_exp_info(work_exp): company_info=[] temp_str='' for i, sent in enumerate(work_exp): if sent != '': #Everything before the bullet will be put into one sentence, for one company if not sent.startswith(('•','', u'\uf095', '§', '§')): temp_str += sent + ' ' else: if not work_exp[i-1].startswith(('•','', u'\uf095', '§', '§')): company_info.append(temp_str) temp_str='' return company_info company_info = get_exp_info(workexp_section) #Print the company info for i, company in enumerate(company_info): company = company.replace('\t', '') print('\nCompany {}:'.format(i+1), company) nlp = spacy.load('en') #Parse company info components def extract_exp_info(company_info, filename): count = 0 print(filename) for i, sent in enumerate(company_info): sent = sent.replace('\t', '') parsed_sent = nlp(sent) print('\nCompany {}'.format(i+1)) company='' location='' time='' role='' for i ,token in enumerate(parsed_sent): if token.ent_type_ =='ORG': company += ' ' + str(token) elif token.ent_type_ =='GPE': location += ' ' + str(token) elif token.ent_type_ =='DATE' or token.ent_type_ =='TIME': time += ' ' + str(token) elif token.ent_type_ =='': if str(token).isalpha() and str(token) not in stopwords.words('english'): role += ' ' + str(token) print('Company: {}'.format(company)) print('Location: {}'.format(location)) print('Time: 
{}'.format(time)) print('Role: {}'.format(role)) extract_exp_info(company_info, filename) #3. Extract Skills (Just Skills) import nltk import pandas as pd import os import codecs from gensim.models import Phrases import re #Read the Skill_List.xlsx data = pd.read_excel("Skills.xlsx", header=0) skill_list = list(data['Skill Names']) skill_list = set(skill_list) skill_list= [skill.lower() for skill in skill_list] filename ='all_text1.txt' trained_resume_path = os.path.join('Trained Resumes', filename) resume_text = open(trained_resume_path, 'r', encoding='utf_8').read() special_characters = ['!','#', '$', '%','&','*','-', '/', '=','?', '^','.','_','`', '{', '|', '}','~', "'", ',', '(',')', ':', '•', '§' ] # Processing text def resume_processing (resume_text): #tokenize sentences resume_sents = nltk.sent_tokenize(resume_text) #tokenize words resume_words = [nltk.word_tokenize(sent) for sent in resume_sents] #remove stopwords and special characters processed_resume=[] for sentence in resume_words: sent = [w.lower() for w in sentence if w.lower() not in stopwords.words('english') and w.lower() not in special_characters] processed_resume.append(sent) return processed_resume unigram_resume = resume_processing(resume_text) #Create bigram model bigram_model_path = 'bigram_model' bigram_model = Phrases(unigram_resume) bigram_model.save(bigram_model_path) # Create bigram words def create_bigram (unigram_resume): bigram_model = Phrases.load(bigram_model_path) bigram_resume = [bigram_model[sentence] for sentence in unigram_resume] return bigram_resume bigram_resume = create_bigram(unigram_resume) #Create trigram model trigram_model_path = 'trigram_model' trigram_model = Phrases(bigram_resume) trigram_model.save(trigram_model_path) # Create trigram words def create_trigram (bigram_resume): trigram_model = Phrases.load(trigram_model_path) trigram_resume = [trigram_model[sentence] for sentence in bigram_resume] return trigram_resume trigram_resume = create_trigram(bigram_resume) #Normalize bigram/trigram words def normalize_words (trigram_resume): for sentence in trigram_resume: for i, word in enumerate(sentence): if len(re.findall(r'\w+\_\w+', word))!= 0: sentence[i] = re.sub('_', ' ', word) return trigram_resume normalized_resume = normalize_words(trigram_resume) #label skills in the resume def labeled_word (sentence): labels=[] for word in sentence: if word in skill_list: labels.append((word, 'skill')) else: labels.append((word, 'not skill')) return labels labeled_words=[labeled_word(sentence) for sentence in normalized_resume] #Get 25 similar words based on word2vec model def similar_prob(word): count = 0 terms = get_related_terms(word,25) for w in terms: if skill_series.isin([w]).any(): count+=1 return count/25 #Check if the word is in skill clusters, based on KMeans algorithm def in_skill_cluster(word): if word in skills: return True return False #extract featurres of skills def extract_features (sentence, i): features={} #first feature: evaluate if that word is in skill list features["({})in_skill_list".format(sentence[i])]= (sentence[i] in skill_list) if sentence[i] in res2vec.wv.vocab: features["probality_of_similar_words_skills"] = similar_prob(sentence[i]) features["in_skill_cluster"] = in_skill_cluster(sentence[i]) #if the word is in begining of the sentence, return <Start> for prev_word if i==0 and len(sentence)-1 != 0: features["prev_word_in_skill_list"]= '<Start>' features["next_word_in_skill_list"]= (sentence[i+1] in skill_list) #if the word is in begining of the sentence, return <End> for 
next_word elif i == len(sentence)-1 and i != 0: features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list) features["next_word_in_skill_list"]= '<End>' #if the sentence has only 1 word, return False for both prev_word and next_word elif i==0 and len(sentence)-1 == 0: features["prev_word_in_skill_list"]= False features["next_word_in_skill_list"]= False else: features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list) features["next_word_in_skill_list"]= (sentence[i+1] in skill_list) return features featuresets=[] for labeled_sent in labeled_words: unlabeled_sent = [word[0] for word in labeled_sent] for i, (w, label) in enumerate(labeled_sent): featuresets.append((extract_features(unlabeled_sent, i), label)) #Save the features in a file featuresets_file = 'features_file.txt' file = open(featuresets_file, 'w', encoding='utf_8') file.write('\n'.join('%s %s' % item for item in featuresets )) size = int(len(featuresets)*0.1) train_set = featuresets[size:] test_set = featuresets[:size] #Train the data with NaiveBayes model classifier = nltk.NaiveBayesClassifier.train(train_set) #Evaluate the accuracy nltk.classify.accuracy(classifier, test_set) #Extract the skills def extract_skills(normalized_test_res, resume_number, filename): skills =[] for sent in normalized_test_res: for (i,_) in enumerate(sent): if classifier.classify(extract_features(sent, i))=='skill': skills.append(sent[i]) extracted_skills = set(skills) print('\nResume {}:{} ({} skills)\n'.format(resume_number+1,filename, len(extracted_skills)), extracted_skills) #VAIBHAV #Import Statements import csv import re #Email Address (Finished) def check_email(string_to_search): regular_expression = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,3}", re.IGNORECASE) result = re.search(regular_expression, string_to_search) if result: result = result.group() return result #except: # result=0 # return result #LinkedIn Address(Finished) def check_linkedin(string_to_search): regular_expression1 = re.compile(r"https://" r"[A-Z]{2,3}" r".linkedin.com/in/" r"[-_a-z 0-9]{5,30}", re.IGNORECASE) result = re.search(regular_expression1, string_to_search) try: result = result.group() return result except: regular_expression1 = re.compile(r"[A-Z]{2,3}" r".linkedin.com/in/" r"[-_a-z 0-9]{5,30}", re.IGNORECASE) result = re.search(regular_expression1, string_to_search) try: result=result.group() return result except: regular_expression1 = re.compile(r"[A-Z]{2,3}" r".linkedin.com/" r"[-_a-z 0-9]{5,30}", re.IGNORECASE) result = re.search(regular_expression1, string_to_search) try: result=result.group() return result except: return None #GitHub Address (Finished) def check_GitHub(string_to_search): regular_expression = re.compile(r"https://github.com/" r"[-_A-Z0-9]{5,30}", re.IGNORECASE) result = re.search(regular_expression, string_to_search) try: result = result.group() return result except: return None #Contact Number (Finished) def check_phone_number(string_to_search): try: regular_expression = re.compile(r"\(?" # open parenthesis r"(\d{3})?" # area code r"\)?" # close parenthesis r"[\s\.-]{0,2}?" 
# area code, phone separator r"(\d{3})" # 3 digit exchange r"[\s\.-]{0,2}" # separator bbetween 3 digit exchange, 4 digit local r"(\d{4})", # 4 digit local re.IGNORECASE) result = re.search(regular_expression, string_to_search) if result: result = result.groups() result = "-".join(result) return result except: return None def main(): # with open('Resume_Test.txt', 'r',encoding="utf8") as myfile: with open('Resume_Test.txt', 'r') as myfile: data=myfile.read().replace('\n',' **** ') result=check_email(data) result_L=check_linkedin(data) result_P=check_phone_number(data) result_G=check_GitHub(data) print("Email Address:",result) print("Contact Number:",result_P) print("Linkedin Profile:",result_L) print("GitHub Profile:",result_G) #print(data) main() #Ashish (LinkedIn Profiles and Every other URL in the file) # import all headers import re import os # function to extract all URLs # implemented using regex def extract_URLs(parsedResume): parsedResume = parsedResume.replace('\n', ' ') regex = regex = re.compile('(?:(?:https?|ftp|file)://|www\.|ftp\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.IGNORECASE) result = re.findall(regex, parsedResume) #if result: #result = result.group() return result # function to extract LinkedIN Profile # implemented using regex def extract_linkedin(parsedResume): parsedResume = parsedResume.replace('\n', ' ') regex = re.compile(r"https://www.linkedin.com/in/([a-zA-Z]|[0-9]|[-])+/?") result = re.search(regex, parsedResume) if result: result = result.group() return result # TESTING # path where all resumes are located test_resume_path = '/Users/Ashish/Desktop/Internship/Personal/Test Resumes' counter = 0 print("URLs in Test Resumes") for filename in os.listdir(test_resume_path): # print(filename) if '.txt' in filename: counter = counter + 1 resume_path= os.path.join('Test Resumes', filename) test_resume = open(resume_path, 'r').read() print("Resume ", (counter), ":") print("All URLs => ", extract_URLs(test_resume)) print("LinkedIn Profiles => ", extract_linkedin(test_resume))
    ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
    ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
    ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
    ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
]

# Change to use less memory. Default is 40m.
max_vocab_size = 40000000

# Train up to trigrams.
print('Training bigrams...')
bigram = Phrases(doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

print('Saving...')
bigram.save('bigram_model.phrases')

print('Training trigrams...')
trigram = Phrases(bigram[doc_stream(paths, n)], max_vocab_size=max_vocab_size, threshold=10.)

print('Saving...')
trigram.save('trigram_model.phrases')

print('Done.')

#print('Loading bigrams...')
#bigram = Phrases.load('bigram_model.phrases')

#print('Loading trigrams...')
#trigram = Phrases.load('trigram_model.phrases')
import os
import codecs
import itertools as it

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import settings

lemmatized = LineSentence(os.path.join(settings.DATA_PATH, 'lemmatized.txt'))

bigram_model_filepath = os.path.join(settings.DATA_PATH, 'bigram_model')
if 0 == 1:
    bigram_model = Phrases(lemmatized)
    bigram_model.save(bigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

bigram_sentences_filepath = os.path.join(settings.DATA_PATH, 'bigram_sentences.txt')
if 0 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized:
            bigram_sentence = ' '.join(bigram_model[sentence])
            f.write(bigram_sentence + '\n')
bigram_sentences = LineSentence(bigram_sentences_filepath)

trigram_model_filepath = os.path.join(settings.DATA_PATH, 'trigram_model')
if 0 == 1:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
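
# The script above stops after saving the trigram model. A hedged continuation
# sketch in the same style: the `if 0 == 1` guard and the trigram_sentences.txt
# filename are assumptions mirroring the bigram steps, not part of the original.
trigram_model = Phrases.load(trigram_model_filepath)

trigram_sentences_filepath = os.path.join(settings.DATA_PATH, 'trigram_sentences.txt')
if 0 == 1:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in bigram_sentences:
            f.write(' '.join(trigram_model[sentence]) + '\n')
trigram_sentences = LineSentence(trigram_sentences_filepath)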
            ct += 1
            if ct % 50000 == 0:
                print ct
            if line.strip() == '</VISIT>':
                text = read_visit(st)
                text = tokenizer.tokenize(text)
                for sent in text:
                    yield nltk.word_tokenize(sent.lower())
                st = []
            elif line.strip() != '<VISIT>':
                st += [line.strip()]
    except IOError:
        pass


f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

print 'BIGRAMS'
bigram = Phrases(next_note(tokenizer), delimiter='')
bigram.save('bigrams.pk')

print 'TRIGRAMS'
trigram = Phrases(bigram[next_note(tokenizer)], delimiter='')
trigram.save('trigrams.pk')

print '4GRAMS'
ngram = Phrases(trigram[next_note(tokenizer)], delimiter='')
ngram.save('ngrams.pk')
def link_twoWords(file):
    # Note: `unigram_sentences` and `bigram_model_filepath` are assumed to be
    # defined at module level; the `file` argument is unused in this snippet.
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)
    bigram_model = Phrases.load(bigram_model_filepath)
#%%
########################
### bigram and trigram##
########################
data_path = os.path.join('data', 'tokenized_sentances.p')
bigram_model_path = os.path.join('data', 'bigram_model')
trigram_model_path = os.path.join('data', 'trigram_model')

#%%
## first train a bigram and trigram model on a large dataset and save them
sentances = pickle.load(open(data_path, 'rb'))

#%%
print('Training bigram model .........\n')
bi_phrases = Phrases(sentances, min_count=5, threshold=15)
bi_phrases.save(bigram_model_path)
bigram_transformer = Phraser(bi_phrases)
bigram_transformer.save(os.path.join('data', 'bigram_transformer'))

sentances = list(bigram_transformer[sentances])

tri_phrases = Phrases(sentances, min_count=5, threshold=10)
tri_phrases.save(trigram_model_path)  # fixed: the original saved bi_phrases to the trigram path
trigram_transformer = Phraser(tri_phrases)
trigram_transformer.save(os.path.join('data', 'trigram_transformer'))

#sentances = list(trigram_transformer[sentances])

## if you want to check pharses list
pharses_list = list(tri_phrases.vocab.keys())

#print('Dump to Pickle')
#pickle.dump(sentances,open(out_path, "wb"))
# build unigram sentence corpus (to train the bigram model later)
if 1 == 0:
    with codecs.open(unigram_sentences_path, 'w', encoding='utf-8') as f:
        for sentence in lemmatized_sentence_corpus(replaced_documents):
            f.write(sentence + '\n')

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

# load unigram sentence corpus
unigram_sentences = LineSentence(unigram_sentences_path)

# training bigram_model
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_path)


def lemmatized_document(list_of_documents):
    """
    generator function to use spaCy to parse documents,
    lemmatize them, and yield one list of lemmas per document
    """
    for parsed_review in nlp.pipe(list_of_documents, batch_size=10000, n_threads=4):
        yield [
            token.lemma_ for token in parsed_review
            if not punct_stop_space(token)
        ]
def phrases():
    p = Phrases(sentences=process_corpus('/Users/valeriyischenko/local/projects/lingua_hack/Text'))
    p.save('../wiki/text_phrase_model_p3')
def tokenize_stem_stop(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for word in nltk.word_tokenize(text) if word not in stopwords]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z0-9]', token):
            filtered_tokens.append(token)
    # stem the filtered tokens (the original stemmed `tokens`, which discarded the filtering step)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def makesent(path):
    result = []
    for line in open(path):
        result.append(tokenize_stem_stop(line.rsplit(' ', 1)[0].lower()))
    return result


sentences = makesent('/Users/yjiang/Documents/pythonWorkspace/freqCounter/data/agg_self_in_200.txt')
print(sentences)

bigram = Phrases(sentences, min_count=1, threshold=1)
bi_sent = bigram[sentences]
trigram = Phrases(bigram[sentences], min_count=1, threshold=1)

#==============================================================================
# for phrase, score in trigram.export_phrases(bi_sent):
#     print(u'{0} {1}'.format(phrase, score))
#==============================================================================

bigram.save('model/bigram')
trigram.save('model/trigram')