def preprocess_data(train_data, test_data):
    custom_stopwords = set(ENGLISH_STOP_WORDS)
    custom_stopwords.update(["say", "says", "said", "saying", "just", "year", "man", "men", "woman",
                             "women", "guy", "guys", "run", "running", "ran", "do", "don't", "does", "doesn't",
                             "doing", "did", "didn't", "use", "used", "continue", "number", "great", "big", "good", "bad",
                             "better", "worse", "best", "worst", "actually", "fact", "way", "tell", "told", "include", "including",
                             "want", "wanting", "will", "won't", "give", "given", "month", "day", "place", "area", "look",
                             "looked", "far", "near", "get", "getting", "got", "know", "knows", "knew", "long", "week", "have",
                             "has", "haven't", "hasn't", "having", "had", "hadn't", "not", "think", "thinking", "Monday",
                             "Tuesday", "Wednesday", "Thursday", "Saturday", "Sunday", "high", "low", "thing", "there", "they're",
                             "It", "I've", "I'd", "He's", "She's", "They've", "I'm", "You're", "your", "their", "his", "hers",
                             "mine", "today", "yesterday", "it", "ve", "going", "go", "went", "lot", "don", "saw", "seen", "come", "came"])

    titled_train_data = add_titles(train_data['Content'], train_data['Title'])
    if test_data is not None:
        titled_test_data = add_titles(test_data['Content'], test_data['Title'])

    # Removing stopwords:
    new_train_data = []
    for doc in titled_train_data:
        doc_wordlist = doc.split()
        new_doc_wordlist = [word for word in doc_wordlist if word not in custom_stopwords]
        new_doc = ' '.join(new_doc_wordlist)
        new_train_data.append(new_doc)
    if test_data is not None:
        new_test_data = []
        for doc in titled_test_data:
            doc_wordlist = doc.split()
            new_doc_wordlist = [word for word in doc_wordlist if word not in custom_stopwords]
            new_doc = ' '.join(new_doc_wordlist)
            new_test_data.append(new_doc)

    p = PorterStemmer()
    train_docs = p.stem_documents(new_train_data)
    if test_data is not None:
        test_docs = p.stem_documents(new_test_data)
    print "my_method: Stemmed data."

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_docs)
    if test_data is not None:
        Test = vectorizer.transform(test_docs)
    else:
        Test = None
    print "my_method: Vectorized data"

    svd_model = TruncatedSVD(n_components=200)  # random_state=13
    X = svd_model.fit_transform(X)
    if test_data is not None:
        Test = svd_model.transform(Test)
    print "SVD'd data"
    return X, Test
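# --- Usage sketch (added illustration; not from the original source). ---
# train_df / test_df are placeholder names for pandas DataFrames holding the
# 'Title' and 'Content' columns this function expects; add_titles and the
# ENGLISH_STOP_WORDS / TfidfVectorizer / TruncatedSVD names are assumed to be
# the ones imported above. A corpus of reasonable size is assumed, since the
# 200-component SVD needs at least that many tf-idf features.
#
# X, Test = preprocess_data(train_df, test_df)   # dense 200-dim LSA features
# X_only, _ = preprocess_data(train_df, None)    # the test set is optional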
def assign_country_label_ids(country_scores, label_score, num_candidates, use_label_candidates):
    """Output: Dictionary --> key = country, value = label"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()

    if use_label_candidates is True:
        # print('USING SOFT LABELING')
        final_labels = defaultdict(set)
        final_ids = defaultdict(set)
        for row in country_scores.itertuples():
            if len(final_labels[row.country]) <= num_candidates and row.stem not in used_stems and row.stem not in BLACK_LIST:
                # use a tuple so the entry is hashable and can be stored in the set
                final_labels[row.country].add((row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi))
                final_ids[row.country].add(int(row.label_id))
                used_stems.add(row.stem)
    else:
        final_labels = {}
        final_ids = {}
        for row in country_scores.itertuples():
            if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
                final_ids[row.country] = row.label_id
                used_stems.add(row.stem)
    return final_labels, final_ids
def get_top_labels(country_scores):
    """Output: Dictionary --> key = country, value = list of top labels"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    country_scores_pmi = country_scores.sort_values(by="pmi", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    top_labels_pmi = [[] for x in range(country_scores_pmi['num_countries'][0])]
    used_stems = set()
    used_stems_pmi = set()

    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if len(top_labels[row.country]) < 40:
                top_labels[row.country].extend([row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi, row.country])
                used_stems.add(row.stem)

    for row in country_scores_pmi.itertuples():
        if row.stem not in used_stems_pmi:
            if len(top_labels_pmi[row.country]) < 40:
                top_labels_pmi[row.country].extend([row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi, row.country])
                used_stems_pmi.add(row.stem)

    return top_labels, top_labels_pmi
def preprocess_documents(documents):
    # preprocess each doc
    documents = [preprocess_doc(doc) for doc in documents]
    # stem the documents
    stemmer = PorterStemmer()
    documents = stemmer.stem_documents(documents)
    # split all the documents into lists of tokens
    documents = [doc.split() for doc in documents]
    return documents
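# --- Usage sketch (added illustration; not from the original source). ---
# preprocess_documents takes raw strings and returns stemmed token lists; the
# exact output depends on what the preprocess_doc helper (defined elsewhere in
# this codebase) strips out.
#
# docs = ["The cats were running quickly.", "A cat ran past the runner."]
# print(preprocess_documents(docs))
# # roughly [['cat', 'were', 'run', 'quickli'], ['cat', 'ran', 'past', 'runner']],
# # modulo whatever preprocess_doc removes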
def main():
    ############################## Setup Code #####################################
    global document_index
    path = "./myroot"
    file3 = open("cmptext.txt", "w+")
    number_of_documents = recursive_read(path, file3)
    file3.close()
    print 'All files read'

    file3 = open("cmptext.txt", "r")
    preprocess(file3, number_of_documents)
    file3.close()
    print 'All files processed'

    print 'Word2Vec begins'
    model = get_word2vec(number_of_documents)  # includes trigrams
    model.save('vocab.txt')
    print 'Word2Vec done'

    vocabulary = model.wv.vocab.keys()
    inverted_index = get_inverted_index(vocabulary)
    for item in inverted_index.keys():
        if not inverted_index[item]:
            del inverted_index[item]
    with open("inverted-index.txt", "wb") as fp:
        pickle.dump(inverted_index, fp)

    get_tfidf_vectors(inverted_index, number_of_documents)
    get_norms()

    doc_num = 0
    file1 = open("cmptext.txt", "r")
    stemmer = PorterStemmer()
    for document in file1:
        spreprocessed = []
        doc_num += 1
        for line in document.split('. '):
            temp1 = gensim.utils.simple_preprocess(line, max_len=20)
            temp2 = []
            for word in temp1:
                if word not in stop_words:
                    temp2.append(word)
            spreprocessed.append(stemmer.stem_documents(temp2))
        with open("spreprocessed" + str(doc_num) + ".txt", "w+") as fp:
            pickle.dump(spreprocessed, fp)
        del spreprocessed[:]
    file1.close()

    with open("document-index.txt", "wb") as fp:
        pickle.dump(document_index, fp)
def assign_country_label_ids(country_scores, label_score):
    """Output: Dictionary --> key = country, value = label"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()
    final_labels = {}
    final_ids = {}
    for row in country_scores.itertuples():
        if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
            final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
            final_ids[row.country] = row.label_id
            used_stems.add(row.stem)
    return final_labels, final_ids
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "wb")
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        preprocess_list2 = []
        for word in preprocess_list1:
            if word not in stop_words:
                preprocess_list2.append(word)
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word.encode('utf-8')))
            fp2.write(' ')
        fp2.write('\n')
    fp1.close()
    fp2.close()
def get_top_labels(country_scores, label_score, num_candidates=5):
    """Output: list of [country, label_id, label, tfidf, pmi] rows, at most num_candidates per country."""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    num_labels_per_country = defaultdict(int)
    top_labels = []
    used_stems = set()
    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if num_labels_per_country[row.country] < num_candidates:
                top_labels.append([row.country, row.label_id, row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi])
                used_stems.add(row.stem)
                num_labels_per_country[row.country] += 1
    return top_labels
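# --- Usage sketch (added illustration; not from the original source). ---
# Builds a tiny, hypothetical country_scores frame with the columns the function
# reads (country, label, label_id, tfidf, pmi); assumes the PorterStemmer and
# defaultdict imports used above are in scope.
import pandas as pd

toy_scores = pd.DataFrame({
    'country': [0, 0, 0],
    'label': ['voting', 'vote', 'election'],
    'label_id': [11, 12, 13],
    'tfidf': [0.9, 0.8, 0.7],
    'pmi': [0.5, 0.4, 0.6],
})
# 'vote' is skipped because it shares the Porter stem 'vote' with the
# higher-scoring 'voting', so two rows come back: 'voting' and 'election'.
print(get_top_labels(toy_scores, 'tfidf', num_candidates=2))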
def get_top_labels(country_scores):
    """Output: Dictionary --> key = country, value = list of top labels"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    used_stems = set()
    country_scores = country_scores.fillna(0)
    for row in country_scores.itertuples():
        # skip stems already used and the occasional integer stem
        if row.stem not in used_stems and not isinstance(row.stem, int):
            if len(top_labels[row.country]) < 90:
                top_labels[row.country].extend([str(row.label).lower().replace(' ', '_').strip(), float(row.tfidf), row.country])
                used_stems.add(row.stem)
    return top_labels
y = le.transform(train_data["Category"])
titled_train_data = add_titles(train_data['Content'], train_data['Title'])

# Removing stopwords:
new_train_data = []
for doc in titled_train_data:
    doc_wordlist = doc.split()
    new_doc_wordlist = [word for word in doc_wordlist if word not in custom_stopwords]
    new_doc = ' '.join(new_doc_wordlist)
    new_train_data.append(new_doc)

p = PorterStemmer()
train_docs = p.stem_documents(new_train_data)
print "Stemmed data."

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_docs)
print "Vectorized data"

svd_model5 = TruncatedSVD(n_components=5)  # random_state=42
svdX5 = svd_model5.fit_transform(X)
svd_model50 = TruncatedSVD(n_components=50)  # random_state=13
svdX50 = svd_model50.fit_transform(X)
svd_model200 = TruncatedSVD(n_components=200)  # random_state=13
svdX200 = svd_model200.fit_transform(X)
print "SVD'd data"

# Cross Validation:
for line in tokens:
    documents[0] += line

with open(target_document_path) as f:
    tokens = sent_tokenize(f.read())
for line in tokens:
    documents[1] += line

# bag of words
texts = [[text for text in simple_preprocess(doc, deacc=True)] for doc in documents]

# stemming
p = PorterStemmer()
for k in range(len(texts)):
    texts[k] = p.stem_documents(texts[k])

# Reconvert documents to collections of words/bigrams
# texts_bigrams = [[text for text in bigram_phraser[simple_preprocess(doc, deacc=True)]] for doc in documents]

# build N-grams
texts_bigrams = [[]] * 2
for k in range(len(texts)):
    texts_bigrams[k] = [""] * (len(texts[k]) - 1)
    for kk in range(len(texts[k])):
        if kk < len(texts[k]) - 1:
            texts_bigrams[k][kk] = texts[k][kk] + "_" + texts[k][kk + 1]

# remove most frequent words / stop words
for k in range(len(texts)):
    word_counter = {}
neg_list = pd.read_csv('./model_inputs/lm_word_lists/lm_negative.csv', header=None)[0].str.lower().to_list()
pos_neg = pos_list + neg_list

# tokenize and remove punctuation
calls['text'] = calls['text'].apply(lambda x: [w for w in tokenize(x, deacc=True)])

# remove stopwords and uppercase words
calls['text'] = calls['text'].apply(lambda x: [remove_stopwords(w) for w in x])
calls['text'] = calls['text'].apply(lambda x: [w for w in x if (2 < len(w) < 15) and (w.islower())])

# stemming (porter)
p = PorterStemmer()
calls['text'] = calls['text'].apply(lambda x: p.stem_documents(x))

# create dictionary object
dictionary = corpora.Dictionary(calls['text'])

# aggregate collection and document frequencies per token
agg_word_freq = {dictionary[k]: v for k, v in dictionary.cfs.items()}
agg_word_freq = pd.DataFrame({
    'token': agg_word_freq.keys(),
    'cf': agg_word_freq.values()
})
agg_doc_freq = {dictionary[k]: v for k, v in dictionary.dfs.items()}
agg_doc_freq = pd.DataFrame({
    'token': agg_doc_freq.keys(),
    'df': agg_doc_freq.values()
})
def live(model, input_query, vocabulary, length_preprocessed, inverted_index,
         document_dictionary, norms, nod, list_of_document_tfidf_dicts, user_id):
    ################################ Live Code ########################################
    ERROR_MESSAGE = ""
    ANSWER = {
        "error": None,
        "main_ans": None,
        "Ans_1": None,
        "Ans_2": None,
        "Ans_3": None,
        "Ans_4": None,
        "Ans_5": None,
        "Ans_6": None,
        "Ans_7": None,
        "Ans_8": None,
        "Ans_9": None,
        "Ans_10": None
    }
    stemmer = PorterStemmer()
    preprocessed_query = gensim.utils.simple_preprocess(input_query, max_len=20)
    filtered_sentence = []
    for word in preprocessed_query:
        if word not in stop_words:
            filtered_sentence.append(word)
    preprocessed_query = filtered_sentence
    preprocessed_query = stemmer.stem_documents(preprocessed_query)

    expanded_query = set(get_expanded_query(preprocessed_query, model, vocabulary))
    eq_vector = get_eq_tfidf_vector(inverted_index, expanded_query, preprocessed_query, length_preprocessed)
    relevant_docs = get_relevantdocs(expanded_query, inverted_index)

    try:
        tfidf_scores = get_scores(relevant_docs, eq_vector, norms, nod, list_of_document_tfidf_dicts)
        if not tfidf_scores:
            ANSWER['error'] = 'Match Not Found.'
            return ANSWER
    except ZeroDivisionError:
        ANSWER['error'] = 'Please be more specific.'
        return ANSWER

    _sae = torch.load('my_sae.pt')
    fp = open('document-index.txt', 'rb')
    doc_index = pickle.load(fp)
    fp.close()
    nb_documents = len(doc_index)
    user_document_array = np.zeros(nb_documents)
    doc_ids = doc_index.values()
    rows = json.load(urllib.urlopen("http://127.0.0.1:8000/clicks/" + str(user_id) + '/'))['clicks']
    for row in rows:
        user_document_array[doc_ids.index(str(row[0]))] = row[1]
    user_document_array = torch.FloatTensor(user_document_array)
    reco_sys_scores = _sae.forward(Variable(user_document_array).unsqueeze(0))
    db.close()

    scores = dict()
    factor = 0.01
    # print reco_sys_scores
    # print reco_sys_scores.data[0]
    # print (reco_sys_scores.numpy())
    for doc_number in tfidf_scores.keys():
        scores[doc_number] = (0.9 * tfidf_scores[doc_number] +
                              0.1 * factor * reco_sys_scores.data[0].data[doc_number]).item()
    print scores
    print tfidf_scores
    heap_docs = [(-value, key) for key, value in scores.items()]
    largest_docs = heapq.nsmallest(100, heap_docs)
    largest_docs = [(key, -value) for value, key in largest_docs]
    print largest_docs

    ############################### End of document ranking ######################################
    preprocessed_tuple = []
    for x in largest_docs:
        fp = open('spreprocessed' + str(x[0]) + '.txt', 'rb')
        list_temp = pickle.load(fp)
        preprocessed_tuple.extend([(x[0], y) for y in list_temp])
        fp.close()

    vocabulary2 = list(expanded_query)
    inverted_index2 = get_inverted_index_query_terms(preprocessed_tuple, vocabulary2)
    relevant_sent = get_relevantdocs(expanded_query, inverted_index2)
    scores_bm25 = get_bm25(relevant_sent, inverted_index2, expanded_query, preprocessed_tuple)
    heap_sentences = [(-value, key) for key, value in scores_bm25.items()]
    largest_sentences = heapq.nsmallest(100, heap_sentences)
    largest_sentences = [(key, -value) for value, key in largest_sentences]

    sentenced_docs = set()
    for sentence in [x[0] for x in largest_sentences]:
        sentenced_docs.add(preprocessed_tuple[sentence - 1][0])
    sentenced_docs_copy = [x for x in sentenced_docs]

    index_ans = get_index_ans(preprocessed_tuple, sentence_number=largest_sentences[0][0])
    fp = open('cmptext.txt', 'rb')
    doc = 1
    for line in fp:
        if doc == preprocessed_tuple[largest_sentences[0][0] - 1][0]:
            # ANSWER += 'MAIN ANSWER : ' + str(line.split('. ')[index_ans]) + '\n'
            ANSWER['main_ans'] = str(line.split('. ')[index_ans])
        doc += 1
    fp.close()

    link = open("links", "r")
    links = pickle.load(link)
    doc_num_ans = 0
    ans_count = 0
    for sentence in [x[0] for x in largest_sentences]:
        doc_number_of_sentence = preprocessed_tuple[sentence - 1][0]
        if doc_number_of_sentence in sentenced_docs:
            ans_count += 1
            # ANSWER += 'Document ' + str(doc_number_of_sentence) + '\n'
            doc_num_ans += 1
            index_ans = get_index_ans(preprocessed_tuple, sentence_number=sentence)
            fp = open('cmptext.txt', 'rb')
            doc = 1
            for line in fp:
                if doc == doc_number_of_sentence:
                    # ANSWER += 'SENTENCE : ' + str(line.split('. ')[index_ans]) + '\n'
                    temp = list()
                    temp.append(document_dictionary[doc_number_of_sentence])
                    temp.append(str(line.split('. ')[index_ans]))
                    temp.append(str(links[doc_num_ans]))
                    ANSWER['Ans_' + str(doc_num_ans)] = temp
                    # ANSWER['Ans_' + str(doc_num_ans)] = list(document_dictionary[doc_number_of_sentence], str(line.split('. ')[index_ans])).append(str(links[index_ans]))
                    # print 'Ans_' + str(doc_num_ans) + "\t" + str(list(document_dictionary[doc_number_of_sentence], str(line.split('. ')[index_ans])).append(str(links[index_ans])))
                    print temp
                doc += 1
            fp.close()
            sentenced_docs.remove(doc_number_of_sentence)
        else:
            continue
    ANSWER['count'] = ans_count
    # print ANSWER
    return ANSWER
calls = calls_raw.loc[:, ['ticker_name', 'text']].groupby(['ticker_name'])['text'].apply(lambda x: ''.join(x)).reset_index()

# tokenize and remove punctuation
calls['text'] = calls['text'].apply(lambda x: simple_preprocess(x, min_len=2, max_len=15, deacc=True))

# Build the bigram model
bigram = models.Phrases(calls['text'], min_count=5, threshold=100)  # higher threshold, fewer phrases
bigram_model = models.phrases.Phraser(bigram)
calls['text'] = calls['text'].apply(lambda x: bigram_model[x])

# remove stopwords, filtering per token (the rows are already lowercased token
# lists at this point, so calling .lower() on them would fail)
calls['text'] = calls['text'].apply(lambda x: [w for w in x if remove_stopwords(w)])

# stemming (porter), applied token by token so the rows stay token lists
p = PorterStemmer()
calls['text'] = calls['text'].apply(lambda x: p.stem_documents(x))

# create dictionary object
dictionary = corpora.Dictionary(calls['text'])
# filter extremes
dictionary.filter_extremes(no_below=2, no_above=0.5)
# bag-of-words transformation
corpus = [dictionary.doc2bow(text) for text in calls['text']]
# tfidf transformation
tfidf = models.TfidfModel(corpus)  # fit model
corpus_tfidf = tfidf[corpus]  # apply model
# -------------------------------------------------------------------------------
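# --- Inspection sketch (added illustration; not from the original source). ---
# One way to sanity-check the tf-idf weights built above: list the five
# highest-weighted tokens per transcript, reusing dictionary / corpus_tfidf /
# calls from the pipeline.
for ticker, doc in zip(calls['ticker_name'], corpus_tfidf):
    top_terms = sorted(doc, key=lambda pair: pair[1], reverse=True)[:5]
    print(ticker, [(dictionary[term_id], round(weight, 3)) for term_id, weight in top_terms])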
def compare(self):
    with open(self.source_document_path, encoding="ISO-8859-1") as f:
        tokens = sent_tokenize(f.read())
    for line in tokens:
        self.documents[0] += line
    with open(self.target_document_path, encoding="ISO-8859-1") as f:
        tokens = sent_tokenize(f.read())
    for line in tokens:
        self.documents[1] += line

    # bag of words
    texts = [[text for text in simple_preprocess(doc, deacc=True)] for doc in self.documents]

    # stemming
    p = PorterStemmer()
    for k in range(len(texts)):
        texts[k] = p.stem_documents(texts[k])

    # Reconvert documents to collections of words/bigrams
    bigram_phraser = Phrases(texts, min_count=1)
    texts_bigrams = [[text for text in bigram_phraser[simple_preprocess(doc, deacc=True)]] for doc in self.documents]

    # build N-grams
    texts_bigrams = [[]] * 2
    for k in range(len(texts)):
        texts_bigrams[k] = [""] * (len(texts[k]) - 1)
        for kk in range(len(texts[k])):
            if kk < len(texts[k]) - 1:
                texts_bigrams[k][kk] = texts[k][kk] + "_" + texts[k][kk + 1]

    # remove most frequent words / stop words
    for k in range(len(texts)):
        word_counter = {}
        for word in texts_bigrams[k]:
            if word in word_counter:
                word_counter[word] += 1
            else:
                word_counter[word] = 1
        popular_words = sorted(word_counter, key=word_counter.get, reverse=True)
        top = popular_words[:3]
        for kk in range(len(top)):
            texts_bigrams[k][:] = (value for value in texts_bigrams[k] if value != top[kk])

    # Create dictionary
    dictionary = corpora.Dictionary(texts_bigrams)
    # Create corpus
    corpus = [dictionary.doc2bow(docString) for docString in texts_bigrams]
    model = gensim.models.TfidfModel(corpus)  # fit model
    vector = model[corpus[0]]

    # cosine similarity
    index = Similarity(corpus=corpus, num_features=len(dictionary), output_prefix='on_disk_output')
    for similarities in index:
        similar_docs = list(enumerate(similarities))
        break
    return similar_docs[1][1]
    for word in temp_doc.split():
        word = unicode(word)
        doc.append(word)
    tagged = nltk.pos_tag(doc)
    temp_doc = u""
    for word in tagged:
        tmp = unicode(word[0])
        temp_doc = temp_doc + lemmatizer.lemmatize(tmp, get_wordnet_pos(word[1])) + " "
    '''  # closes a triple-quoted (commented-out) lemmatization block opened before this excerpt
    train_data["Content"].replace(to_replace=train_data["Content"][i], value=temp_doc, inplace=True)

train_data["Content"] = pstem.stem_documents(train_data["Content"])

for i in range(0, len(test_data["Content"])):
    temp_doc = u""
    for word in test_data["Content"][i].split():
        word = unicode(word)
        word = unicode(word.lower())
        word = unicode(strip_punctuation(word))
        # word = check_money(word)
        if unicode(word) not in stopwords and unicode(word) != u"–" and unicode(word) != u"…":
            temp_doc = temp_doc + word + " "
    '''doc = []
    for word in temp_doc.split():
        word = unicode(word)
        doc.append(word)