def __init__(self, core_term_path, pretrain=True, update=True, fasttext_corpus_path=None):
    """Load the core terms, build the embedding wrapper and compute IDF weights.

    Args:
        core_term_path: Path to a text file holding one core term per line;
            blank lines are ignored and every term is Porter-stemmed.
        pretrain: Forwarded unchanged to ``WordEmbeddings``.
        update: Forwarded unchanged to ``WordEmbeddings``.
        fasttext_corpus_path: Path to the corpus text file. It is forwarded to
            ``WordEmbeddings`` and also opened directly below, so the ``None``
            default cannot actually be used with this constructor.
    """
    stemmer = PorterStemmer()
    with open(core_term_path, 'r') as term_file:
        stripped_terms = (raw_term.strip() for raw_term in term_file.readlines())
        unique_stems = {stemmer.stem(term) for term in stripped_terms if len(term) > 0}
    # De-duplicated stems in a deterministic (sorted) order.
    self.core_terms = sorted(unique_stems)

    self.word_embeddings = WordEmbeddings(pretrain, update, fasttext_corpus_path)

    # Core-term ids start at 2 (ids 0 and 1 are not assigned here).
    self.core_term_dict = {
        term: term_id
        for term_id, term in enumerate(self.core_terms, start=2)
    }

    with open(fasttext_corpus_path, 'r') as corpus_file:
        corpus_lines = corpus_file.readlines()

    # Only even-indexed, non-blank lines of the corpus file become documents.
    documents = []
    for line_no, raw_line in enumerate(corpus_lines):
        text = raw_line.strip()
        if line_no % 2 == 0 and len(text) > 0:
            documents.append(text.split())

    dictionary = corpora.Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]
    tfidf_model = models.TfidfModel(bow_corpus)
    # Map each token string (rather than its integer id) to its IDF value.
    self.idfs = {
        dictionary[token_id]: idf
        for token_id, idf in tfidf_model.idfs.items()
    }
def get_text_similarity(
        url_df, dictionary, tfidf, sims, inds, sim_type,
        stopwords_path='/Users/thyde/Documents/cloned_proj_moat/project_moat/stopwords.txt'):
    '''
    calculate the tfidf similarity between every row's text and the origin text

    :param url_df: df of keywords for each url; must provide the columns
        ``url``, ``origin`` and ``sim_type``. NOTE: the ``sim_type`` column is
        modified in place (missing values are replaced by ``''``).
    :param dictionary: tfidf dictionary (provides ``doc2bow``)
    :param tfidf: tfidf model, indexed with a bag-of-words vector
    :param sims: tfidf similarity matrix, indexed with a tfidf vector
    :param inds: mapping from text to its row position in ``sims``
    :param sim_type: name of the text column to compare (title or body)
    :param stopwords_path: whitespace-separated stopword file; defaults to the
        location that used to be hard-coded so existing callers are unaffected
    :return: dataframe with columns ``url`` and ``title_similarity``
        (the column is called ``title_similarity`` regardless of ``sim_type``)
    '''
    url_df[sim_type] = url_df[sim_type].fillna('')
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    # load all stopwords; a set keeps the membership test below O(1)
    with open(stopwords_path) as f:
        stopwords = set(f.read().split())

    # parse text into list of word stems
    texts = [[
        global_stemmer.stem(word)
        for word in text.translate(translator).lower().split()
        if word not in stopwords
    ] for text in url_df[sim_type].values]

    # calculate similarity score to target text
    sim_scores = []
    for text in texts:
        vec_bow = dictionary.doc2bow(text)
        vec_tfidf = tfidf[vec_bow]
        res = sims[vec_tfidf]
        try:
            # The reference text is the one belonging to the row whose url is
            # its own origin; its position in the similarity index is ``inds``.
            origin_text = url_df[
                url_df['origin'] == url_df['url']][sim_type].values[0]
            sim_scores.append(res[inds[origin_text]])
        except KeyError:
            # Origin text unknown to the index: record a missing score.
            # ``np.nan`` is used because the ``np.NaN`` alias no longer exists
            # in NumPy 2.0.
            sim_scores.append(np.nan)
    return pd.DataFrame([url_df['url'].values, sim_scores],
                        index=['url', 'title_similarity']).T
def title_sim_construction(url_list, list_name, collection,
                           stopwords_path='/Users/thyde/Downloads/stopwords.txt'):
    '''
    build a tfidf model over the titles of the given urls and pickle it

    :param url_list: list of urls to get title similarity for
    :param list_name: list name, used as prefix of the saved pkl files
    :param collection: mongo collection to query (documents need ``url`` and
        ``title`` fields)
    :param stopwords_path: whitespace-separated stopword file; defaults to the
        location that used to be hard-coded so existing callers are unaffected
    :return: nothing, just saves ``<list_name>_title_{dictionary,tfidf,corpus,ind}.pkl``
        in the current working directory
    '''
    # classic stopwords textfile; a set keeps the membership test below O(1)
    with open(stopwords_path) as f:
        stopwords = set(f.read().split())
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    docs = list(collection.find({'url': {'$in': url_list}}))

    # pull out title text and create index dictionary
    # (if two documents share a title, the later position wins)
    title_text = [doc['title'] for doc in docs]
    title_ind = {title: i for i, title in enumerate(title_text)}

    # parse words in titles
    texts = [[
        global_stemmer.stem(word)
        for word in title.translate(translator).lower().split()
        if word not in stopwords
    ] for title in title_text]

    # create gensim corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # save files; each handle is closed (and flushed) as soon as it is written
    artifacts = (
        ('dictionary', dictionary),
        ('tfidf', tfidf),
        ('corpus', corpus_tfidf),
        ('ind', title_ind),
    )
    for suffix, artifact in artifacts:
        with open('{}_title_{}.pkl'.format(list_name, suffix), 'wb') as out_file:
            pkl.dump(artifact, out_file)
def tokenize():
    """Stem ``wiki_text_raw.txt`` line by line into ``wiki_text_tokenized.txt``.

    Every input line is reduced to its runs of ASCII letters. Lines with more
    than 10 such words are Porter-stemmed and appended to the output file as
    one line, each stem followed by a single space. Shorter lines are dropped.
    """
    # One stemmer is enough; building a new one per word was wasted work.
    stemmer = PorterStemmer()
    # Context managers guarantee both files are flushed and closed, even if
    # stemming raises part-way through.
    with open("wiki_text_raw.txt", 'r') as fd, \
            open("wiki_text_tokenized.txt", 'a') as fd_tk:
        for line in fd:
            wordlist = re.findall(r"[a-zA-Z]+", line)
            if len(wordlist) > 10:
                # Stem each word in word list
                stemmed_line = ''.join(
                    stemmer.stem(word) + ' ' for word in wordlist)
                fd_tk.write(stemmed_line + '\n')
def preprocess_text(self, text):
    """Turn one raw document into a sequence of preprocessed tokens.

    The text is passed through every character filter in order, tokenized,
    passed through every token filter in order and, when stemming is enabled,
    Porter-stemmed.

    Args:
        text (str): document text read from plain-text file.

    Returns:
        iterable of str: tokens produced from `text` as a result of preprocessing.

    """
    filtered_text = text
    for apply_character_filter in self.character_filters:
        filtered_text = apply_character_filter(filtered_text)

    token_stream = self.tokenizer(filtered_text)
    for apply_token_filter in self.token_filters:
        token_stream = apply_token_filter(token_stream)

    # Without stemming the output of the last token filter is returned as-is.
    if not self.stem:
        return token_stream

    stemmer = PorterStemmer()
    return [stemmer.stem(tok) for tok in token_stream]
def similarity(a, b):
    """Return the word2vec similarity between two space-separated texts.

    Both texts are split on single spaces and Porter-stemmed. Stems missing
    from the model vocabulary are discarded, and the remaining two word lists
    are compared with the model's ``n_similarity``.

    Args:
        a: First text.
        b: Second text.

    Returns:
        Whatever ``model.n_similarity`` returns for the two filtered lists.

    Note:
        The model is loaded from ``'word2vec_model.txt'`` on every call. If
        either text has no in-vocabulary stem, an empty list is handed to
        ``n_similarity`` unchanged, exactly as before.
    """
    # A single stemmer instance serves every word.
    stemmer = PorterStemmer()
    a_stems = [stemmer.stem(word) for word in a.split(' ')]
    b_stems = [stemmer.stem(word) for word in b.split(' ')]

    model = word2vec.Word2Vec.load('word2vec_model.txt')
    vocab = model.vocab

    # Keep in-vocabulary stems in their original order. This yields the same
    # lists as deleting each unknown word from a copy, without the repeated
    # linear scans.
    a_known = [word for word in a_stems if word in vocab]
    b_known = [word for word in b_stems if word in vocab]
    return model.n_similarity(a_known, b_known)
def preprocess(file, out):
    """Normalise every token of ``file`` and write the result to ``out``.

    Each input line is split on whitespace and every word is passed through
    ``processToken``. Words for which it returns ``""`` are dropped. The
    surviving tokens are written space-terminated on one output line, after
    ``[target|label]`` markup has been collapsed to ``target``.

    Args:
        file: Path of the text file to read.
        out: Path of the file to (over)write.
    """
    stemmer = PorterStemmer()
    # Raw string: the pattern is full of backslashes that are not valid string
    # escapes. Compiled once instead of once per line.
    link_pattern = re.compile(r'\[([^ \[\]\|]+)\|([^\[\]\|]+)\]')
    # Context managers close both files even if processing fails midway.
    with open(file) as f, open(out, 'w') as outp:
        for line in f:
            processed = (processToken(word, stemmer) for word in line.split())
            # Every kept token is followed by one space, trailing one included.
            text = "".join(w + " " for w in processed if w != "")
            outp.write(link_pattern.sub(r'\1', text) + "\n")
class TextPreprocessor:
    """Tokenises answer files into lower-cased word lists."""

    # Marker token that is removed from every sentence.
    _STOP_MARKER = '<stop>'

    def __init__(self):
        # English stop-word set; kept for callers, not applied by
        # remove_stopwords() at the moment.
        self.stop = set(stopwords.words('english'))
        self.global_stemmer = PorterStemmer()

    # Stem the word. It returns base form of a word
    def stem(self, word):
        """Return the Porter stem of ``word``."""
        return self.global_stemmer.stem(word)

    # Removes the stop marker and sentence punctuation from a sentence
    def remove_stopwords(self, sentence):
        """Lower-case and split ``sentence``, dropping ``<stop>`` markers.

        A token ending in ``.`` has all of its dots removed; otherwise a token
        ending in ``,`` has all of its commas removed. Despite the name,
        ordinary stop words are kept.

        Args:
            sentence: Raw text of one line.

        Returns:
            list of str: the cleaned tokens in their original order.
        """
        tokens = []
        for token in sentence.lower().split():
            # Compare for equality: the former substring test against '<stop>'
            # also discarded real words such as "to", "stop", "top" or "s".
            if token == self._STOP_MARKER:
                continue
            if token.endswith('.'):
                token = token.replace(".", "")
            elif token.endswith(','):
                token = token.replace(",", "")
            tokens.append(token)
        return tokens

    # Parse the file and generates training set
    def parse_file(self, filename, is_student_answer):
        """Read ``filename`` and tokenise it line by line.

        Args:
            filename: Path of the text file to read.
            is_student_answer: When true, return one token list per line
                (blank lines yield empty lists). Otherwise return a dict
                keyed by the first token of each line, mapping to the
                remaining tokens.

        Returns:
            list of list of str, or dict of str to list of str.
        """
        with open(filename) as f:
            token_lines = [self.remove_stopwords(line) for line in f]

        if is_student_answer:
            return token_lines

        file_dict = {}
        for line_tokens in token_lines:
            # A blank line has no key token; skip it instead of failing on it.
            if not line_tokens:
                continue
            file_dict[line_tokens[0]] = line_tokens[1:]
        return file_dict
class Tokenizer:
    """Parses paired natural-language / code files into aligned token lists.

    Both files hold one record per line in the form ``<id>\\t<content>``.
    """

    def __init__(self):
        self.p = PorterStemmer()

    def parse(self, nl_path, code_path):
        """Tokenise both files and align their records by id.

        Stop words are removed from the natural-language file only; both files
        are stemmed.

        Args:
            nl_path: Path of the natural-language file.
            code_path: Path of the code file.

        Returns:
            list of ``(nl_tokens, code_tokens, id)`` tuples in ascending
            numeric id order.
        """
        return self.__combine(self.__parse_file(nl_path, True, True),
                              self.__parse_file(code_path, False, True))

    @staticmethod
    def __combine(nl_dict, code_dict):
        """Pair the entries of both dicts, ordered by the numeric value of the id.

        The original id string is used for the look-ups, so ids that do not
        survive an ``int``/``str`` round trip (``"01"``, ``" 7"``) no longer
        raise ``KeyError``. Every id in ``nl_dict`` must exist in ``code_dict``.
        """
        return [(nl_dict[key], code_dict[key], key)
                for key in sorted(nl_dict, key=int)]

    def __parse_file(self, file_path, rm_stopwords=False, stem=False):
        """Read ``<id>\\t<content>`` lines into a dict of id to token list.

        Blank lines are skipped. A non-blank line without a tab still raises
        ``ValueError``.
        """
        ret = {}
        with open(file_path, 'r') as f:
            for line in f:
                # Lines read from a file always keep their newline, so test for
                # actual content rather than for a non-zero length.
                if not line.strip():
                    continue
                tab_pos = line.index('\t')
                idx = line[:tab_pos]
                ret[idx] = self.__get_tokens(line[tab_pos + 1:], rm_stopwords, stem)
        return ret

    def __get_tokens(self, content, rm_stopwords=False, stem=False):
        """Split ``content`` into lower-cased word parts.

        The text is cut at every run of non-letters and each piece is then
        split at camelCase boundaries. Optionally stop words are removed and
        the remaining parts stemmed.
        """
        tokens = []
        for chunk in re.split('[^A-Za-z]+', content):
            if len(chunk) == 0:
                continue
            for word in self.__camel_case_split(chunk):
                if rm_stopwords:
                    # Expected to yield '' for a stop word; dropped just below.
                    word = remove_stopwords(word)
                if len(word) > 0:
                    if stem:
                        word = self.p.stem(word)
                    tokens.append(word)
        return tokens

    @staticmethod
    def __camel_case_split(word):
        """Split ``word`` at camelCase boundaries and lower-case the parts."""
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        return [m.group(0).lower() for m in matches]
def __init__(self):
    """Prepare the English stop-word set and a reusable Porter stemmer."""
    english_stopwords = stopwords.words('english')
    self.stop = set(english_stopwords)
    self.global_stemmer = PorterStemmer()
from gensim import corpora, models, similarities
from gensim.parsing import PorterStemmer
from numpy import array, sqrt

#Code for document preprocessing

#Build set of stopwords
# The file 'stopwords' (one word per line, looked up in the current working
# directory) is read at import time. Only a real trailing newline is removed,
# so the last word stays intact when the file does not end with a newline.
with open('stopwords') as f:
    stoplist = {line[:-1] if line.endswith('\n') else line for line in f}

#Initialize stemmer
stemmer = PorterStemmer()


def to_vector_model(corpusfile, tfidf=False):
    """
    Convert a corpus to vector space model for processing
    If tfidf = True, returns a tf-idf based model
    Returns the model and dictionary of words.
    """
    f = open(corpusfile, "r")
    documents = f.readlines()
    f.close()

    #Remove stopwords and stem
from trained_model import google_model, get_word_vector
from nltk.corpus import opinion_lexicon
from sklearn.model_selection import cross_val_score
from os import listdir
from os.path import isfile, join
# NOTE(review): `sklearn.cross_validation` and `string.maketrans/translate`
# look like legacy (older scikit-learn / Python 2) names — confirm the target
# runtime still provides them.
import sklearn.cross_validation
import gensim, logging
from string import maketrans, translate
import string
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score, confusion_matrix
from termcolor import colored
from colorama import init
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.parsing import PorterStemmer

# One module-wide Porter stemmer instance.
global_stemmer = PorterStemmer()

# Configure logging at INFO level with timestamp, level name and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Matches every run of characters that is not an ASCII letter, digit or space.
# NOTE(review): `re` is not imported in the visible lines — presumably it is
# imported earlier in the file; confirm.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

# Module-level counters, all initialised to zero. The code that updates them
# is outside this excerpt.
countFor = 0
countAgainst = 0
countDefect = 0
countError = 0
countAllegation = 0
countAppreciation = 0
countCallForAction = 0
countYes = 0
countNo = 0

# Short alias for gensim's doc2vec LabeledSentence class.
LabeledSentence = gensim.models.doc2vec.LabeledSentence
def searchIndbFacebookSaved(search_value):
    """Search saved articles and return a dict of article id -> relevance weight.

    Python 2 code (print statements). The search runs in three stages:

    1. a case-insensitive title match on the raw query;
    2. query expansion with similar words from ``model3`` (sum of the term
       vectors, multi-word queries only) and from ``model`` / ``model2``
       (per term);
    3. for every resulting word, ``contains`` matches against title, keywords
       and summary, each contributing a weight to ``idDict``.

    :param search_value: query string typed by the user
    :return: ``idDict`` as built up through ``adding_weight_to_dict`` and the
        summary scoring below (presumably id -> accumulated weight; verify
        against ``adding_weight_to_dict``)
    """
    # NOTE(review): str.replace returns a new string and the result is
    # discarded here, so search_value is left unchanged by this loop —
    # confirm whether stop-word removal was intended.
    for x in "and or it is the a".split():
        search_value.replace(" " + x + " ", "")

    # Stage 1: title match, with the spaces of the query turned into wildcards.
    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") +
                                    "%"))  #("%" + search_value + "%"))#
    # Ids of the matches ordered by date, walked from the last row to the first.
    idList = [
        result.order_by(dbFacebookSaved.date)[count - 1].id
        for count in range(result.count(), 0, -1)
    ]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print ".ilike"
    print idDict

    stemmer = PorterStemmer()
    # From here on search_value is a list of words; search_valueRaw keeps the
    # words the user actually typed.
    search_value = search_value.split()
    search_valueRaw = list(search_value)

    # Stage 2a: for multi-word queries, add words close to the summed vector.
    if len(search_value) > 1:
        # Multiplying an existing vector by 0 gives the starting accumulator
        # (presumably a zero vector of the embedding size).
        sumVector = model3['car'] * 0
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print "similarList (sumVector)"
        print similarList
        """ for i in range(min(5,len(similarList))): if similarList[i][1] >= 0.7 and similarList[i][0] not in search_value: search_value.append(similarList[i][0]) print "append " + similarList[i][0] + " from fasttext(sum of vec)" """
        print "New search value after sumVec:"
        # At most the first five neighbours, kept when their score is >= 0.72
        # and the word is not already part of the query.
        search_value += [
            similarList[i][0] for i in range(min(5, len(similarList)))
            if similarList[i][1] >= 0.72
            and similarList[i][0] not in search_value
        ]
        print search_value

    # Stage 2b: per-term expansion. Words added here are also recorded in
    # search_valueR so they can be weighted lower in stage 3.
    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                # i == 0 is `model`, i == 1 is `model2`; each has its own
                # neighbour count, score threshold and log text.
                listLengh = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = " from gensim_word2vec for relating to " if i == 0 else " from fasttext(CBOW) for relating to "
                # NOTE(review): this loop reuses the name `i` of the enclosing
                # enumerate loop; the values derived from the outer `i` are
                # computed above, before it is overwritten.
                for i in range(min(listLengh, len(similarList))):
                    if similarList[i][1] >= scoreThreshold and similarList[i][
                            0] not in search_value:
                        search_value.append(similarList[i][0])
                        search_valueR.append(similarList[i][0])
                        print "append " + similarList[i][
                            0] + tempText + searchTerm
    """ if searchTerm.lower() in model.vocab: similarList = 
model.most_similar(searchTerm.lower()) for i in range(min(3,len(similarList))): if similarList[i][1] >= 0.5 and similarList[i][0] not in search_value: search_value.append(similarList[i][0]) search_valueR.append(similarList[i][0]) print "append " + similarList[i][0] + " from gensim_word2vec for relating to " + searchTerm if searchTerm.lower() in model2.vocab: similarList = model2.most_similar(searchTerm.lower()) for i in range(min(5,len(similarList))): if similarList[i][1] >= 0.55 and similarList[i][0] not in search_value: search_value.append(similarList[i][0]) search_valueR.append(similarList[i][0]) print "append " + similarList[i][0] + " from fasttext(CBOW) for relating to " + searchTerm """
    """ print "search_value before stemming:" print search_value stemmer = PorterStemmer() search_value = [stemmer.stem(word) for word in search_value] search_value = list(set(search_value)) search_valueR = [stemmer.stem(word) for word in search_valueR] search_valueR = list(set(search_valueR)) print "search_value bafter stemming:" """
    print search_value

    # Stage 3: score every word of the expanded query.
    for word in search_value:
        # A word is skipped when it differs from its own stem and that stem is
        # itself in the query (the stem is then searched on its own).
        if word == stemmer.stem(
                word) or not stemmer.stem(word) in search_value:
            result = dbFacebookSaved.query.filter(
                dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(
                dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(
                dbFacebookSaved.summary.contains(word))
            # Weight factor: 0.1 when preprocess_string() leaves nothing of
            # the word, 0.5 for words added by the per-term expansion,
            # 1 otherwise.
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5
            # Title hits count with factor 1.
            idList = [
                read_db_data_to_article(
                    result.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(result.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print ".title.contains(" + word + ")"
            print idDict
            # Keyword hits count with factor 0.5.
            idList = [
                read_db_data_to_article(
                    resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(resultKwd.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print ".keywords.contains(" + word + ")"
            print idDict
            # Summary hits: 0.2 * weight per matching text item, accumulated
            # only while cumsum is still <= 0.6.
            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if not resultSummary.order_by(
                        dbFacebookSaved.date)[count - 1].id in idList and len(
                            preprocess_string(word)) > 0:
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count - 1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim function that preprocesses a string, e.g. people -> peopl, Oranges -> orang
                    # NOTE(review): `word` is overwritten with its
                    # preprocessed form here, inside the `count` loop, so the
                    # later iterations and the prints below see the new value
                    # — confirm this is intended.
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                            if cumsum <= 0.6 and word in w:
                                idDict[article['id']] = idDict.get(
                                    article['id'], 0) + 0.2 * weight
                                cumsum = cumsum + 0.2 * weight
            print ".summary.contains(" + word + ")"
            #idDict = adding_weight_to_dict(idDict, idList, 0.2)
            print idDict
        else:
            print "ignore " + word + " for " + stemmer.stem(word)
    return idDict
from typing import List, Set, Union

from gensim.parsing import PorterStemmer
from gensim.utils import tokenize

from Util import utils_
from gitMine.VCClasses import Commit, Issue, PullRequest

# Module-wide Porter stemmer shared by preprocess_text().
ps = PorterStemmer()


def merge_commit_title_and_desc(commit: Commit) -> str:
    """Return the commit title and description as one string.

    When the title ends with ``...`` and the description starts with ``...``,
    both ellipses are removed before the two parts are joined; otherwise they
    are simply concatenated.
    """
    if commit.title[-3:] == commit.desc[:3] == '...':
        return commit.title[:-3] + commit.desc[3:]
    return commit.title + commit.desc


def preprocess_text(text: str, stopwords_: Set[str], min_len) -> List[str]:
    """Tokenize ``text`` and return the stemmed, lower-cased tokens.

    The tokens produced by gensim's ``tokenize`` (case preserved, accents
    removed) are passed through ``split_compound_toks``. A token is kept only
    if it is not in ``stopwords_`` and is longer than ``min_len`` characters.
    The stop-word test happens before lower-casing, so it is case-sensitive.
    """
    return [ps.stem(tok).lower() for tok in split_compound_toks(tokenize(text, lowercase=False, deacc=True, errors='strict'))
            if tok not in stopwords_ and len(tok) > min_len]


def split_compound_toks(toks: List[str]) -> List[str]:
    """Method to split CamelCase and snake_case by first normalizing everything to snake_case and splitting on '_'"""
    result = list()
    for tok in toks:
        # snake_case form of the token with every '_' replaced by a space.
        candidate = utils_.GitMineUtils.camel_to_snake(tok).translate({ord(c): ' ' for c in '_'})
        # A difference means the token was compound: add its parts.
        if not tok == candidate:
            result += [tok_ for tok_ in candidate.split(' ')]
        # The token itself, with underscores removed, is added as well.
        result.append(''.join(tok.split('_')))
data['color'] = data.apply( lambda row: VARIETY_MAP.get(row['variety'], (-1, 'N/A'))[1], axis=1) data['class'] = data.apply( lambda row: VARIETY_MAP.get(row['variety'], (-1, 'N/A'))[0], axis=1) #Filter only on selected varieties if filter: data = data.loc[data['color'] != 'N/A'] return data ########################################################################################################## word_stemmer = PorterStemmer() def trainWord2Vec(min_count=2, size=50, window=4): """ Returns a trained word2vec model """ #Load descriptions (i.e. word2vec training data) df = importData("data-raw/winemag-data_first150k.csv", removeSpecialChars=True, censor=False, filter=True, processDescriptions=True, processOptions={ 'removeContractions': False, 'removePunctuation': True,
def __init__(self):
    """Create the Porter stemmer and keep it on ``self.p``."""
    stemmer = PorterStemmer()
    self.p = stemmer