Example #1
 def __init__(self,
              core_term_path,
              pretrain=True,
              update=True,
              fasttext_corpus_path=None):
     p = PorterStemmer()
     with open(core_term_path, 'r') as f:
         self.core_terms = list(
             set([
                 p.stem(word.strip()) for word in f.readlines()
                 if len(word.strip()) > 0
             ]))
         self.core_terms.sort()
     self.word_embeddings = WordEmbeddings(pretrain, update,
                                           fasttext_corpus_path)
     self.core_term_dict = {}
     index = 2
     for core_term in self.core_terms:
         self.core_term_dict[core_term] = index
         index += 1
     with open(fasttext_corpus_path, 'r') as f:
         fasttext_corpus_content = f.readlines()
     documents = [
         line.strip().split()
         for idx, line in enumerate(fasttext_corpus_content)
         if idx % 2 == 0 and len(line.strip()) > 0
     ]
     dictionary = corpora.Dictionary(documents)
     corpus = [dictionary.doc2bow(doc) for doc in documents]
     tfidf_model = models.TfidfModel(corpus)
     self.idfs = {
         dictionary[kv[0]]: kv[1]
         for kv in tfidf_model.idfs.items()
     }
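A minimal, self-contained sketch of the IDF-extraction step used above, assuming only gensim; the toy documents and names below are illustrative and not part of the original class:

from gensim import corpora, models

# Toy token lists standing in for the fasttext_corpus_path contents.
documents = [["open", "file", "read", "line"],
             ["parse", "json", "file"],
             ["read", "json", "line"]]

dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
tfidf_model = models.TfidfModel(corpus)

# Map each token string to its IDF weight, mirroring self.idfs above.
idfs = {dictionary[term_id]: idf for term_id, idf in tfidf_model.idfs.items()}
print(idfs)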
Example #2
def get_text_similarity(url_df, dictionary, tfidf, sims, inds, sim_type):
    '''
    calculate the tf-idf title similarity between the origin url and each target url
    :param url_df: dataframe of keywords for each url
    :param dictionary: gensim dictionary used to build the tf-idf model
    :param tfidf: trained gensim TfidfModel
    :param sims: tf-idf similarity matrix
    :param inds: dict mapping title (or body) text to its row index in the similarity matrix
    :param sim_type: which column to compare, 'title' or 'body'
    :return: dataframe of urls and title similarity scores
    '''
    url_df[sim_type] = url_df[sim_type].fillna('')
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    # load all stopwords
    with open(
            '/Users/thyde/Documents/cloned_proj_moat/project_moat/stopwords.txt'
    ) as f:
        stopwords = f.read().split()

    # parse text into list of word stems
    texts = [[
        global_stemmer.stem(word)
        for word in text.translate(translator).lower().split()
        if word not in stopwords
    ] for text in url_df[sim_type].values]

    # calculate similarity score to target title
    sim_scores = []
    for text in texts:
        vec_bow = dictionary.doc2bow(text)
        vec_tfidf = tfidf[vec_bow]
        res = sims[vec_tfidf]
        try:
            sim_scores.append(res[inds[url_df[
                url_df['origin'] == url_df['url']][sim_type].values[0]]])
        except KeyError:
            sim_scores.append(np.NaN)

    return pd.DataFrame([url_df['url'].values, sim_scores],
                        index=['url', 'title_similarity']).T
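For context, a hedged sketch of how the dictionary, tfidf and sims objects passed into this function are typically built and queried with gensim; the toy titles are made up:

from gensim import corpora, models, similarities

titles = [["fed", "raises", "rates"],
          ["stocks", "fall", "on", "rate", "hike"],
          ["bond", "yields", "climb"]]

dictionary = corpora.Dictionary(titles)
corpus = [dictionary.doc2bow(t) for t in titles]
tfidf = models.TfidfModel(corpus)

# Dense cosine-similarity index over the tf-idf vectors of all titles.
sims = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

# Querying works like the loop above: bow -> tf-idf -> row of similarity scores.
query = dictionary.doc2bow(["rate", "hike"])
print(sims[tfidf[query]])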
Example #3
def title_sim_construction(url_list, list_name, collection):
    '''
    :param url_list: list of urls to get title similarity for
    :param list_name: list name for purposes of saving pkl file
    :param collection: mongo collection to query
    :return: nothing, just saves files
    '''
    #classic stopwords textfile, will be in repo
    with open('/Users/thyde/Downloads/stopwords.txt') as f:
        stopwords = f.read().split()

    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    #mc = MongoAtlasClient("mongodb://*****:*****@investopedia-shard-00-00-ibdgj.mongodb.net:27017,investopedia-shard-00-01-ibdgj.mongodb.net:27017,investopedia-shard-00-02-ibdgj.mongodb.net:27017/test?ssl=true&replicaSet=investopedia-shard-0&authSource=admin", "sSQXR9fVxNu2P0U5")
    #my_collection = mc['investopedia']['corpus']

    docs = list(collection.find({'url': {'$in': url_list}}))

    # pull out title text and create index dictionary
    title_text = [doc['title'] for doc in docs]
    title_ind = {title: i for i, title in enumerate(title_text)}

    # parse words in titles
    texts = [[
        global_stemmer.stem(word)
        for word in title.translate(translator).lower().split()
        if word not in stopwords
    ] for title in title_text]
    # create gensim corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # save files
    pkl.dump(dictionary, open('{}_title_dictionary.pkl'.format(list_name),
                              'wb'))
    pkl.dump(tfidf, open('{}_title_tfidf.pkl'.format(list_name), 'wb'))
    pkl.dump(corpus_tfidf, open('{}_title_corpus.pkl'.format(list_name), 'wb'))
    pkl.dump(title_ind, open('{}_title_ind.pkl'.format(list_name), 'wb'))
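A short sketch of how the pickled artifacts written above might be loaded back for use with get_text_similarity; the file names mirror the format pattern and 'mylist' is a placeholder:

import pickle as pkl

list_name = 'mylist'  # placeholder for whatever list_name was used when saving

with open('{}_title_dictionary.pkl'.format(list_name), 'rb') as f:
    dictionary = pkl.load(f)
with open('{}_title_tfidf.pkl'.format(list_name), 'rb') as f:
    tfidf = pkl.load(f)
with open('{}_title_corpus.pkl'.format(list_name), 'rb') as f:
    corpus_tfidf = pkl.load(f)
with open('{}_title_ind.pkl'.format(list_name), 'rb') as f:
    title_ind = pkl.load(f)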
Example #4
def tokenize():
	stemmer = PorterStemmer()
	fd = open("wiki_text_raw.txt", 'r+')
	fd_tk = open("wiki_text_tokenized.txt", 'a+')
	for line in fd:
		wordlist = re.findall(r"[a-zA-Z]+", line)
		if len(wordlist) > 10:
			# Stem each word in the word list, then write the line out space-separated
			for index, word in enumerate(wordlist):
				wordlist[index] = stemmer.stem(word)
			for word in wordlist:
				fd_tk.write(word + ' ')
			fd_tk.write('\n')
	fd.close()
	fd_tk.close()
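The same stem-and-join step on an in-memory string, so it can be tried without the wiki dump files; this assumes the gensim PorterStemmer seen in the other examples on this page:

import re
from gensim.parsing import PorterStemmer

stemmer = PorterStemmer()
line = "The Quick Brown Foxes Are Jumping Over Two Very Lazy Dogs Again Today"
wordlist = re.findall(r"[a-zA-Z]+", line)
if len(wordlist) > 10:
    print(' '.join(stemmer.stem(word) for word in wordlist))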
Example #5
    def preprocess_text(self, text):
        """Apply preprocessing to a single text document. This should perform tokenization
        in addition to any other desired preprocessing steps.

        Args:
            text (str): document text read from plain-text file.

        Returns:
            iterable of str: tokens produced from `text` as a result of preprocessing.
        """
        for character_filter in self.character_filters:
            text = character_filter(text)

        tokens = self.tokenizer(text)
        for token_filter in self.token_filters:
            tokens = token_filter(tokens)

        if self.stem:
            p = PorterStemmer()
            tokens = [p.stem(token) for token in tokens]

        return tokens
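The character_filters, tokenizer and token_filters attributes are not shown in this snippet; a minimal sketch of how such a pipeline might be wired up, with hypothetical filter choices (lowercasing, whitespace tokenization, dropping short tokens):

from gensim.parsing import PorterStemmer

class SimplePreprocessor:
    """Hypothetical stand-in for the class that owns preprocess_text above."""

    def __init__(self, stem=True):
        self.character_filters = [str.lower]     # each filter: str -> str
        self.tokenizer = str.split               # str -> list of str
        self.token_filters = [lambda toks: [t for t in toks if len(t) > 2]]
        self.stem = stem

    def preprocess_text(self, text):
        for character_filter in self.character_filters:
            text = character_filter(text)
        tokens = self.tokenizer(text)
        for token_filter in self.token_filters:
            tokens = token_filter(tokens)
        if self.stem:
            p = PorterStemmer()
            tokens = [p.stem(token) for token in tokens]
        return tokens

print(SimplePreprocessor().preprocess_text("Stemming Reduces Related Word Forms"))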
Example #6
def similarity(a, b):
	text_a = a
	text_b = b
	a = a.split(' ')
	b = b.split(' ')
	for index, word in enumerate(a):
		a[index] = PorterStemmer().stem(word)
	for index, word in enumerate(b):
		b[index] = PorterStemmer().stem(word)
	model = word2vec.Word2Vec.load('word2vec_model.txt')

	b_cp = b[:]
	for word in b:
		if word not in model.vocab:
			del b_cp[b_cp.index(word)]

	a_cp = a[:]
	for word in a:
		if word not in model.vocab:
			del a_cp[a_cp.index(word)]

	return model.n_similarity(a_cp, b_cp)
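Note that model.vocab and model.n_similarity are the pre-1.0 gensim Word2Vec API; in current gensim they live on model.wv. A hedged sketch of the same check against a freshly trained toy model (assumes gensim >= 4, where the size argument is called vector_size), so it does not depend on 'word2vec_model.txt':

from gensim.models import word2vec

# Tiny toy corpus; in practice the model is loaded from disk as above.
sentences = [["cat", "sat", "on", "the", "mat"],
             ["dog", "sat", "on", "the", "rug"],
             ["cat", "and", "dog", "play"]]
model = word2vec.Word2Vec(sentences, vector_size=20, min_count=1, seed=1)

a = [w for w in "cat on the mat".split() if w in model.wv.key_to_index]
b = [w for w in "dog on the rug".split() if w in model.wv.key_to_index]

print(model.wv.n_similarity(a, b))  # cosine similarity of the two word sets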
Example #7
def preprocess(file, out):
    stemmer = PorterStemmer()
    f = open(file)
    outp = open(out, 'w')
    for line in f:
        out_line = ""
        for word in line.split():
            w = processToken(word, stemmer)
            if w != "":
                #w = re.sub('\[([^ \[\]\|]+)\|([^\[\]\|]+)\]', r'\1', w)
                out_line = out_line + w + " "
        # Replace wiki-style [target|label] links with just the link target
        out_line = re.sub(r'\[([^ \[\]\|]+)\|([^\[\]\|]+)\]', r'\1', out_line)
        outp.write(out_line + "\n")
    f.close()
    outp.close()
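processToken is not included in this example; a plausible, purely hypothetical implementation consistent with how it is called above (token plus stemmer in, empty string out for tokens to drop) might be:

import re

def processToken(word, stemmer):
    # Hypothetical helper: keep letters only, lowercase, then stem; '' means drop.
    word = re.sub(r'[^A-Za-z]', '', word).lower()
    if not word:
        return ""
    return stemmer.stem(word)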
Example #8
class TextPreprocessor:
    def __init__(self):
        self.stop = set(stopwords.words('english'))
        self.global_stemmer = PorterStemmer()

    # Stem the word. It returns base form of a word
    def stem(self, word):
        stemmed = self.global_stemmer.stem(word)
        return stemmed

    # Tokenizes a sentence: drops the '<stop>' marker and strips trailing '.' / ','
    def remove_stopwords(self, sentence):
        tokens = []

        for i in sentence.lower().split():
            # compare against the literal marker token, not substring membership
            if i != '<stop>':
                if i.endswith('.'):
                    i = i.replace(".", "")
                elif i.endswith(','):
                    i = i.replace(",", "")
                tokens.append(i)

        # if i not in self.stop and i not in '<stop>':
        # i = i.replace(",", "")
        # i = i.replace(".", "")
        #  i = self.stem(i)
        #  tokens.append(i)

        return tokens

    # Parses the file and generates the training set
    def parse_file(self, filename, is_student_answer):
        with open(filename) as f:
            content = f.readlines()
        if is_student_answer:
            file_list = []
            for line in content:
                line_list = self.remove_stopwords(line)
                file_list.append(line_list)
            return file_list
        else:
            file_dict = {}
            for line in content:
                line_list = self.remove_stopwords(line)
                file_dict[line_list.pop(0)] = line_list
            return file_dict
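A quick usage sketch for the class above. It assumes the imports the snippet relies on implicitly (from nltk.corpus import stopwords plus an NLTK or gensim PorterStemmer) and that the NLTK stopword corpus has been downloaded:

pre = TextPreprocessor()
print(pre.stem("running"))                                     # -> 'run'
print(pre.remove_stopwords("The cat sat, on the mat. <stop>"))
# -> ['the', 'cat', 'sat', 'on', 'the', 'mat']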
Example #9
class Tokenizer:

    def __init__(self):
        self.p = PorterStemmer()

    def parse(self, nl_path, code_path):
        return self.__combine(self.__parse_file(nl_path, True, True), self.__parse_file(code_path, False, True))

    @staticmethod
    def __combine(nl_dict, code_dict):
        ret = []
        for key in sorted([int(key) for key in nl_dict.keys()]):
            ret.append((nl_dict[str(key)], code_dict[str(key)], str(key)))
        return ret

    def __parse_file(self, file_path, rm_stopwords=False, stem=False):
        ret = {}
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if len(line) > 0:
                    p = line.index('\t')
                    idx = line[: p]
                    tokens = self.__get_tokens(line[p + 1:], rm_stopwords, stem)
                    ret[idx] = tokens
        return ret

    def __get_tokens(self, content, rm_stopwords=False, stem=False):
        words = [word for word in re.split('[^A-Za-z]+', content) if len(word) > 0]
        ret = []
        for word in words:
            ret += self.__camel_case_split(word)
        tmp = []
        for word in ret:
            if rm_stopwords:
                word = remove_stopwords(word)
            if len(word) > 0:
                if stem:
                    word = self.p.stem(word)
                tmp.append(word)
        ret = tmp
        return ret

    @staticmethod
    def __camel_case_split(word):
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        return [m.group(0).lower() for m in matches]
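The camel-case splitter is the interesting piece here; the same regex behaves like this outside the class (illustrative words only):

import re

def camel_case_split(word):
    matches = re.finditer(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
    return [m.group(0).lower() for m in matches]

print(camel_case_split("parseHTTPResponse"))  # -> ['parse', 'http', 'response']
print(camel_case_split("readFile"))           # -> ['read', 'file']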
Example #10
 def __init__(self):
     self.stop = set(stopwords.words('english'))
     self.global_stemmer = PorterStemmer()
Example #11
from gensim import corpora, models, similarities
from gensim.parsing import PorterStemmer
from numpy import array, sqrt

#Code for document preprocessing

#Build set of stopwords
f = open('stopwords')
stoplist = []
for line in f:
    stoplist.append(line[0:-1])
f.close()
stoplist = set(stoplist)

#Initialize stemmer
stemmer = PorterStemmer()


def to_vector_model(corpusfile, tfidf=False):
    """
    Convert a corpus to vector space model for processing

    If tfidf = True, returns a tf-idf based model

    Returns the model and dictionary of words.
    """

    f = open(corpusfile, "r")
    documents = f.readlines()
    f.close()
    #Remove stopwords and stem
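The snippet is cut off after the '#Remove stopwords and stem' comment; a hedged sketch of how the function plausibly continues, given the stoplist, stemmer and the tfidf flag in its signature (not the original author's code):

    texts = [[stemmer.stem(word) for word in document.lower().split()
              if word not in stoplist]
             for document in documents]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    if tfidf:
        tfidf_model = models.TfidfModel(corpus)
        return tfidf_model[corpus], dictionary
    return corpus, dictionary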
Example #12
from trained_model import google_model, get_word_vector
from nltk.corpus import opinion_lexicon
from sklearn.model_selection import cross_val_score
from os import listdir
from os.path import isfile, join
import sklearn.cross_validation
import re
import gensim, logging
from string import maketrans, translate
import string
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score, confusion_matrix
from termcolor import colored
from colorama import init
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.parsing import PorterStemmer
global_stemmer = PorterStemmer()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
countFor = 0
countAgainst = 0
countDefect = 0
countError = 0
countAllegation = 0
countAppreciation = 0
countCallForAction = 0
countYes = 0
countNo = 0

LabeledSentence = gensim.models.doc2vec.LabeledSentence
Example #13
def searchIndbFacebookSaved(search_value):
    for x in "and or it is the a".split():
        search_value.replace(" " + x + " ", "")
    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") +
                                    "%"))  #("%" + search_value + "%"))#
    idList = [
        result.order_by(dbFacebookSaved.date)[count - 1].id
        for count in range(result.count(), 0, -1)
    ]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print ".ilike"
    print idDict

    stemmer = PorterStemmer()
    search_value = search_value.split()
    search_valueRaw = list(search_value)
    if len(search_value) > 1:
        sumVector = model3['car'] * 0
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print "similarList (sumVector)"
        print similarList
        """
        for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.7 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(sum of vec)"
        """
        print "New search value after sumVec:"
        search_value += [
            similarList[i][0] for i in range(min(5, len(similarList))) if
            similarList[i][1] >= 0.72 and similarList[i][0] not in search_value
        ]
        print search_value

    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                listLengh = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = " from gensim_word2vec for relating to " if i == 0 else " from fasttext(CBOW) for relating to "
                for i in range(min(listLengh, len(similarList))):
                    if similarList[i][1] >= scoreThreshold and similarList[i][
                            0] not in search_value:
                        search_value.append(similarList[i][0])
                        search_valueR.append(similarList[i][0])
                        print "append " + similarList[i][
                            0] + tempText + searchTerm
        """
        if searchTerm.lower() in model.vocab:
            similarList = model.most_similar(searchTerm.lower())
            for i in range(min(3,len(similarList))):
                if similarList[i][1] >= 0.5 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from gensim_word2vec for relating to " + searchTerm
        if searchTerm.lower() in model2.vocab:
            similarList = model2.most_similar(searchTerm.lower())
            for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.55 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(CBOW) for relating to " + searchTerm
        """
    """
    print "search_value before stemming:"
    print search_value
    stemmer = PorterStemmer()
    search_value = [stemmer.stem(word) for word in search_value]
    search_value = list(set(search_value))
    search_valueR = [stemmer.stem(word) for word in search_valueR]
    search_valueR = list(set(search_valueR))
    print "search_value bafter stemming:"
    """
    print search_value

    for word in search_value:
        if word == stemmer.stem(
                word) or not stemmer.stem(word) in search_value:
            result = dbFacebookSaved.query.filter(
                dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(
                dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(
                dbFacebookSaved.summary.contains(word))
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5

            idList = [
                read_db_data_to_article(
                    result.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(result.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print ".title.contains(" + word + ")"
            print idDict

            idList = [
                read_db_data_to_article(
                    resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(resultKwd.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print ".keywords.contains(" + word + ")"
            print idDict

            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if not resultSummary.order_by(
                        dbFacebookSaved.date)[count - 1].id in idList and len(
                            preprocess_string(word)) > 0:
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count -
                                                                     1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim function that preprocesses a string, e.g. people -> peopl, Oranges -> orang
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                        if cumsum <= 0.6 and word in w:
                            idDict[article['id']] = idDict.get(
                                article['id'], 0) + 0.2 * weight
                            cumsum = cumsum + 0.2 * weight
            print ".summary.contains(" + word + ")"
            #idDict = adding_weight_to_dict(idDict, idList, 0.2)
            print idDict
        else:
            print "ignore " + word + " for " + stemmer.stem(word)
    return idDict
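The comment inside the loop above refers to gensim's preprocess_string; for reference, this real gensim helper can be exercised on its own like this:

from gensim.parsing.preprocessing import preprocess_string

print(preprocess_string("People are eating Oranges"))
# e.g. ['peopl', 'eat', 'orang'] -- lowercased, stopwords removed, Porter-stemmed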
Example #14
from typing import List, Set, Union

from gensim.parsing import PorterStemmer
from gensim.utils import tokenize

from Util import utils_
from gitMine.VCClasses import Commit, Issue, PullRequest

ps = PorterStemmer()


def merge_commit_title_and_desc(commit: Commit) -> str:
    if commit.title[-3:] == commit.desc[:3] == '...':
        return commit.title[:-3] + commit.desc[3:]
    return commit.title + commit.desc


def preprocess_text(text: str, stopwords_: Set[str], min_len) -> List[str]:
    return [ps.stem(tok).lower()
            for tok in split_compound_toks(tokenize(text, lowercase=False, deacc=True, errors='strict'))
            if tok not in stopwords_ and len(tok) > min_len]


def split_compound_toks(toks: List[str]) -> List[str]:
    """Method to split CamelCase and snake_case by first normalizing everything to snake_case and splitting on '_'"""
    result = list()
    for tok in toks:
        candidate = utils_.GitMineUtils.camel_to_snake(tok).translate({ord(c): ' ' for c in '_'})
        if not tok == candidate:
            result += [tok_ for tok_ in candidate.split(' ')]
        result.append(''.join(tok.split('_')))
    return result
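utils_.GitMineUtils.camel_to_snake is project-specific and not shown; a self-contained sketch of the same compound-token splitting, with a simple regex-based camel_to_snake stand-in (hypothetical, not the project's implementation):

import re

def camel_to_snake(tok):
    # Hypothetical stand-in for GitMineUtils.camel_to_snake
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', tok).lower()

def split_compound(tok):
    candidate = camel_to_snake(tok).replace('_', ' ')
    parts = candidate.split(' ') if candidate != tok else []
    return parts + [''.join(tok.split('_'))]

print(split_compound("getFileName"))  # -> ['get', 'file', 'name', 'getFileName']
print(split_compound("snake_case"))   # -> ['snake', 'case', 'snakecase']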
Example #15
        data['color'] = data.apply(
            lambda row: VARIETY_MAP.get(row['variety'], (-1, 'N/A'))[1],
            axis=1)
        data['class'] = data.apply(
            lambda row: VARIETY_MAP.get(row['variety'], (-1, 'N/A'))[0],
            axis=1)

    #Filter only on selected varieties
    if filter:
        data = data.loc[data['color'] != 'N/A']

    return data


##########################################################################################################
word_stemmer = PorterStemmer()


def trainWord2Vec(min_count=2, size=50, window=4):
    """
    Returns a trained word2vec model
    """
    #Load descriptions (i.e. word2vec training data)
    df = importData("data-raw/winemag-data_first150k.csv",
                    removeSpecialChars=True,
                    censor=False,
                    filter=True,
                    processDescriptions=True,
                    processOptions={
                        'removeContractions': False,
                        'removePunctuation': True,
Example #16
 def __init__(self):
     self.p = PorterStemmer()