Example #1
    def __init__(self,
                 core_term_path,
                 pretrain=True,
                 update=True,
                 fasttext_corpus_path=None):
        p = PorterStemmer()
        # Read, stem, and deduplicate the core terms, then sort them.
        with open(core_term_path, 'r') as f:
            self.core_terms = list(
                set([
                    p.stem(word.strip()) for word in f.readlines()
                    if len(word.strip()) > 0
                ]))
            self.core_terms.sort()
        self.word_embeddings = WordEmbeddings(pretrain, update,
                                              fasttext_corpus_path)
        # Assign each core term an index, starting from 2.
        self.core_term_dict = {}
        index = 2
        for core_term in self.core_terms:
            self.core_term_dict[core_term] = index
            index += 1
        with open(fasttext_corpus_path, 'r') as f:
            fasttext_corpus_content = f.readlines()
        # Every other non-empty line of the corpus file is treated as one document.
        documents = [
            line.strip().split()
            for idx, line in enumerate(fasttext_corpus_content)
            if idx % 2 == 0 and len(line.strip()) > 0
        ]
        # Build a gensim TF-IDF model and keep a token -> IDF lookup.
        dictionary = corpora.Dictionary(documents)
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        tfidf_model = models.TfidfModel(corpus)
        self.idfs = {
            dictionary[kv[0]]: kv[1]
            for kv in tfidf_model.idfs.items()
        }
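
Below is a minimal standalone sketch of the Dictionary/TfidfModel/idfs steps used in the constructor above, run on a small hypothetical toy corpus rather than the actual fasttext corpus file.

from gensim import corpora, models

documents = [["sort", "list", "python"], ["read", "file", "python"]]  # hypothetical toy corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
tfidf_model = models.TfidfModel(corpus)

# Map each token string to its inverse document frequency.
idfs = {dictionary[token_id]: idf for token_id, idf in tfidf_model.idfs.items()}
print(idfs)  # e.g. {'python': 0.0, 'list': 1.0, ...}
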
Example #2
class TextPreprocessor:
    def __init__(self):
        self.stop = set(stopwords.words('english'))
        self.global_stemmer = PorterStemmer()

    # Stem the word and return its base form.
    def stem(self, word):
        stemmed = self.global_stemmer.stem(word)
        return stemmed

    # Tokenizes a sentence and strips trailing punctuation (NLTK stop-word removal is currently commented out below)
    def remove_stopwords(self, sentence):
        tokens = []

        # Currently this only strips a trailing full stop or comma from each token
        for i in sentence.lower().split():
            if i != '<stop>':
                if i.endswith('.'):
                    i = i.replace(".", "")
                elif i.endswith(','):
                    i = i.replace(",", "")
                tokens.append(i)

        # if i not in self.stop and i not in '<stop>':
        # i = i.replace(",", "")
        # i = i.replace(".", "")
        #  i = self.stem(i)
        #  tokens.append(i)

        return tokens

    # Parse the file and generate the training set
    def parse_file(self, filename, is_student_answer):
        with open(filename) as f:
            content = f.readlines()
        if is_student_answer:
            file_list = []
            for line in content:
                line_list = self.remove_stopwords(line)
                file_list.append(line_list)
            return file_list
        else:
            file_dict = {}
            for line in content:
                line_list = self.remove_stopwords(line)
                file_dict[line_list.pop(0)] = line_list
            return file_dict
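
A minimal usage sketch of TextPreprocessor, assuming the NLTK stopwords corpus has been downloaded (e.g. via nltk.download('stopwords')) and that the surrounding module already provides the stopwords and PorterStemmer imports the class relies on.

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

tp = TextPreprocessor()
print(tp.stem("running"))                              # 'run'
print(tp.remove_stopwords("The cat sat on the mat."))  # ['the', 'cat', 'sat', 'on', 'the', 'mat']
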
Example #3
class Tokenizer:

    def __init__(self):
        self.p = PorterStemmer()

    def parse(self, nl_path, code_path):
        return self.__combine(self.__parse_file(nl_path, True, True), self.__parse_file(code_path, False, True))

    @staticmethod
    def __combine(nl_dict, code_dict):
        ret = []
        for key in sorted([int(key) for key in nl_dict.keys()]):
            ret.append((nl_dict[str(key)], code_dict[str(key)], str(key)))
        return ret

    # Each line of the input file is expected to be "<index>\t<content>".
    def __parse_file(self, file_path, rm_stopwords=False, stem=False):
        ret = {}
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if len(line) > 0:
                    p = line.index('\t')
                    idx = line[: p]
                    tokens = self.__get_tokens(line[p + 1:], rm_stopwords, stem)
                    ret[idx] = tokens
        return ret

    def __get_tokens(self, content, rm_stopwords=False, stem=False):
        words = [word for word in re.split('[^A-Za-z]+', content) if len(word) > 0]
        ret = []
        for word in words:
            ret += self.__camel_case_split(word)
        tmp = []
        for word in ret:
            if rm_stopwords:
                word = remove_stopwords(word)
            if len(word) > 0:
                if stem:
                    word = self.p.stem(word)
                tmp.append(word)
        ret = tmp
        return ret

    @staticmethod
    def __camel_case_split(word):
        # Split camelCase / PascalCase identifiers into lowercase sub-words.
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        return [m.group(0).lower() for m in matches]
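
The camel-case splitting regex above can be illustrated standalone; the identifier below is a hypothetical example.

import re

word = "getHTTPResponseCode"
matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
print([m.group(0).lower() for m in matches])  # ['get', 'http', 'response', 'code']
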
Example #4
def get_text_similarity(url_df, dictionary, tfidf, sims, inds, sim_type):
    '''
    calculate the tfidf similarity (title or body) between origin and target
    :param url_df: df of keywords for each url
    :param dictionary: tfidf dictionary
    :param tfidf: gensim tfidf model
    :param sims: tfidf similarity matrix
    :param inds: mapping from title/body text to its row index in the similarity matrix
    :param sim_type: 'title' or 'body'
    :return: dataframe of urls and similarity scores
    '''
    url_df[sim_type] = url_df[sim_type].fillna('')
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    # load all stopwords
    with open(
            '/Users/thyde/Documents/cloned_proj_moat/project_moat/stopwords.txt'
    ) as f:
        stopwords = f.read().split()

    # parse text into list of word stems
    texts = [[
        global_stemmer.stem(word)
        for word in text.translate(translator).lower().split()
        if word not in stopwords
    ] for text in url_df[sim_type].values]

    # calculate similarity score to target title
    sim_scores = []
    for text in texts:
        vec_bow = dictionary.doc2bow(text)
        vec_tfidf = tfidf[vec_bow]
        #import pdb;pdb.set_trace()
        res = sims[vec_tfidf]
        try:
            sim_scores.append(res[inds[url_df[
                url_df['origin'] == url_df['url']][sim_type].values[0]]])
        except KeyError:
            sim_scores.append(np.NaN)

    return pd.DataFrame([url_df['url'].values, sim_scores],
                        index=['url', 'title_similarity']).T
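
A hedged sketch of how the dictionary, tfidf, and sims arguments might be built with gensim before calling get_text_similarity; the toy texts and names below are illustrative assumptions, not taken from the original pipeline.

from gensim import corpora, models, similarities

texts = [["origin", "title", "tokens"], ["target", "title"]]  # hypothetical stemmed titles
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
sims = similarities.MatrixSimilarity(tfidf[corpus])

# Queries work the same way as inside the function above.
vec_tfidf = tfidf[dictionary.doc2bow(["title", "tokens"])]
print(sims[vec_tfidf])  # one similarity score per indexed document
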
Example #5
def title_sim_construction(url_list, list_name, collection):
    '''
    :param url_list: list of urls to get title similarity for
    :param list_name: list name for purposes of saving pkl file
    :param collection: mongo collection to query
    :return: nothing, just saves files
    '''
    # classic stopwords text file, will be in the repo
    with open('/Users/thyde/Downloads/stopwords.txt') as f:
        stopwords = f.read().split()

    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    #mc = MongoAtlasClient("mongodb://*****:*****@investopedia-shard-00-00-ibdgj.mongodb.net:27017,investopedia-shard-00-01-ibdgj.mongodb.net:27017,investopedia-shard-00-02-ibdgj.mongodb.net:27017/test?ssl=true&replicaSet=investopedia-shard-0&authSource=admin", "sSQXR9fVxNu2P0U5")
    #my_collection = mc['investopedia']['corpus']

    docs = list(collection.find({'url': {'$in': url_list}}))

    # pull out title text and create index dictionary
    title_text = [doc['title'] for doc in docs]
    title_ind = {title: i for i, title in enumerate(title_text)}

    # parse words in titles
    texts = [[
        global_stemmer.stem(word)
        for word in title.translate(translator).lower().split()
        if word not in stopwords
    ] for title in title_text]
    # create gensim corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # save files
    pkl.dump(dictionary, open('{}_title_dictionary.pkl'.format(list_name),
                              'wb'))
    pkl.dump(tfidf, open('{}_title_tfidf.pkl'.format(list_name), 'wb'))
    pkl.dump(corpus_tfidf, open('{}_title_corpus.pkl'.format(list_name), 'wb'))
    pkl.dump(title_ind, open('{}_title_ind.pkl'.format(list_name), 'wb'))
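
A minimal sketch of loading the pickled artifacts back and turning the saved corpus into a similarity index of the kind get_text_similarity above expects; list_name here is a hypothetical value.

import pickle as pkl
from gensim import similarities

list_name = 'example_list'  # hypothetical
dictionary = pkl.load(open('{}_title_dictionary.pkl'.format(list_name), 'rb'))
tfidf = pkl.load(open('{}_title_tfidf.pkl'.format(list_name), 'rb'))
corpus_tfidf = pkl.load(open('{}_title_corpus.pkl'.format(list_name), 'rb'))
title_ind = pkl.load(open('{}_title_ind.pkl'.format(list_name), 'rb'))
sims = similarities.MatrixSimilarity(corpus_tfidf)
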
Example #6
    def preprocess_text(self, text):
        """Apply preprocessing to a single text document. This should perform tokenization
        in addition to any other desired preprocessing steps.

        Args:
            text (str): document text read from plain-text file.

        Returns:
            iterable of str: tokens produced from `text` as a result of preprocessing.
        """
        for character_filter in self.character_filters:
            text = character_filter(text)

        tokens = self.tokenizer(text)
        for token_filter in self.token_filters:
            tokens = token_filter(tokens)

        if self.stem:
            p = PorterStemmer()
            tokens = [p.stem(token) for token in tokens]

        return tokens
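
The method above assumes self.character_filters, self.tokenizer, self.token_filters, and self.stem are configured on the class; below is a minimal standalone sketch of the same pipeline with illustrative filter choices (assumptions, not the original configuration).

import re
from nltk.stem import PorterStemmer

text = "Stemming removes word endings."
for character_filter in [str.lower]:          # assumed character filters
    text = character_filter(text)

tokens = re.findall(r'[a-z]+', text)          # assumed tokenizer
for token_filter in [lambda toks: [t for t in toks if len(t) > 2]]:  # assumed token filters
    tokens = token_filter(tokens)

p = PorterStemmer()
print([p.stem(token) for token in tokens])    # e.g. ['stem', 'remov', 'word', 'end']
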
Example #7
def searchIndbFacebookSaved(search_value):
    for x in "and or it is the a".split():
        search_value.replace(" " + x + " ", "")
    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") +
                                    "%"))  #("%" + search_value + "%"))#
    idList = [
        result.order_by(dbFacebookSaved.date)[count - 1].id
        for count in range(result.count(), 0, -1)
    ]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print ".ilike"
    print idDict

    stemmer = PorterStemmer()
    search_value = search_value.split()
    search_valueRaw = list(search_value)
    if len(search_value) > 1:
        # zero vector with the same dimensionality as the word embeddings
        sumVector = model3['car'] * 0
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print "similarList (sumVector)"
        print similarList
        """
        for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.7 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(sum of vec)"
        """
        print "New search value after sumVec:"
        search_value += [
            similarList[i][0] for i in range(min(5, len(similarList))) if
            similarList[i][1] >= 0.72 and similarList[i][0] not in search_value
        ]
        print search_value

    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                listLength = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = " from gensim_word2vec for relating to " if i == 0 else " from fasttext(CBOW) for relating to "
                for j in range(min(listLength, len(similarList))):
                    if similarList[j][1] >= scoreThreshold and similarList[j][
                            0] not in search_value:
                        search_value.append(similarList[j][0])
                        search_valueR.append(similarList[j][0])
                        print "append " + similarList[j][
                            0] + tempText + searchTerm
        """
        if searchTerm.lower() in model.vocab:
            similarList = model.most_similar(searchTerm.lower())
            for i in range(min(3,len(similarList))):
                if similarList[i][1] >= 0.5 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from gensim_word2vec for relating to " + searchTerm
        if searchTerm.lower() in model2.vocab:
            similarList = model2.most_similar(searchTerm.lower())
            for i in range(min(5,len(similarList))):
                if similarList[i][1] >= 0.55 and similarList[i][0] not in search_value:
                    search_value.append(similarList[i][0])
                    search_valueR.append(similarList[i][0])
                    print "append " + similarList[i][0] + " from fasttext(CBOW) for relating to " + searchTerm
        """
    """
    print "search_value before stemming:"
    print search_value
    stemmer = PorterStemmer()
    search_value = [stemmer.stem(word) for word in search_value]
    search_value = list(set(search_value))
    search_valueR = [stemmer.stem(word) for word in search_valueR]
    search_valueR = list(set(search_valueR))
    print "search_value bafter stemming:"
    """
    print search_value

    for word in search_value:
        if word == stemmer.stem(
                word) or stemmer.stem(word) not in search_value:
            result = dbFacebookSaved.query.filter(
                dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(
                dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(
                dbFacebookSaved.summary.contains(word))
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5

            idList = [
                read_db_data_to_article(
                    result.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(result.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print ".title.contains(" + word + ")"
            print idDict

            idList = [
                read_db_data_to_article(
                    resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(resultKwd.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print ".keywords.contains(" + word + ")"
            print idDict

            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if not resultSummary.order_by(
                        dbFacebookSaved.date)[count - 1].id in idList and len(
                            preprocess_string(word)) > 0:
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count -
                                                                     1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim function that preprocesses a string, e.g. people -> peopl, Oranges -> orang
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                        if cumsum <= 0.6 and word in w:
                            idDict[article['id']] = idDict.get(
                                article['id'], 0) + 0.2 * weight
                            cumsum = cumsum + 0.2 * weight
            print ".summary.contains(" + word + ")"
            #idDict = adding_weight_to_dict(idDict, idList, 0.2)
            print idDict
        else:
            print "ignore " + word + " for " + stemmer.stem(word)
    return idDict
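
For reference, a minimal sketch of the gensim preprocess_string behaviour the code and comments above rely on.

from gensim.parsing.preprocessing import preprocess_string

print(preprocess_string("People"))   # ['peopl']
print(preprocess_string("Oranges"))  # ['orang']
print(preprocess_string("the"))      # [] -> such words get the reduced 0.1 weight above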