Example #1
    def init_mask(self, cpc, min_n, uni_factor=0.8):
        docs_mask_dict = {
            'filter_by': 'union',
            'cpc': cpc,
            'time': None,
            'cite': [],
            'columns': None,
            'date': None,
            'date_header': ''
        }

        self.__tfidf_obj = TFIDF(self.__df['abstract'],
                                 ngram_range=(min_n, self.__max_n),
                                 max_document_frequency=self.__max_df,
                                 tokenizer=StemTokenizer())

        # per-document filters and weights, combined elementwise into one weight vector
        doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_weights
        doc_weights = DocumentsWeights(self.__df, docs_mask_dict['time'],
                                       docs_mask_dict['cite'],
                                       docs_mask_dict['date_header']).weights
        doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

        # term weights - embeddings
        filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                   ngram_range=(min_n, self.__max_n),
                                   uni_factor=uni_factor)
        tfidf_mask_obj.update_mask(doc_weights, term_weights)
        self.__tfidf_mask = tfidf_mask_obj.tfidf_mask
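
The mask built here is meant to be applied elementwise to the TF-IDF matrix, as Example #2 does below with tfidf_mask.multiply(tfidf_matrix). A minimal standalone sketch of that step, using made-up matrix values:

# Minimal sketch: apply the mask elementwise to the TF-IDF matrix,
# mirroring the tfidf_mask.multiply(tfidf_matrix) step in Example #2.
import numpy as np
from scipy.sparse import csr_matrix

tfidf_matrix = csr_matrix(np.array([[0.2, 0.4], [0.5, 0.0]]))  # made-up values
tfidf_mask = csr_matrix(np.array([[1.0, 0.0], [1.0, 1.0]]))    # made-up mask
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)               # elementwise product
print(tfidf_masked.toarray())  # [[0.2, 0.0], [0.5, 0.0]]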
Example #2
    @classmethod
    def setUpClass(cls):
        num_ngrams = 50
        min_n = 2
        max_n = 3
        max_df = 0.3
        ngram_range = (min_n, max_n)

        df = pd.read_pickle(FilePaths.us_patents_random_1000_pickle_name)
        tfidf_obj = TFIDF(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
                          tokenizer=StemTokenizer())

        doc_weights = list(np.ones(len(df)))

        # term weights - embeddings
        filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
        tfidf_mask_obj.update_mask(doc_weights, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # mask the tfidf matrix
        tfidf_matrix = tfidf_obj.tfidf_matrix
        tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

        cls.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
        term_score_tuples = cls.__tfidf_reduce_obj.extract_ngrams_from_docset('sum')
        graph_obj = TermsGraph(term_score_tuples[:num_ngrams], cls.__tfidf_reduce_obj)
        graph = graph_obj.graph
        cls.__links = graph['links']
        cls.__nodes = graph['nodes']
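
For reference, a standalone sketch of what the 'sum' reduction above plausibly computes (an assumption about TfidfReduce, not its actual code): summing each column of the masked matrix yields one aggregate score per ngram.

# Hypothetical illustration of extract_ngrams_from_docset('sum'):
# column sums of the masked TF-IDF give one aggregate score per term.
import numpy as np
from scipy.sparse import csr_matrix

tfidf_masked = csr_matrix(np.array([[0.2, 0.0], [0.5, 0.3]]))  # made-up values
feature_names = ['fuel cell', 'combust engin']
scores = np.asarray(tfidf_masked.sum(axis=0)).ravel()
term_score_tuples = sorted(zip(scores, feature_names), reverse=True)
print(term_score_tuples)  # [(0.7, 'fuel cell'), (0.3, 'combust engin')]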
Example #3
    def __init__(self, expected, tokenizer=StemTokenizer()):
        self.__tokenizer = tokenizer

        self.__expected_token_unigrams = []
        self.__expected_token_ngrams = []
        self.__expected_token_bigrams = []
        for expected_term in expected:
            tokens = self.__tokenizer(expected_term.lower())
            ngram = ' '.join(tokens)
            for token in tokens:
                self.__expected_token_unigrams.append(token)
            self.__expected_token_ngrams.append(ngram)
            if len(tokens) == 2:
                self.__expected_token_bigrams.append(ngram)
        self.__expected_token_unigrams = list(set(self.__expected_token_unigrams))
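
To make the bookkeeping concrete, here is the same loop run standalone with a plain whitespace tokenizer standing in for StemTokenizer (so tokens are unstemmed here):

# Same expectation-building logic with a whitespace tokenizer as a stand-in.
expected = ['fuel cell', 'fuel', 'combustion engine']
unigrams, ngrams, bigrams = set(), [], []
for expected_term in expected:
    tokens = expected_term.lower().split()
    ngram = ' '.join(tokens)
    unigrams.update(tokens)
    ngrams.append(ngram)
    if len(tokens) == 2:
        bigrams.append(ngram)
print(sorted(unigrams))  # ['cell', 'combustion', 'engine', 'fuel']
print(ngrams)            # ['fuel cell', 'fuel', 'combustion engine']
print(bigrams)           # ['fuel cell', 'combustion engine']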
Example #4
def tfidf_from_text(text_series,
                    ngram_range=(1, 3),
                    max_document_frequency=0.3,
                    tokenizer=StemTokenizer()):
    WordAnalyzer.init(tokenizer=tokenizer,
                      preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # TODO: add a user option to widen the dtype from uint8 to uint16 or uint32
    vectorizer = CountVectorizer(max_df=max_document_frequency,
                                 min_df=1,
                                 ngram_range=ngram_range,
                                 analyzer=WordAnalyzer.analyzer,
                                 dtype=np.uint8)

    count_matrix = vectorizer.fit_transform(text_series)
    feature_names = vectorizer.get_feature_names()

    return _TFIDF(count_matrix, vectorizer.vocabulary_, feature_names)
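
A minimal usage sketch, assuming the pygrams modules above are importable and the returned _TFIDF object exposes feature_names like the TFIDF class in the other examples; max_document_frequency is raised to 1.0 because with only two documents the 0.3 default would drop every term:

# Hypothetical call on a tiny series of abstracts.
import pandas as pd

abstracts = pd.Series([
    'A method for freezing biological samples.',
    'An apparatus for reading frozen sample labels.',
])
tfidf_obj = tfidf_from_text(abstracts, ngram_range=(1, 2), max_document_frequency=1.0)
print(tfidf_obj.feature_names)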
Example #5
    def __init__(self,
                 text_series,
                 ngram_range=(1, 3),
                 max_document_frequency=0.3,
                 tokenizer=StemTokenizer()):
        WordAnalyzer.init(tokenizer=tokenizer,
                          preprocess=lowercase_strip_accents_and_ownership,
                          ngram_range=ngram_range)

        self.__vectorizer = CountVectorizer(max_df=max_document_frequency,
                                            min_df=1,
                                            ngram_range=ngram_range,
                                            analyzer=WordAnalyzer.analyzer)

        self.__ngram_counts = self.__vectorizer.fit_transform(text_series)
        self.__feature_names = self.__vectorizer.get_feature_names()

        self.__tfidf_transformer = TfidfTransformer(smooth_idf=False)
        self.__tfidf_matrix = self.__tfidf_transformer.fit_transform(
            self.__ngram_counts)
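
As a standalone check of the pipeline this class wires together (raw counts, then TF-IDF), here is the same transformer on toy documents; with smooth_idf=False scikit-learn computes idf(t) = ln(n / df(t)) + 1:

# Counts -> TF-IDF with the same transformer settings as the class above,
# but with sklearn's default analyzer instead of WordAnalyzer.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ['freeze freeze read', 'read frozen']
counts = CountVectorizer().fit_transform(docs)
tfidf_matrix = TfidfTransformer(smooth_idf=False).fit_transform(counts)
print(np.round(tfidf_matrix.toarray(), 3))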
Example #6
    def test_stematizer(self):
        words = ['freezing', 'frozen', 'freeze', 'reading']
        stematizer = StemTokenizer()
        expected_words = ['freez', 'frozen', 'freez', 'read']
        actual_words = [stematizer(word)[0] for word in words]
        self.assertListEqual(expected_words, actual_words)
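
The expected outputs suggest Porter-style stemming. A hypothetical stand-in built from NLTK that reproduces these four cases (the real StemTokenizer may differ in detail):

# Hypothetical StemTokenizer stand-in: NLTK word tokenization + Porter stemming.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

class SimpleStemTokenizer:
    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in word_tokenize(doc)]

print([SimpleStemTokenizer()(w)[0] for w in ['freezing', 'frozen', 'freeze', 'reading']])
# ['freez', 'frozen', 'freez', 'read']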
Example #7
    def init_mask(self, cpc, min_n, uni_factor=0.8):
        docs_mask_dict = {
            'filter_by': 'union',
            'cpc': cpc,
            'time': None,
            'cite': [],
            'columns': None,
            'date': None,
            'date_header': None
        }

        self.__tfidf_obj = tfidf_from_text(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                           max_document_frequency=self.__max_df, tokenizer=StemTokenizer())
        cpc_dict = utils.cpc_dict(self.__df)

        self.__dates = generate_year_week_dates(self.__df, docs_mask_dict['date_header'])
        doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, cpc_dict, self.__df.shape[0]).doc_filters

        # term weights - embeddings
        filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor, unbias=True)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        self.__tfidf_mask = tfidf_mask_obj.tfidf_mask
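
Unlike Example #1, this version feeds doc_filters rather than combined weights into update_mask. The presumable effect (an assumption, not pygrams' actual code) is that filtered-out documents contribute nothing, i.e. their rows of the masked matrix become zero:

# Sketch: a 0/1 per-document filter scales rows of a TF-IDF matrix,
# so documents outside the CPC/date filter drop out entirely.
import numpy as np
from scipy.sparse import csr_matrix, diags

tfidf_matrix = csr_matrix(np.array([[0.2, 0.1], [0.5, 0.3], [0.0, 0.4]]))
doc_filters = [1.0, 0.0, 1.0]                 # made-up: keep docs 0 and 2
filtered = diags(doc_filters) @ tfidf_matrix  # row-scale; row 1 becomes all zeros
print(filtered.toarray())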