def init_mask(self, cpc, min_n, uni_factor=0.8):
    docs_mask_dict = {
        'filter_by': 'union',
        'cpc': cpc,
        'time': None,
        'cite': [],
        'columns': None,
        'date': None,
        'date_header': ''
    }
    self.__tfidf_obj = TFIDF(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                             max_document_frequency=self.__max_df, tokenizer=StemTokenizer())

    doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_weights
    doc_weights = DocumentsWeights(self.__df, docs_mask_dict['time'], docs_mask_dict['cite'],
                                   docs_mask_dict['date_header']).weights
    # combine per-document filter values and weights element-wise
    doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

    # term weights - embeddings
    filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
    term_weights = filter_output_obj.ngram_weights_vec

    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    self.__tfidf_mask = tfidf_mask_obj.tfidf_mask

@classmethod
def setUpClass(cls):
    num_ngrams = 50
    min_n = 2
    max_n = 3
    max_df = 0.3
    ngram_range = (min_n, max_n)

    df = pd.read_pickle(FilePaths.us_patents_random_1000_pickle_name)
    tfidf_obj = TFIDF(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
                      tokenizer=StemTokenizer())

    # uniform document weights: no document is up- or down-weighted
    doc_weights = list(np.ones(len(df)))

    # term weights - embeddings
    filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
    term_weights = filter_output_obj.ngram_weights_vec

    tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # mask the tfidf matrix
    tfidf_matrix = tfidf_obj.tfidf_matrix
    tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
    tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

    cls.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
    term_score_tuples = cls.__tfidf_reduce_obj.extract_ngrams_from_docset('sum')
    graph_obj = TermsGraph(term_score_tuples[:num_ngrams], cls.__tfidf_reduce_obj)
    graph = graph_obj.graph
    cls.__links = graph['links']
    cls.__nodes = graph['nodes']

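# The masking step above is an element-wise (Hadamard) product of two sparse
# matrices of the same shape: wherever the mask is zero, the corresponding TFIDF
# entry drops out of the result entirely. A minimal standalone sketch of that
# operation with SciPy, on toy matrices rather than the project's data:
import numpy as np
from scipy.sparse import csr_matrix

tfidf = csr_matrix(np.array([[0.5, 0.2, 0.0],
                             [0.0, 0.7, 0.1]]))
mask = csr_matrix(np.array([[1.0, 0.0, 1.0],
                            [0.0, 1.0, 1.0]]))

# element-wise product: masked-out (zero) entries vanish from the result
masked = mask.multiply(tfidf)
print(masked.toarray())
# [[0.5 0.  0. ]
#  [0.  0.7 0.1]]
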
def __init__(self, expected, tokenizer=StemTokenizer()):
    self.__tokenizer = tokenizer
    self.__expected_token_unigrams = []
    self.__expected_token_ngrams = []
    self.__expected_token_bigrams = []

    for expected_term in expected:
        tokens = self.__tokenizer(expected_term.lower())
        ngram = ' '.join(tokens)

        for token in tokens:
            self.__expected_token_unigrams.append(token)

        self.__expected_token_ngrams.append(ngram)
        if len(tokens) == 2:
            self.__expected_token_bigrams.append(ngram)

    # de-duplicate the collected unigrams
    self.__expected_token_unigrams = list(set(self.__expected_token_unigrams))

def tfidf_from_text(text_series, ngram_range=(1, 3), max_document_frequency=0.3, tokenizer=StemTokenizer()):
    WordAnalyzer.init(tokenizer=tokenizer, preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # TODO: add a user option to widen the count dtype from uint8 to uint16 or uint32
    vectorizer = CountVectorizer(max_df=max_document_frequency, min_df=1, ngram_range=ngram_range,
                                 analyzer=WordAnalyzer.analyzer, dtype=np.uint8)

    count_matrix = vectorizer.fit_transform(text_series)
    feature_names = vectorizer.get_feature_names()
    return _TFIDF(count_matrix, vectorizer.vocabulary_, feature_names)

def __init__(self, text_series, ngram_range=(1, 3), max_document_frequency=0.3, tokenizer=StemTokenizer()):
    WordAnalyzer.init(tokenizer=tokenizer, preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    self.__vectorizer = CountVectorizer(max_df=max_document_frequency, min_df=1, ngram_range=ngram_range,
                                        analyzer=WordAnalyzer.analyzer)
    self.__ngram_counts = self.__vectorizer.fit_transform(text_series)
    self.__feature_names = self.__vectorizer.get_feature_names()

    self.__tfidf_transformer = TfidfTransformer(smooth_idf=False)
    self.__tfidf_matrix = self.__tfidf_transformer.fit_transform(self.__ngram_counts)

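# A minimal usage sketch of the TFIDF class above, on toy data. It relies only
# on the feature_names and tfidf_matrix properties that other code in this
# section already reads; max_document_frequency is raised to 1.0 here because
# with only two documents the default 0.3 would prune every shared term:
import pandas as pd

abstracts = pd.Series(['A cooling system for an internal combustion engine.',
                       'An engine cooling fan with variable speed control.'])
tfidf_obj = TFIDF(abstracts, ngram_range=(1, 2), max_document_frequency=1.0)
print(tfidf_obj.feature_names[:10])
print(tfidf_obj.tfidf_matrix.shape)  # (2, number_of_ngram_features)
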
def test_stematizer(self):
    words = ['freezing', 'frozen', 'freeze', 'reading']
    stematizer = StemTokenizer()
    expected_words = ['freez', 'frozen', 'freez', 'read']
    actual_words = [stematizer(word)[0] for word in words]
    self.assertListEqual(expected_words, actual_words)

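# The expected stems above ('freez', 'frozen', 'freez', 'read') match what
# NLTK's PorterStemmer produces, which suggests (an assumption, not confirmed
# here) that StemTokenizer wraps a Porter stemmer. A quick standalone check:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print([stemmer.stem(w) for w in ['freezing', 'frozen', 'freeze', 'reading']])
# ['freez', 'frozen', 'freez', 'read']
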
def init_mask(self, cpc, min_n, uni_factor=0.8):
    docs_mask_dict = {
        'filter_by': 'union',
        'cpc': cpc,
        'time': None,
        'cite': [],
        'columns': None,
        'date': None,
        'date_header': None
    }
    self.__tfidf_obj = tfidf_from_text(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                       max_document_frequency=self.__max_df, tokenizer=StemTokenizer())

    cpc_dict = utils.cpc_dict(self.__df)
    self.__dates = generate_year_week_dates(self.__df, docs_mask_dict['date_header'])
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, cpc_dict, self.__df.shape[0]).doc_filters

    # term weights - embeddings
    filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
    term_weights = filter_output_obj.ngram_weights_vec

    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor,
                               unbias=True)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    self.__tfidf_mask = tfidf_mask_obj.tfidf_mask