def setUpClass(cls):
    """Prepare class-level fixtures shared by the tests in this class.

    Sets two class attributes:

    * ``cls.tfidf_inst`` -- a ``TFIDF`` built from the data frame loaded
      from ``FilePaths.us_patents_random_1000_pickle_name``.
    * ``cls.citation_count_dict`` -- the citation dictionary read from the
      first pickle part, updated in place with the second part.
    """
    data_frame = PatentsPickle2DataFrame(
        FilePaths.us_patents_random_1000_pickle_name).data_frame
    cls.tfidf_inst = TFIDF(data_frame)

    cls.citation_count_dict = pd.read_pickle(
        FilePaths.us_patents_citation_dictionary_1of2_pickle_name)
    # Fix: part two used to be read from the 1of2 pickle again, which made
    # the update() below a no-op and left half of the citations unloaded.
    # NOTE(review): assumes FilePaths exposes the matching 2of2 name - confirm.
    citation_count_dict_pt2 = pd.read_pickle(
        FilePaths.us_patents_citation_dictionary_2of2_pickle_name)
    cls.citation_count_dict.update(citation_count_dict_pt2)
def get_tfidf(args, pickle_file_name, cpc):
    """Load patents from a pickle and wrap them in a ``TFIDF`` instance.

    Args:
        args: Object providing ``year_from``, ``year_to``, ``min_n`` and
            ``max_n`` attributes.
        pickle_file_name: Pickle file handed to ``PatentsPickle2DataFrame``.
        cpc: Value passed to the loader as ``classification``.

    Returns:
        A ``TFIDF`` built over the loaded data frame using a
        ``LemmaTokenizer`` and an n-gram range of ``(min_n, max_n)``.
    """
    # Convert the requested years into the date bounds given to the loader.
    earliest = year2pandas_earliest_date(args.year_from)
    latest = year2pandas_latest_date(args.year_to)

    loader = PatentsPickle2DataFrame(
        pickle_file_name,
        classification=cpc,
        date_from=earliest,
        date_to=latest,
    )
    patents = loader.data_frame

    # Run the project's check on the loaded frame before building the model.
    check_cpc_between_years(args, patents)

    tokenizer = LemmaTokenizer()
    n_range = (args.min_n, args.max_n)
    return TFIDF(patents, tokenizer=tokenizer, ngram_range=n_range)
def test_filters_by_to_date_if_not_None(self):
    """A date window ending 2007-07-07 leaves only 'patent 2'."""
    self.print_output = ''

    def capture(message):
        # Record every line the loader reports, newline-terminated.
        self.print_output = self.print_output + message + '\n'

    loader = PatentsPickle2DataFrame(
        self.test_pickle_name,
        '1999-01-01',
        '2007-07-07',
        pickle_reader=self.pickle_reader,
        print_func=capture,
    )
    sifted = loader.data_frame

    expected_report = ('Sifting documents between 01-Jan-1999 and 07-Jul-2007\n'
                       '1 documents available after date sift\n')

    self.assertEqual(1, sifted.shape[0])
    self.assertEqual('patent 2', sifted.loc[0].invention_title)
    self.assertEqual(expected_report, self.print_output)
def test_filters_by_from_date_if_not_None(self):
    """A date window starting 2003-01-22 leaves 'patent 1' and 'patent 3'."""
    self.print_output = ''

    def capture(message):
        # Record every line the loader reports, newline-terminated.
        self.print_output = self.print_output + message + '\n'

    loader = PatentsPickle2DataFrame(
        self.test_pickle_name,
        '2003-01-22',
        '2020-02-01',
        pickle_reader=self.pickle_reader,
        print_func=capture,
    )
    sifted = loader.data_frame

    expected_report = ('Sifting documents between 22-Jan-2003 and 01-Feb-2020\n'
                       '2 documents available after date sift\n')

    self.assertEqual(2, sifted.shape[0])
    self.assertEqual('patent 1', sifted.loc[0].invention_title)
    self.assertEqual('patent 3', sifted.loc[1].invention_title)
    self.assertEqual(expected_report, self.print_output)
def test_filters_by_date_no_effect_if_None(self):
    """With both date bounds None, all three patents remain and nothing is printed."""
    self.print_output = ''

    def capture(message):
        # No newline is appended here; the test expects no output at all.
        self.print_output = self.print_output + message

    loader = PatentsPickle2DataFrame(
        self.test_pickle_name,
        None,
        None,
        pickle_reader=self.pickle_reader,
        print_func=capture,
    )
    unsifted = loader.data_frame

    self.assertEqual(3, unsifted.shape[0])
    self.assertEqual('patent 1', unsifted.loc[0].invention_title)
    self.assertEqual('patent 2', unsifted.loc[1].invention_title)
    self.assertEqual('patent 3', unsifted.loc[2].invention_title)
    self.assertEqual('', self.print_output)