示例#1
0
    def setUpClass(cls):
        """Prepare the fixtures shared by every test in this class.

        Loads the patents pickle named by
        ``FilePaths.us_patents_random_1000_pickle_name`` into a data frame,
        wraps it in a ``TFIDF`` instance stored as ``cls.tfidf_inst``, and
        stores on ``cls.citation_count_dict`` the citation counts merged from
        both pickled halves of the citation dictionary.
        """
        data_frame = PatentsPickle2DataFrame(
            FilePaths.us_patents_random_1000_pickle_name).data_frame
        cls.tfidf_inst = TFIDF(data_frame)

        cls.citation_count_dict = pd.read_pickle(
            FilePaths.us_patents_citation_dictionary_1of2_pickle_name)
        # Load the *second* half of the citation dictionary. Reading the
        # "1of2" pickle again here would make the update() below a no-op and
        # silently drop every citation stored in the other file.
        # NOTE(review): assumes FilePaths exposes the matching "2of2" name - confirm.
        citation_count_dict_pt2 = pd.read_pickle(
            FilePaths.us_patents_citation_dictionary_2of2_pickle_name)
        cls.citation_count_dict.update(citation_count_dict_pt2)
示例#2
0
def get_tfidf(args, pickle_file_name, cpc):
    """Build a TFIDF model over the patents selected by classification and years.

    Args:
        args: Options object; ``year_from``, ``year_to``, ``min_n`` and
            ``max_n`` are read from it.
        pickle_file_name: Handed to ``PatentsPickle2DataFrame`` as the pickle
            to load.
        cpc: Handed to ``PatentsPickle2DataFrame`` as ``classification``.

    Returns:
        A ``TFIDF`` instance created with a ``LemmaTokenizer`` and the n-gram
        range ``(args.min_n, args.max_n)``.
    """
    earliest = year2pandas_earliest_date(args.year_from)
    latest = year2pandas_latest_date(args.year_to)

    patents = PatentsPickle2DataFrame(
        pickle_file_name,
        classification=cpc,
        date_from=earliest,
        date_to=latest,
    )
    df = patents.data_frame

    # Validate the selection against the requested years before building the model.
    check_cpc_between_years(args, df)

    lemma_tokenizer = LemmaTokenizer()
    ngram_bounds = (args.min_n, args.max_n)
    return TFIDF(df, tokenizer=lemma_tokenizer, ngram_range=ngram_bounds)
    def test_filters_by_to_date_if_not_None(self):
        # With the window 1999-01-01 .. 2007-07-07 exactly one row ('patent 2')
        # is expected, and the date sift must be reported via print_func.
        self.print_output = ''

        def capture(message):
            # Record each reported message on the test instance, one per line.
            self.print_output += message + '\n'

        loader = PatentsPickle2DataFrame(self.test_pickle_name, '1999-01-01', '2007-07-07',
                                         pickle_reader=self.pickle_reader,
                                         print_func=capture)
        sifted = loader.data_frame

        expected_output = ('Sifting documents between 01-Jan-1999 and 07-Jul-2007\n'
                           '1 documents available after date sift\n')

        self.assertEqual(1, sifted.shape[0])
        self.assertEqual('patent 2', sifted.loc[0].invention_title)
        self.assertEqual(expected_output, self.print_output)
    def test_filters_by_from_date_if_not_None(self):
        # With the window 2003-01-22 .. 2020-02-01 two rows are expected
        # ('patent 1' then 'patent 3'), and the date sift must be reported
        # via print_func.
        self.print_output = ''

        def capture(message):
            # Record each reported message on the test instance, one per line.
            self.print_output += message + '\n'

        loader = PatentsPickle2DataFrame(self.test_pickle_name, '2003-01-22', '2020-02-01',
                                         pickle_reader=self.pickle_reader,
                                         print_func=capture)
        sifted = loader.data_frame

        expected_titles = ('patent 1', 'patent 3')
        expected_output = ('Sifting documents between 22-Jan-2003 and 01-Feb-2020\n'
                           '2 documents available after date sift\n')

        self.assertEqual(2, sifted.shape[0])
        for row_label, title in enumerate(expected_titles):
            self.assertEqual(title, sifted.loc[row_label].invention_title)
        self.assertEqual(expected_output, self.print_output)
    def test_filters_by_date_no_effect_if_None(self):
        # With both dates set to None all three rows are expected, in order,
        # and nothing must be reported via print_func.
        self.print_output = ''

        def capture(message):
            # Record anything reported; the test expects this to stay empty.
            self.print_output += message

        loader = PatentsPickle2DataFrame(self.test_pickle_name, None, None,
                                         pickle_reader=self.pickle_reader,
                                         print_func=capture)
        unfiltered = loader.data_frame

        expected_titles = ('patent 1', 'patent 2', 'patent 3')

        self.assertEqual(3, unfiltered.shape[0])
        for row_label, title in enumerate(expected_titles):
            self.assertEqual(title, unfiltered.loc[row_label].invention_title)
        self.assertEqual('', self.print_output)