def setUpClass(cls):
    """Build one shared Pipeline over the sample CSV for the whole test class.

    The pipeline is created with bigram-to-trigram n-grams, a max_df of 0.3
    and a date window running from 1900-01-01 up to today; both the pipeline
    and its ``term_score_tuples`` are stored on the class.
    """
    # NOTE(review): unittest expects setUpClass to be a classmethod; no
    # decorator is visible in this excerpt -- confirm it exists in the file.
    week_today = date_to_year_week(pd.to_datetime('today').date())
    week_epoch = date_to_year_week(pd.to_datetime('1900-01-01').date())

    # 'date' and 'timeseries_date' carry equal bounds but are deliberately
    # built as two separate dict objects.
    mask_spec = {
        'filter_by': 'union',
        'cpc': None,
        'time': None,
        'cite': [],
        'columns': None,
        'date': {'to': week_today, 'from': week_epoch},
        'timeseries_date': {'to': week_today, 'from': week_epoch},
        'date_header': 'publication_date',
    }

    csv_path = os.path.join('data', 'USPTO-random-100.csv')
    cls.__pipeline = Pipeline(
        csv_path,
        mask_spec,
        ngram_range=(2, 3),
        text_header='abstract',
        max_df=0.3,
        output_name='test',
        calculate_timeseries=True,
    )
    cls.__term_score_tuples = cls.__pipeline.term_score_tuples
def test_filter_cpc_A61_intersection_dates(self):
    """Check the CPC 'A61' filter intersected with a 2010-06-01..today window.

    The shared mask dict is switched to 'intersection' mode with the CPC
    filter set to 'A61' and the date window set as above; the resulting
    ``doc_indices`` must contain exactly the expected document indices.
    """
    self.__docs_mask_dict['date'] = {
        'from': date_to_year_week(pd.Timestamp('2010/06/01')),
        'to': date_to_year_week(pd.to_datetime('today')),
    }
    self.__docs_mask_dict['filter_by'] = 'intersection'
    self.__docs_mask_dict['cpc'] = 'A61'

    # The literal 100 is presumably the number of documents in the sample
    # data set -- confirm against DocumentsFilter's signature.
    doc_ids = DocumentsFilter(
        self.__dates, self.__docs_mask_dict, self.__cpc_dict, 100
    ).doc_indices

    # The previous expected list (67, 69, 72, 74, 43, ...) was written in
    # hash-table iteration order, which is an interpreter implementation
    # detail. Sorting first keeps the check exact (same elements, same
    # multiplicity) without depending on how doc_indices happens to iterate.
    self.assertListEqual(
        sorted(doc_ids),
        [43, 50, 57, 60, 63, 67, 69, 72, 74, 81, 85, 90, 94],
    )
def test_filter_dates(self):
    """Check that a 2010-06-01..today date window selects indices 26 to 99.

    Only the 'date' entry of the shared mask dict is changed; every other
    filter setting is left as the fixture provided it.
    """
    window_start = date_to_year_week(pd.Timestamp('2010/06/01'))
    window_end = date_to_year_week(pd.to_datetime('today'))
    self.__docs_mask_dict['date'] = {'from': window_start, 'to': window_end}

    # The literal 100 is presumably the number of documents in the sample
    # data set -- confirm against DocumentsFilter's signature.
    selector = DocumentsFilter(
        self.__dates, self.__docs_mask_dict, self.__cpc_dict, 100
    )
    doc_ids = selector.doc_indices

    # Expected result is the contiguous run 26, 27, ..., 99.
    expected_ids = list(range(26, 100))
    self.assertListEqual(list(doc_ids), expected_ids)
def get_docs_mask_dict(self):
    """Translate the parsed arguments in ``self.args`` into a mask dict.

    Returns:
        A dict with the keys 'filter_by', 'cpc', 'cite', 'columns', 'date'
        and 'date_header'. 'cite' is always None. 'date' stays None unless
        at least one of ``date_to`` / ``date_from`` was supplied, in which
        case it is ``{'to': ..., 'from': ...}`` with both bounds converted
        by ``date_to_year_week``; a missing upper bound defaults to today
        and a missing lower bound to 1900-01-01.
    """
    opts = self.args
    mask = {
        'filter_by': opts.filter_by,
        'cpc': opts.cpc_classification,
        'cite': None,
        'columns': opts.filter_columns,
        'date': None,
        'date_header': opts.date_header,
    }

    # No date bound given at all: leave the date filter disabled.
    if opts.date_to is None and opts.date_from is None:
        return mask

    # NOTE(review): the "today" default is reduced with .date(), while every
    # other branch passes pd.to_datetime's result through unchanged --
    # confirm date_to_year_week treats both kinds of value the same way.
    if opts.date_to is None:
        upper_bound = pd.to_datetime('today').date()
    else:
        upper_bound = pd.to_datetime(opts.date_to)

    if opts.date_from is None:
        lower_bound = pd.to_datetime('1900-01-01')
    else:
        lower_bound = pd.to_datetime(opts.date_from)

    mask['date'] = {
        'to': date_to_year_week(upper_bound),
        'from': date_to_year_week(lower_bound),
    }
    return mask