def test_from_date_not_after_to_date(self, mock_path):
    self.setUpTest(mock_path)
    self.args.date_from = '2018/03/10'
    self.args.date_to = '2019/03/10'
    args_checker = ArgsChecker(self.args, self.args_default)

    args_checker.checkargs()

def test_from_date_after_to_date(self, mock_path):
    self.setUpTest(mock_path)
    self.args.date_from = '2019/03/11'
    self.args.date_to = '2019/03/10'
    args_checker = ArgsChecker(self.args, self.args_default)

    try:
        args_checker.checkargs()
        self.fail('Should have detected date_from > date_to')
    except PygramsException as pe:
        self.assertEqual(f"date_from '{self.args.date_from}' cannot be after date_to '{self.args.date_to}'",
                         pe.message, 'Messages do not match')

def test_from_date_incorrect_format(self, mock_path):
    self.setUpTest(mock_path)
    bogus_date = '03/11'
    self.args.date_from = bogus_date
    args_checker = ArgsChecker(self.args, self.args_default)

    try:
        args_checker.checkargs()
        self.fail('Should have detected erroneous date format')
    except PygramsException as pe:
        self.assertEqual(f"date_from defined as '{bogus_date}' which is not in YYYY/MM/DD format",
                         pe.message, 'Messages do not match')

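# Style note: the try/except/self.fail pattern used above can also be written
# with unittest's assertRaises context manager, which fails automatically when
# no exception is raised. A minimal sketch of an equivalent test (hypothetical
# test name; it assumes the same ArgsChecker behaviour and error message
# verified in test_from_date_incorrect_format):
def test_from_date_incorrect_format_with_assert_raises(self, mock_path):
    self.setUpTest(mock_path)
    bogus_date = '03/11'
    self.args.date_from = bogus_date
    args_checker = ArgsChecker(self.args, self.args_default)

    with self.assertRaises(PygramsException) as context:
        args_checker.checkargs()
    self.assertEqual(f"date_from defined as '{bogus_date}' which is not in YYYY/MM/DD format",
                     context.exception.message, 'Messages do not match')
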
def main(supplied_args):
    paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
             os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
    for path in paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(supplied_args)
    args_default = get_args([])
    argscheck = ArgsChecker(args, args_default)
    argscheck.checkargs()

    outputs = args.output[:]
    outputs.append('json_config')
    outputs.append('report')
    if args.term_counts:
        outputs.append('termcounts')
    if args.n_nmf_topics > 0:
        outputs.append('nmf')
    if args.timeseries:
        outputs.append('emergence_report')

    docs_mask_dict = argscheck.get_docs_mask_dict()
    terms_mask_dict = argscheck.get_terms_mask_dict()

    doc_source_file_name = os.path.join(args.path, args.doc_source)

    if args.use_cache is None:
        pickled_tfidf_folder_name = None
    else:
        pickled_tfidf_folder_name = args.use_cache

    pipeline = Pipeline(doc_source_file_name, docs_mask_dict, pick_method=args.pick,
                        ngram_range=(args.min_ngrams, args.max_ngrams), text_header=args.text_header,
                        cached_folder_name=pickled_tfidf_folder_name,
                        max_df=args.max_document_frequency, user_ngrams=args.search_terms,
                        prefilter_terms=args.prefilter_terms, terms_threshold=args.search_terms_threshold,
                        output_name=args.outputs_name, calculate_timeseries=args.timeseries,
                        m_steps_ahead=args.steps_ahead, emergence_index=args.emergence_index,
                        exponential=args.exponential_fitting, nterms=args.nterms,
                        patents_per_quarter_threshold=args.minimum_per_quarter, sma=args.smoothing_alg)

    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name,
                    nterms=args.num_ngrams_report, n_nmf_topics=args.n_nmf_topics)

    # emtech integration
    if args.timeseries:
        if 0 in args.predictor_names:
            algs_codes = list(range(1, 7))
        else:
            algs_codes = args.predictor_names

        if isinstance(algs_codes, int):
            predictors_to_run = [predictor_names[algs_codes]]
        else:
            predictors_to_run = [predictor_names[i] for i in algs_codes]

        for emergence in ['emergent', 'declining']:
            print(f'Running pipeline for "{emergence}"')

            if args.normalised:
                title = 'Forecasts Evaluation: Normalised Counts' if args.test else 'Forecasts: Normalised Counts'
            else:
                title = 'Forecasts Evaluation' if args.test else 'Forecasts'

            title += f' ({emergence})'

            html_results, training_values = pipeline.run(predictors_to_run, normalized=args.normalised,
                                                         train_test=args.test, emergence=emergence)

            if training_values is not None:
                # save training_values to csv file:
                #
                # training_values:                            csv file:
                # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
                #                                             'term2', 2, 4, 1, 3
                #
                filename = os.path.join('outputs', 'emergence',
                                        args.outputs_name + '_' + emergence + '_time_series.csv')
                with open(filename, 'w') as f:
                    w = csv.writer(f)
                    # training_values is documented above as a dict of term -> counts,
                    # so iterate over its items
                    for key, values in training_values.items():
                        my_list = ["'" + str(key) + "'"] + values
                        w.writerow(my_list)

            html_doc = f'''<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{title}</title>
  </head>
  <body>
    <h1>{title}</h1>
    {html_results}
  </body>
</html>
'''

            base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

            if args.normalised:
                base_file_name += '_normalised'

            if args.test:
                base_file_name += '_test'

            html_filename = base_file_name + '.html'

            with open(html_filename, 'w') as f:
                f.write(html_doc)

            print()

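# Usage sketch: main() takes the raw argument list rather than reading sys.argv
# itself, so the module can be run as a script or driven programmatically (e.g.
# from tests). A minimal entry-point guard, assuming this function lives in the
# top-level pygrams entry module:
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
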
def test_from_date_correct_format(self, mock_path):
    self.setUpTest(mock_path)
    self.args.date_from = '2018/03/10'
    args_checker = ArgsChecker(self.args, self.args_default)

    args_checker.checkargs()

def main(supplied_args):
    paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
             os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
    for path in paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(supplied_args)
    args_default = get_args([])
    argscheck = ArgsChecker(args, args_default)
    argscheck.checkargs()

    outputs = args.output[:]
    outputs.append('json_config')
    outputs.append('report')
    if args.term_counts:
        outputs.append('termcounts')

    docs_mask_dict = argscheck.get_docs_mask_dict()
    terms_mask_dict = argscheck.get_terms_mask_dict()

    doc_source_file_name = os.path.join(args.path, args.doc_source)

    if args.input_tfidf is None:
        pickled_tf_idf_path = None
    else:
        pickled_tf_idf_path = os.path.join('outputs', 'tfidf', args.input_tfidf)

    pipeline = Pipeline(doc_source_file_name, docs_mask_dict, pick_method=args.pick,
                        ngram_range=(args.min_ngrams, args.max_ngrams),
                        normalize_rows=args.normalize_doc_length, text_header=args.text_header,
                        max_df=args.max_document_frequency, term_counts=args.term_counts,
                        user_ngrams=args.search_terms, pickled_tf_idf_file_name=pickled_tf_idf_path,
                        output_name=args.outputs_name, emerging_technology=args.emerging_technology)

    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name,
                    nterms=args.num_ngrams_report)

    # emtech integration
    if args.emerging_technology:
        from scripts.pipeline import PipelineEmtech

        if 0 in args.predictor_names:
            algs_codes = list(range(1, len(predictor_names)))
        else:
            algs_codes = args.predictor_names

        if isinstance(algs_codes, int):
            predictors_to_run = [predictor_names[algs_codes]]
        else:
            predictors_to_run = [predictor_names[i] for i in algs_codes]

        term_counts_data = pipeline.term_counts_data

        pipeline_emtech = PipelineEmtech(term_counts_data, m_steps_ahead=args.steps_ahead,
                                         curves=args.curve_fitting, nterms=args.nterms,
                                         minimum_patents_per_quarter=args.minimum_per_quarter,
                                         outname=args.outputs_name)

        for emergence in ['emergent', 'stationary', 'declining']:
            print(f'Running pipeline for "{emergence}"')

            if args.normalised:
                title = 'Forecasts Evaluation: Normalised Counts' if args.test else 'Forecasts: Normalised Counts'
            else:
                title = 'Forecasts Evaluation' if args.test else 'Forecasts'

            title += f' ({emergence})'

            html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised,
                                               train_test=args.test, emergence=emergence)

            html_doc = f'''<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{title}</title>
  </head>
  <body>
    <h1>{title}</h1>
    {html_results}
  </body>
</html>
'''

            base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

            if args.normalised:
                base_file_name += '_normalised'

            if args.test:
                base_file_name += '_test'

            html_filename = base_file_name + '.html'

            with open(html_filename, 'w') as f:
                f.write(html_doc)

            print()

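# Illustration of the predictor-selection convention shared by both versions of
# main() above: a 0 anywhere in --predictor_names expands to "run every
# predictor", while any other codes index into predictor_names (whose entry 0
# is the "all predictors" placeholder). A standalone sketch with hypothetical
# predictor names, for illustration only:
def select_predictors(codes, names):
    """Map user-supplied codes to predictor names; 0 expands to all of them."""
    if 0 in codes:
        codes = list(range(1, len(names)))
    return [names[i] for i in codes]

# Example (hypothetical names):
#   select_predictors([0], ['All', 'Naive', 'Linear', 'Quadratic'])
#     -> ['Naive', 'Linear', 'Quadratic']
#   select_predictors([1, 3], ['All', 'Naive', 'Linear', 'Quadratic'])
#     -> ['Naive', 'Quadratic']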