Example #1
    def test_get_docs_mask_dict_date_from_to_None(self, mock_path):
        self.setUpTest(mock_path)
        self.args.date_from = None
        self.args.date_to = None
        args_checker = ArgsChecker(self.args, self.args_default)

        args_dict = args_checker.get_docs_mask_dict()

        self.assertIsNone(args_dict['date'])
Example #2
    def test_get_docs_mask_dict(self, mock_path):
        expected_date_from = 201910  # YYYYWW year-week: 2019/03/10 falls in ISO week 10 of 2019
        expected_date_to = 201911    # 2019/03/11 (a Monday) opens ISO week 11
        self.setUpTest(mock_path)
        self.args.date_from = pd.Timestamp('2019/03/10')
        self.args.date_to = pd.Timestamp('2019/03/11')
        args_checker = ArgsChecker(self.args, self.args_default)

        args_dict = args_checker.get_docs_mask_dict()

        self.assertEqual(expected_date_from, args_dict['date']['from'])
        self.assertEqual(expected_date_to, args_dict['date']['to'])
Example #3
    def test_get_docs_mask_dict(self, mock_path):
        expected_date_from = '2019/03/10'
        expected_date_to = '2019/03/11'
        self.setUpTest(mock_path)
        self.args.date_from = expected_date_from
        self.args.date_to = expected_date_to
        args_checker = ArgsChecker(self.args, self.args_default)

        args_dict = args_checker.get_docs_mask_dict()

        self.assertEqual(Timestamp(expected_date_from), args_dict['date']['from'])
        self.assertEqual(Timestamp(expected_date_to), args_dict['date']['to'])
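Read together, the three tests pin down the contract of ArgsChecker.get_docs_mask_dict: when no date bounds are supplied, args_dict['date'] is None; otherwise it holds a {'from': ..., 'to': ...} pair. (Each test method receives mock_path from a patch decorator that the listing omits, and Examples #2 and #3 evidently come from different revisions: one expects YYYYWW year-week integers, the other pandas Timestamps.) Below is a minimal sketch of the year-week variant of that contract; to_year_week and date_mask_entry are illustrative names, not part of ArgsChecker:

import pandas as pd

def to_year_week(date):
    # Encode a date as an ISO YYYYWW integer, e.g. 2019/03/10 -> 201910.
    iso_year, iso_week, _ = pd.Timestamp(date).isocalendar()
    return iso_year * 100 + iso_week

def date_mask_entry(date_from, date_to):
    # Mirrors what the tests assert: None when no bounds were supplied,
    # otherwise a {'from': ..., 'to': ...} pair in year-week form.
    if date_from is None and date_to is None:
        return None
    return {'from': to_year_week(date_from), 'to': to_year_week(date_to)}

assert date_mask_entry(None, None) is None
assert date_mask_entry('2019/03/10', '2019/03/11') == {'from': 201910, 'to': 201911}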
Example #4
def main(supplied_args):
    paths = [
        os.path.join('outputs', 'reports'),
        os.path.join('outputs', 'wordclouds'),
        os.path.join('outputs', 'table'),
        os.path.join('outputs', 'emergence')
    ]
    for path in paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(supplied_args)
    args_default = get_args([])
    argscheck = ArgsChecker(args, args_default)
    argscheck.checkargs()
    outputs = args.output[:]
    outputs.append('json_config')
    outputs.append('report')
    if args.term_counts:
        outputs.append('termcounts')
    if args.n_nmf_topics > 0:
        outputs.append('nmf')
    if args.timeseries:
        outputs.append('emergence_report')

    docs_mask_dict = argscheck.get_docs_mask_dict()
    terms_mask_dict = argscheck.get_terms_mask_dict()

    doc_source_file_name = os.path.join(args.path, args.doc_source)

    # args.use_cache is either None or the name of a cached tf-idf folder
    pickled_tfidf_folder_name = args.use_cache

    pipeline = Pipeline(doc_source_file_name,
                        docs_mask_dict,
                        pick_method=args.pick,
                        ngram_range=(args.min_ngrams, args.max_ngrams),
                        text_header=args.text_header,
                        cached_folder_name=pickled_tfidf_folder_name,
                        max_df=args.max_document_frequency,
                        user_ngrams=args.search_terms,
                        prefilter_terms=args.prefilter_terms,
                        terms_threshold=args.search_terms_threshold,
                        output_name=args.outputs_name,
                        calculate_timeseries=args.timeseries,
                        m_steps_ahead=args.steps_ahead,
                        emergence_index=args.emergence_index,
                        exponential=args.exponential_fitting,
                        nterms=args.nterms,
                        patents_per_quarter_threshold=args.minimum_per_quarter,
                        sma=args.smoothing_alg)

    pipeline.output(outputs,
                    wordcloud_title=args.wordcloud_title,
                    outname=args.outputs_name,
                    nterms=args.num_ngrams_report,
                    n_nmf_topics=args.n_nmf_topics)

    # emtech integration
    if args.timeseries:
        if 0 in args.predictor_names:
            # predictor code 0 is a sentinel that selects every predictor (codes 1..6)
            algs_codes = list(range(1, 7))
        else:
            algs_codes = args.predictor_names

        if isinstance(algs_codes, int):
            predictors_to_run = [predictor_names[algs_codes]]
        else:
            predictors_to_run = [predictor_names[i] for i in algs_codes]

        for emergence in ['emergent', 'declining']:
            print(f'Running pipeline for "{emergence}"')

            if args.normalised:
                title = 'Forecasts Evaluation: Normalised Counts' if args.test else 'Forecasts: Normalised Counts'
            else:
                title = 'Forecasts Evaluation' if args.test else 'Forecasts'

            title += f' ({emergence})'

            html_results, training_values = pipeline.run(
                predictors_to_run,
                normalized=args.normalised,
                train_test=args.test,
                emergence=emergence)
            if training_values is not None:
                # save training_values to csv file
                #
                # training_values is an iterable of (term, series) pairs:   csv file:
                # [('term1', [0, 2, 4, 6]), ('term2', [2, 4, 1, 3])]        'term1', 0, 2, 4, 6
                #                                                           'term2', 2, 4, 1, 3
                #
                filename = os.path.join(
                    'outputs', 'emergence',
                    args.outputs_name + '_' + emergence + '_time_series.csv')
                with open(filename, 'w', newline='') as f:  # newline='' avoids blank rows from csv on Windows
                    w = csv.writer(f)
                    for key, values in training_values:
                        my_list = ["'" + str(key) + "'"] + values
                        w.writerow(my_list)

            html_doc = f'''<!DOCTYPE html>
                <html lang="en">
                  <head>
                    <meta charset="utf-8">
                    <title>{title}</title>
                  </head>
                  <body>
                    <h1>{title}</h1>
                {html_results}
                  </body>
                </html>
                '''

            base_file_name = os.path.join('outputs', 'emergence',
                                          args.outputs_name + '_' + emergence)

            if args.normalised:
                base_file_name += '_normalised'

            if args.test:
                base_file_name += '_test'

            html_filename = base_file_name + '.html'

            with open(html_filename, 'w') as f:
                f.write(html_doc)

            print()
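Note that main receives the argument vector explicitly and hands it to get_args, so the script can be driven from tests as easily as from a shell. A minimal entry point, assuming main sits at module level of the script:

import sys

if __name__ == '__main__':
    main(sys.argv[1:])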
Example #5
def main(supplied_args):
    paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
             os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
    for path in paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(supplied_args)
    args_default = get_args([])
    argscheck = ArgsChecker(args, args_default)
    argscheck.checkargs()
    outputs = args.output[:]
    outputs.append('json_config')
    outputs.append('report')
    if args.term_counts:
        outputs.append('termcounts')

    docs_mask_dict = argscheck.get_docs_mask_dict()
    terms_mask_dict = argscheck.get_terms_mask_dict()

    doc_source_file_name = os.path.join(args.path, args.doc_source)

    if args.input_tfidf is None:
        pickled_tf_idf_path = None
    else:
        pickled_tf_idf_path = os.path.join('outputs', 'tfidf', args.input_tfidf)

    pipeline = Pipeline(doc_source_file_name, docs_mask_dict, pick_method=args.pick,
                        ngram_range=(args.min_ngrams, args.max_ngrams), normalize_rows=args.normalize_doc_length,
                        text_header=args.text_header, max_df=args.max_document_frequency,
                        term_counts=args.term_counts, user_ngrams=args.search_terms,
                        pickled_tf_idf_file_name=pickled_tf_idf_path,
                        output_name=args.outputs_name, emerging_technology=args.emerging_technology)

    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)

    # emtech integration
    if args.emerging_technology:
        from scripts.pipeline import PipelineEmtech

        if 0 in args.predictor_names:
            algs_codes = list(range(1, len(predictor_names)))
        else:
            algs_codes = args.predictor_names

        if isinstance(algs_codes, int):
            predictors_to_run = [predictor_names[algs_codes]]
        else:
            predictors_to_run = [predictor_names[i] for i in algs_codes]

        term_counts_data = pipeline.term_counts_data

        pipeline_emtech = PipelineEmtech(term_counts_data, m_steps_ahead=args.steps_ahead, curves=args.curve_fitting,
                                         nterms=args.nterms, minimum_patents_per_quarter=args.minimum_per_quarter,
                                         outname=args.outputs_name)

        for emergence in ['emergent', 'stationary', 'declining']:
            print(f'Running pipeline for "{emergence}"')

            if args.normalised:
                title = 'Forecasts Evaluation: Normalised Counts' if args.test else 'Forecasts: Normalised Counts'
            else:
                title = 'Forecasts Evaluation' if args.test else 'Forecasts'

            title += f' ({emergence})'

            html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test,
                                               emergence=emergence)

            html_doc = f'''<!DOCTYPE html>
                <html lang="en">
                  <head>
                    <meta charset="utf-8">
                    <title>{title}</title>
                  </head>
                  <body>
                    <h1>{title}</h1>
                {html_results}
                  </body>
                </html>
                '''

            base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

            if args.normalised:
                base_file_name += '_normalised'

            if args.test:
                base_file_name += '_test'

            html_filename = base_file_name + '.html'

            with open(html_filename, 'w') as f:
                f.write(html_doc)

            print()
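Both main variants share the same predictor-selection preamble, which is terse enough to deserve a restatement: code 0 is a sentinel meaning "run every predictor", and a bare int is wrapped into a list before the name lookup. A self-contained sketch, with a hypothetical names list standing in for the project's predictor_names:

def select_predictors(requested, names):
    # Code 0 expands to every predictor (indices 1..len(names)-1);
    # a bare int is wrapped so the final list comprehension always applies.
    codes = list(range(1, len(names))) if 0 in requested else requested
    if isinstance(codes, int):
        codes = [codes]
    return [names[i] for i in codes]

# Hypothetical predictor list; index 0 is the "all" sentinel implied above.
names = ['all', 'naive', 'linear', 'quadratic', 'cubic', 'arima']
assert select_predictors([0], names) == names[1:]
assert select_predictors([1, 3], names) == ['naive', 'quadratic']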