Example #1
    @classmethod
    def setUpClass(cls):
        cls.num_ngrams = 5
        # build TF-IDF objects for the 'cold' and random reference corpora, then compare them via TermFocus
        cold_tfidf = tfidf_from_text(ReferenceData.cold_df,
                                     tokenizer=LemmaTokenizer(),
                                     ngram_range=(2, 3))
        random_tfidf = tfidf_from_text(ReferenceData.random_df,
                                       tokenizer=LemmaTokenizer(),
                                       ngram_range=(2, 3))
        cls.tfocus = TermFocus(cold_tfidf, random_tfidf)
Example #2
    @classmethod
    def setUpClass(cls):
        num_ngrams = 50
        min_n = 2
        max_n = 3
        max_df = 0.3
        ngram_range = (min_n, max_n)

        df = pd.read_pickle(FilePaths.us_patents_random_1000_pickle_name)
        tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
                                    tokenizer=StemTokenizer())

        doc_weights = list(np.ones(len(df)))

        # term weights - embeddings
        filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range, unbias=True)
        tfidf_mask_obj.update_mask(doc_weights, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # mask the tfidf matrix
        tfidf_matrix = tfidf_obj.tfidf_matrix
        tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
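        # drop any documents whose rows become entirely zero after masking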
        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

        cls.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
        term_score_tuples = cls.__tfidf_reduce_obj.extract_ngrams_from_docset('sum')
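        # build the terms graph from the num_ngrams highest-scoring terms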
        graph_obj = TermsGraph(term_score_tuples[:num_ngrams], cls.__tfidf_reduce_obj)
        graph = graph_obj.graph
        cls.__links = graph['links']
        cls.__nodes = graph['nodes']
Example #3
    @classmethod
    def setUpClass(cls):

        df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
        tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=(1, 3), max_document_frequency=0.1,
                                    tokenizer=LemmaTokenizer())
        nmf_topics = 5
        cls.__nmf = nmf_topic_modelling(nmf_topics, tfidf_obj.tfidf_matrix)
Example #4
    def init_mask(self, cpc, min_n, uni_factor=0.8):
        docs_mask_dict = {
            'filter_by': 'union',
            'cpc': cpc,
            'time': None,
            'cite': [],
            'columns': None,
            'date': None,
            'date_header': None
        }

        self.__tfidf_obj = tfidf_from_text(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                           max_document_frequency=self.__max_df, tokenizer=StemTokenizer())
        cpc_dict = utils.cpc_dict(self.__df)

        self.__dates = generate_year_week_dates(self.__df, docs_mask_dict['date_header'])
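        # doc_filters: one weight/flag per document, derived from the dates, CPC codes and mask settings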
        doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, cpc_dict, self.__df.shape[0]).doc_filters

        # term weights - embeddings
        filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor, unbias=True)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        self.__tfidf_mask = tfidf_mask_obj.tfidf_mask
Example #5
    def setUp(self):
        df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
        tfidf_obj = tfidf_from_text(df['abstract'],
                                    ngram_range=(1, 3),
                                    max_document_frequency=0.1,
                                    tokenizer=LemmaTokenizer())
        self.feature_names = tfidf_obj.feature_names
Example #6
    def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
                 cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
                 terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
                 emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):

        self.__emergence_index = emergence_index

        # load data
        self.__data_filename = data_filename
        self.__date_dict = docs_mask_dict['date']
        self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
        self.__timeseries_data = []

        self.__emergence_list = []
        self.__pick_method = pick_method
        # calculate or fetch tf-idf mat
        if cached_folder_name is None:
            dataframe = data_factory.get(data_filename)
            utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
            utils.remove_empty_documents(dataframe, text_header)

            self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header],
                                               ngram_range=ngram_range,
                                               max_document_frequency=max_df,
                                               tokenizer=LemmaTokenizer())
            tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
            self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

            if prefilter_terms != 0:
                tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
                term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
                num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

                feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

                number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
                self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
                number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
                print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                      f'to {number_of_ngrams_after:,}')

            self.__cpc_dict = utils.cpc_dict(dataframe)
            if docs_mask_dict['date_header'] is None:
                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
                self.__dates = None
            else:
                self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                                 docs_mask_dict['date_header'])
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

            utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
            utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
            utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)

        else:
            print(f'Reading document and TFIDF from pickle {cached_folder_name}')

            self.__cached_folder_name = path.join('cached', cached_folder_name)
            self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
            self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
            self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

            if self.__dates is not None:
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
                      f'to {max_date // 100}-{(max_date % 100):02d}')

            WordAnalyzer.init(
                tokenizer=LemmaTokenizer(),
                preprocess=lowercase_strip_accents_and_ownership,
                ngram_range=ngram_range)

        # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
        #  the original. We're really just filtering down.

        # todo: build up a list of functions to apply as document filters. all filters to have common args (c/o
        #  partialfunc if required) so we can then call them in sequence...
        #  from a combiner.
        #  each func just returns an array of bool (or 0/1)
        #  if union - create union combiner, else create intersection combiner. combiner = union if... else intersection
        #  weights = combiner(list of funcs, data set)
        #  combiner: if list is empty, return [1] * size; if single entry, return its array
        #  union: if more entries after single, add / or
        #  intersection: if more entries after single, multiply / and (see the commented sketch below)
        #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
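        # A minimal commented sketch of the combiner described above (hypothetical
        # helper, not part of the current pipeline): each document filter yields
        # one boolean per document; 'union' ORs the arrays, 'intersection' ANDs them.
        #
        #   def combine(filter_arrays, num_docs, union=True):
        #       if not filter_arrays:
        #           return np.ones(num_docs, dtype=bool)
        #       combined = np.asarray(filter_arrays[0], dtype=bool)
        #       for filter_array in filter_arrays[1:]:
        #           combined = (combined | filter_array) if union else (combined & filter_array)
        #       return combined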
        print(f'Applying documents filter...')
        # docs weights( column, dates subset + time, citations etc.)
        doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                      self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

        # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
        #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

        # todo: this is another weight function...

        # term weights - embeddings
        print(f'Applying terms filter...')
        filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
        term_weights = filter_terms_obj.ngram_weights_vec

        # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
        #  these operate directly on tfidf
        #  Hence return nothing - operate in place on tfidf.
        print(f'Creating a masked tfidf matrix from filters...')
        # tfidf mask ( doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
        # mask the tfidf matrix

        tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

        tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

        # todo: no advantage in classes - just create term_count and extract_ngrams as functions

        self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
        self.__timeseries_data = None

        # if other outputs
        self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                                  WordAnalyzer.stemmed_stop_word_set_n)

        # todo: no output method; just if statements to call output functions...?
        #  Only supply what they each directly require

        # todo: hence Pipeline then becomes a single function
        if not calculate_timeseries:
            return

        # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix
        print(f'Creating timeseries matrix...')
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
            self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
            [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
             self.__weekly_iso_dates] = self.__timeseries_data

            utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
            utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
            utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
        else:
            self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
            self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
            self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
            self.__term_ngrams = self.__tfidf_obj.feature_names

        self.__M = m_steps_ahead

        # TODO: define period from command line, then cascade through the code

        term_counts_per_week_csc = self.__term_counts_per_week.tocsc()
        self.__timeseries_quarterly = []
        self.__timeseries_intercept = []
        self.__timeseries_derivatives = []
        self.__timeseries_quarterly_smoothed = []
        self.__term_nonzero_dates = []

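        # aggregate the weekly patent counts for the whole corpus into quarterly totals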
        all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            self.__weekly_iso_dates, self.__number_of_patents_per_week)

        # find indexes for date-range
        min_date = max_date = None
        if self.__timeseries_date_dict is not None:
            min_date = self.__timeseries_date_dict['from']
            max_date = self.__timeseries_date_dict['to']

        min_i = 0
        max_i = len(all_quarters)

        for i, quarter in enumerate(all_quarters):
            if min_date is not None and min_date < quarter:
                break
            min_i = i

        for i, quarter in enumerate(all_quarters):
            if max_date is not None and max_date < quarter:
                break
            max_i = i
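        # __lims holds the quarter indexes bracketing the requested timeseries date range (if any)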
        self.__lims = [min_i, max_i]
        self.__timeseries_quarterly_smoothed = None if sma is None else []

        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                               desc='Calculating quarterly timeseries',
                               leave=False, unit_scale=True):
            row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
            weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
            non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates,
                                                                                                       row_values)
            non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
            self.__timeseries_quarterly.append(quarterly_values)

        if emergence_index == 'gradients' or sma == 'kalman':
            if cached_folder_name is None or not (
                    path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                    and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
                for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                         desc='smoothing quarterly timeseries with kalman filter',
                                                         leave=False, unit_scale=True,
                                                         total=len(self.__timeseries_quarterly)):
                    _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

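                    # smoothed state: row 0 holds the level, row 1 its first derivative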
                    smooth_series = smooth_series_s[0].tolist()[0]
                    smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                    self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                    derivatives = smooth_series_s[1].tolist()[0]
                    self.__timeseries_derivatives.append(derivatives)

                utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
                utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)

            else:
                self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                             self.__cached_folder_name)
                self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

        if sma == 'savgol':
            for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                         desc='savgol smoothing quarterly timeseries',
                                         leave=False, unit_scale=True):
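                # Savitzky-Golay smoothing over a 9-quarter window with a degree-2 polynomial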
                smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

        em = Emergence(all_quarterly_values[min_i:max_i])
        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                               leave=False, unit_scale=True):
            if term_weights[term_index] == 0.0:
                continue
            term_ngram = self.__term_ngrams[term_index]

            if self.__timeseries_quarterly_smoothed is not None:
                quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
            else:
                quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

            if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                    patents_per_quarter_threshold):
                continue

            if emergence_index == 'quadratic':
                escore = em.escore2(quarterly_values)
            elif emergence_index == 'porter':
                if not em.is_emergence_candidate(quarterly_values):
                    continue
                escore = em.calculate_escore(quarterly_values)
            elif emergence_index == 'gradients':
                derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
                escore = em.net_growth(quarterly_values, derivatives)
            else:
                weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
                escore = em.escore_exponential(weekly_values)

            self.__emergence_list.append((term_ngram, escore))

        nterms2 = min(nterms, len(self.__emergence_list))
        self.__emergence_list.sort(key=lambda emergence: -emergence[1])

        self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
        self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
        self.__declining.reverse()
        self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
Example #7
    def __init__(
        self,
        data_filename,
        docs_mask_dict,
        pick_method='sum',
        ngram_range=(1, 3),
        text_header='abstract',
        pickled_tfidf_folder_name=None,
        max_df=0.1,
        user_ngrams=None,
        prefilter_terms=0,
        terms_threshold=None,
        output_name=None,
        calculate_timeseries=None,
        m_steps_ahead=5,
        curves=True,
        nterms=50,
        minimum_patents_per_quarter=20,
    ):

        # load data
        self.__data_filename = data_filename
        self.__date_dict = docs_mask_dict['date']
        self.__timeseries_data = []

        self.__emergence_list = []
        self.__pick_method = pick_method
        # calculate or fetch tf-idf mat
        if pickled_tfidf_folder_name is None:

            dataframe = data_factory.get(data_filename)
            utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict,
                          text_header)
            utils.remove_empty_documents(dataframe, text_header)

            self.__tfidf_obj = tfidf_from_text(
                text_series=dataframe[text_header],
                ngram_range=ngram_range,
                max_document_frequency=max_df,
                tokenizer=LemmaTokenizer())
            tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                       ngram_range=ngram_range,
                                       uni_factor=0.8,
                                       unbias=True)
            self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

            if prefilter_terms != 0:
                tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix,
                                               self.__tfidf_obj.feature_names)
                term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(
                    pick_method)
                num_tuples_to_retain = min(prefilter_terms,
                                           len(term_score_tuples))

                feature_subset = sorted(
                    [x[1] for x in term_score_tuples[:num_tuples_to_retain]])

                number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
                self.__tfidf_obj = tfidf_subset_from_features(
                    self.__tfidf_obj, feature_subset)
                number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
                print(
                    f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                    f'to {number_of_ngrams_after:,}')

            self.__cpc_dict = utils.cpc_dict(dataframe)
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(
                dataframe, docs_mask_dict['date_header'])

            base_pickle_path = path.join('outputs', 'tfidf')
            makedirs(base_pickle_path, exist_ok=True)

            def pickle_object(short_name, obj):
                folder_name = path.join(base_pickle_path,
                                        output_name + f'-mdf-{max_df}')
                makedirs(folder_name, exist_ok=True)
                file_name = path.join(
                    folder_name,
                    output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
                with bz2.BZ2File(file_name, 'wb') as pickle_file:
                    pickle.dump(obj,
                                pickle_file,
                                protocol=4,
                                fix_imports=False)

            pickle_object('tfidf', self.__tfidf_obj)
            pickle_object('dates', self.__dates)
            pickle_object('cpc_dict', self.__cpc_dict)

        else:
            print(
                f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}'
            )

            base_folder = path.basename(pickled_tfidf_folder_name)
            pickled_base_file_name = path.join(pickled_tfidf_folder_name,
                                               base_folder)

            self.__tfidf_obj = read_pickle(pickled_base_file_name +
                                           '-tfidf.pkl.bz2')
            self.__dates = read_pickle(pickled_base_file_name +
                                       '-dates.pkl.bz2')
            self.__cpc_dict = read_pickle(pickled_base_file_name +
                                          '-cpc_dict.pkl.bz2')

            if self.__dates is not None:
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                print(
                    f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
                    f'to {max_date // 100}-{(max_date % 100):02d}')

            WordAnalyzer.init(tokenizer=LemmaTokenizer(),
                              preprocess=lowercase_strip_accents_and_ownership,
                              ngram_range=ngram_range)

        # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
        #  the original. We're really just filtering down.

        # todo: build up a list of functions to apply as document filters. all filters to have common args (c/o
        #  partialfunc if required) so we can then call them in sequence...
        #  from a combiner.
        #  each func just returns an array of bool (or 0/1)
        #  if union - create union combiner, else create intersection combiner. combiner = union if... else intersection
        #  weights = combiner(list of funcs, data set)
        #  combiner: if list is empty, return [1] * size; if single entry, return its array
        #  union: if more entries after single, add / or
        #  intersection: if more entries after single, multiply / and
        #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
        print(f'Applying documents filter...')
        # docs weights( column, dates subset + time, citations etc.)
        doc_filters = DocumentsFilter(
            self.__dates, docs_mask_dict, self.__cpc_dict,
            self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

        # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
        #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

        # todo: this is another weight function...

        # term weights - embeddings
        print(f'Applying terms filter...')
        filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names,
                                       user_ngrams,
                                       threshold=terms_threshold)
        term_weights = filter_terms_obj.ngram_weights_vec

        # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
        #  these operate directly on tfidf
        #  Hence return nothing - operate in place on tfidf.
        print(f'Creating a masked tfidf matrix from filters...')
        # tfidf mask ( doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                   ngram_range=ngram_range,
                                   uni_factor=0.8)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
        # mask the tfidf matrix

        tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

        tfidf_masked, self.__dates = utils.remove_all_null_rows_global(
            tfidf_masked, self.__dates)
        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

        # todo: no advantage in classes - just create term_count and extract_ngrams as functions

        self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked,
                                              self.__tfidf_obj.feature_names)
        self.__timeseries_data = None

        # if other outputs
        self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(
            pick_method)
        self.__term_score_tuples = utils.stop_tup(
            self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
            WordAnalyzer.stemmed_stop_word_set_n)

        # todo: no output method; just if statements to call output functions...?
        #  Only supply what they each directly require

        # todo: hence Pipeline then becomes a single function
        if not calculate_timeseries:
            return

        print(f'Creating timeseries matrix...')
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(
            self.__dates)
        [
            self.__term_counts_per_week, self.__term_ngrams,
            self.__number_of_patents_per_week, self.__weekly_iso_dates
        ] = self.__timeseries_data

        self.__M = m_steps_ahead

        term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

        em = Emergence(self.__number_of_patents_per_week)
        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]),
                               unit='term',
                               desc='Calculating eScore',
                               leave=False,
                               unit_scale=True):
            term_ngram = self.__term_ngrams[term_index]
            row_indices, row_values = utils.get_row_indices_and_values(
                term_counts_per_week_csc, term_index)

            if len(row_values) == 0:
                continue

            weekly_iso_dates = [
                self.__weekly_iso_dates[x] for x in row_indices
            ]

            _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
                weekly_iso_dates, row_values)
            if max(quarterly_values) < minimum_patents_per_quarter:
                continue

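            # porter-style escore when curves is False, quadratic escore2 when curves is True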
            if em.init_vars(row_indices, row_values, porter=not curves):
                escore = em.calculate_escore() if not curves else em.escore2()
                self.__emergence_list.append((term_ngram, escore))

        nterms2 = min(nterms, len(self.__emergence_list))
        self.__emergence_list.sort(key=lambda emergence: -emergence[1])

        self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
        self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
        self.__stationary = utils.stationary_terms(self.__emergence_list,
                                                   nterms2)
Example #8
    def test_table(self):
        max_n = 3
        min_n = 2

        ngram_multiplier = 4

        num_ngrams_report = 25
        num_ngrams_wordcloud = 25

        num_ngrams = max(num_ngrams_report, num_ngrams_wordcloud)

        tfidf_cold = tfidf_from_text(ReferenceData.cold_df,
                                     tokenizer=LemmaTokenizer(),
                                     ngram_range=(min_n, max_n))
        tfidf_random = tfidf_from_text(ReferenceData.random_df,
                                       tokenizer=LemmaTokenizer(),
                                       ngram_range=(min_n, max_n))

        citation_count_dict = {
            1: 10,
            2: 3,
            101: 2,
            102: 0,
            103: 5,
            104: 4,
            105: 10
        }

        args = FakeArgs()

        args.pick = 'sum'
        args.time = False
        args.focus = 'chi2'

        register_writer(TestTableOutput.FakeWriter)
        fake_writer = TestTableOutput.FakeWriter('spreadsheet.fake')

        table_output(tfidf_cold, tfidf_random, num_ngrams, args,
                     ngram_multiplier, fake_writer, citation_count_dict)

        # Check sheet headings...
        self.assertListEqual([
            None, 'Term', 'Score', 'Rank', 'Focus chi2 Score',
            'Focus chi2 Rank', 'Diff Base to Focus Rank', 'Time Score',
            'Time Rank', 'Diff Base to Time Rank', 'Citation Score',
            'Citation Rank', 'Diff Base to Citation Rank'
        ], fake_writer.sheets['Summary'][0])

        self.assertListEqual([None, 'Term', 'Score', 'Rank'],
                             fake_writer.sheets['Base'][0])
        self.assertListEqual(
            [None, 'Term', 'Focus chi2 Score', 'Focus chi2 Rank'],
            fake_writer.sheets['Focus'][0])
        self.assertListEqual([None, 'Term', 'Time Score', 'Time Rank'],
                             fake_writer.sheets['Time'][0])
        self.assertListEqual([None, 'Term', 'Citation Score', 'Citation Rank'],
                             fake_writer.sheets['Cite'][0])

        # Base sheet should match summary sheet
        for y in range(25):
            for x in range(4):
                self.assertEqual(fake_writer.sheets['Summary'][y + 1][x],
                                 fake_writer.sheets['Base'][y + 1][x])