def candidate_weighting(self, df=None):
    """Candidate weighting function using document frequencies.

    Args:
        df (dict): document frequencies, the number of documents should
            be specified using the "--NB_DOC--" key.
    """

    # initialize default document frequency counts if none provided
    if df is None:
        df = load_document_frequency_file(self._df_counts, delimiter='\t')

    # initialize the number of documents as --NB_DOC-- + 1 (current)
    N = 1 + df.get('--NB_DOC--', 0)

    # loop through the candidates
    for k, v in self.candidates.items():

        # get candidate document frequency
        candidate_df = 1 + df.get(k, 0)

        # compute the idf score
        idf = math.log(N / candidate_df, 2)

        # add the idf score to the weights container
        self.weights[k] = len(v.surface_forms) * idf
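# Hedged usage sketch (illustration, not part of the library source): the
# standard pke workflow for TF*IDF weighting; the file paths are hypothetical.
#
#   import pke
#
#   df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#   extractor = pke.unsupervised.TfIdf()
#   extractor.load_document(input='document.txt', language='en')
#   extractor.candidate_selection(n=3)
#   extractor.candidate_weighting(df=df)
#   keyphrases = extractor.get_n_best(n=10)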
def candidate_weighting(self, df=None, sigma=3.0, alpha=2.3):
    """Candidate weight calculation as described in the KP-Miner paper.

    Note:
        w = tf * idf * B * P_f with

        * B = N_d / (P_d * alpha) and B = min(sigma, B)
        * N_d = the number of all candidate terms
        * P_d = number of candidates whose length exceeds one
        * P_f = 1

    Args:
        df (dict): document frequencies, the number of documents should
            be specified using the "--NB_DOC--" key.
        sigma (float): parameter for boosting factor, defaults to 3.0.
        alpha (float): parameter for boosting factor, defaults to 2.3.
    """

    # initialize default document frequency counts if none provided
    if df is None:
        logging.warning('LoadFile._df_counts is hard coded to {}'.format(
            self._df_counts))
        df = load_document_frequency_file(self._df_counts, delimiter='\t')

    # initialize the number of documents as --NB_DOC-- + 1 (current)
    N = 1 + df.get('--NB_DOC--', 0)

    # compute the number of occurrences of candidates whose length exceeds one
    P_d = sum([len(v.surface_forms) for v in self.candidates.values()
               if len(v.lexical_form) > 1])

    # fall back to 1 if all candidates are single words
    P_d = max(1, P_d)

    # compute the number of all candidate terms
    N_d = sum([len(v.surface_forms) for v in self.candidates.values()])

    # compute the boosting factor
    B = min(N_d / (P_d * alpha), sigma)

    # loop through the candidates
    for k, v in self.candidates.items():

        # get candidate document frequency
        candidate_df = 1

        # get the df for unigrams only
        if len(v.lexical_form) == 1:
            candidate_df += df.get(k, 0)

        # compute the idf score
        idf = math.log(N / candidate_df, 2)

        if len(v.lexical_form) == 1:
            # do not apply the boosting factor to single-word candidates
            self.weights[k] = len(v.surface_forms) * idf
        else:
            self.weights[k] = len(v.surface_forms) * B * idf
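# Worked example of the boosting factor (illustrative numbers, not from the
# source): with N_d = 100 candidate occurrences, P_d = 20 multi-word
# occurrences and alpha = 2.3, B = 100 / (20 * 2.3), about 2.17, which is
# below sigma = 3.0 and therefore not clipped; with N_d = 200 instead,
# 200 / 46 is about 4.35 and would be clipped to sigma = 3.0. A hedged usage
# sketch, assuming the standard pke KPMiner API and a hypothetical DF file:
#
#   import pke
#
#   df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#   extractor = pke.unsupervised.KPMiner()
#   extractor.load_document(input='document.txt', language='en')
#   extractor.candidate_selection(lasf=3, cutoff=400)
#   extractor.candidate_weighting(df=df, sigma=3.0, alpha=2.3)
#   keyphrases = extractor.get_n_best(n=10)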
def feature_extraction(self, df=None, training=False):
    """Extract features for each keyphrase candidate. Features are the
    tf*idf of the candidate and its first occurrence relative to the
    document.

    Args:
        df (dict): document frequencies, the number of documents should
            be specified using the "--NB_DOC--" key.
        training (bool): indicates whether features are computed for the
            training set for computing IDF weights, defaults to false.
    """

    # initialize default document frequency counts if none provided
    if df is None:
        logging.warning('LoadFile._df_counts is hard coded to {}'.format(
            self._df_counts))
        df = load_document_frequency_file(self._df_counts, delimiter='\t')

    # initialize the number of documents as --NB_DOC-- + 1 (current)
    N = df.get('--NB_DOC--', 0) + 1
    if training:
        N -= 1

    # find the maximum offset
    maximum_offset = float(sum([s.length for s in self.sentences]))

    for k, v in self.candidates.items():

        # get candidate document frequency
        candidate_df = 1 + df.get(k, 0)

        # hack for handling training documents
        if training and candidate_df > 1:
            candidate_df -= 1

        # compute the idf of the candidate
        idf = math.log(N / candidate_df, 2)

        # add the features to the instance container
        self.instances[k] = np.array([len(v.surface_forms) * idf,
                                      v.offsets[0] / maximum_offset])

    # scale features
    self.feature_scaling()
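# Hedged usage sketch (illustration, not part of the library source): Kea is
# supervised, so these features feed a trained model at weighting time; the
# model path is hypothetical and the calls assume the standard pke Kea API.
#
#   import pke
#
#   df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#   extractor = pke.supervised.Kea()
#   extractor.load_document(input='document.txt', language='en')
#   extractor.candidate_selection()
#   extractor.candidate_weighting(model_file='path/to/kea_model.pickle', df=df)
#   keyphrases = extractor.get_n_best(n=10)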
def feature_extraction(self, df=None, training=False, features_set=None):
    """Extract features for each candidate.

    Args:
        df (dict): document frequencies, the number of documents should
            be specified using the "--NB_DOC--" key.
        training (bool): indicates whether features are computed for the
            training set for computing IDF weights, defaults to false.
        features_set (list): the set of features to use, defaults to
            [1, 4, 6].
    """

    # define the default features_set
    if features_set is None:
        features_set = [1, 4, 6]

    # initialize default document frequency counts if none provided
    if df is None:
        df = load_document_frequency_file(self._df_counts, delimiter='\t')

    # initialize the number of documents as --NB_DOC-- + 1 (current)
    N = df.get('--NB_DOC--', 0) + 1
    if training:
        N -= 1

    # find the maximum offset
    maximum_offset = float(sum([s.length for s in self.sentences]))

    # loop through the candidates
    for k, v in self.candidates.items():

        # initialize features array
        feature_array = []

        # get candidate document frequency
        candidate_df = 1 + df.get(k, 0)

        # hack for handling training documents
        if training and candidate_df > 1:
            candidate_df -= 1

        # compute the idf of the candidate
        idf = math.log(N / candidate_df, 2)

        # [F1] -> TF*IDF
        feature_array.append(len(v.surface_forms) * idf)

        # [F2] -> TF
        feature_array.append(len(v.surface_forms))

        # [F3] -> term frequency of substrings
        tf_of_substrings = 0
        stoplist = stopwords.words(self.language)
        for i in range(len(v.lexical_form)):
            for j in range(i, min(len(v.lexical_form), i + 3)):
                sub_words = v.lexical_form[i:j + 1]
                sub_string = ' '.join(sub_words)

                # skip if substring is fullstring
                if sub_string == ' '.join(v.lexical_form):
                    continue

                # skip if substring contains a stopword
                if set(sub_words).intersection(stoplist):
                    continue

                # check whether the substring occurs "as is"
                if sub_string in self.candidates:

                    # loop through the substring offsets
                    for offset_1 in self.candidates[sub_string].offsets:
                        is_included = False
                        for offset_2 in v.offsets:
                            if offset_1 >= offset_2 and \
                                    offset_1 <= offset_2 + len(v.lexical_form):
                                is_included = True
                        if not is_included:
                            tf_of_substrings += 1

        feature_array.append(tf_of_substrings)

        # [F4] -> relative first occurrence
        feature_array.append(v.offsets[0] / maximum_offset)

        # [F5] -> relative last occurrence
        feature_array.append(v.offsets[-1] / maximum_offset)

        # [F6] -> length of phrases in words
        feature_array.append(len(v.lexical_form))

        # [F7] -> typeface
        feature_array.append(0)

        # extract information from sentence meta information
        meta = [self.sentences[sid].meta for sid in v.sentence_ids]

        # extract meta information of candidate
        sections = [u['section'] for u in meta if 'section' in u]
        types = [u['type'] for u in meta if 'type' in u]

        # [F8] -> Is in title
        feature_array.append('title' in sections)

        # [F9] -> TitleOverlap
        feature_array.append(0)

        # [F10] -> Header
        feature_array.append('sectionHeader' in types or
                             'subsectionHeader' in types or
                             'subsubsectionHeader' in types)

        # [F11] -> abstract
        feature_array.append('abstract' in sections)

        # [F12] -> introduction
        feature_array.append('introduction' in sections)

        # [F13] -> related work
        feature_array.append('related work' in sections)

        # [F14] -> conclusions
        feature_array.append('conclusions' in sections)

        # [F15] -> HeaderF
        feature_array.append(types.count('sectionHeader') +
                             types.count('subsectionHeader') +
                             types.count('subsubsectionHeader'))

        # [F16] -> abstractF
        feature_array.append(sections.count('abstract'))

        # [F17] -> introductionF
        feature_array.append(sections.count('introduction'))

        # [F18] -> related workF
        feature_array.append(sections.count('related work'))

        # [F19] -> conclusionsF
        feature_array.append(sections.count('conclusions'))

        # add the features to the instance container
        self.instances[k] = np.array([feature_array[i - 1]
                                      for i in features_set])

    # scale features
    self.feature_scaling()
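# Hedged note (illustration, not part of the library source): features_set
# holds 1-based indices into the 19-entry feature_array built above, so the
# default [1, 4, 6] keeps [F1] TF*IDF, [F4] relative first occurrence and
# [F6] phrase length. A minimal usage sketch, assuming the standard pke
# WINGNUS API and a hypothetical model path:
#
#   import pke
#
#   df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#   extractor = pke.supervised.WINGNUS()
#   extractor.load_document(input='document.xml', language='en')
#   extractor.candidate_selection()
#   extractor.candidate_weighting(model_file='path/to/wingnus_model.pickle',
#                                 df=df)
#   keyphrases = extractor.get_n_best(n=10)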
def extract_terms(core_nlp_folder):
    # compute document frequencies over the CoreNLP-annotated corpus
    compute_document_frequency(
        core_nlp_folder,
        os.path.join(INTERIM_DIR, "cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))

    log.info("Begin Extraction")
    n = 15
    cargo_df = load_document_frequency_file(
        os.path.join(INTERIM_DIR, "cargo_df.tsv.gz"))

    pke_factory = {
        "grammar": r"""
            NP: {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": terms.PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {"df": cargo_df}
            },
            "kpm": {
                "instance": terms.PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {"df": cargo_df}
            },
            "yake": {
                "instance": terms.PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {}
            },
            "singlerank": {
                "instance": terms.PKEBasedTermsExtractor(SingleRank),
                "weighting_params": {
                    "window": 10,
                    "pos": {"NOUN", "PROPN", "NUM", "ADJ"}
                }
            },
            "topicrank": {
                "instance": terms.PKEBasedTermsExtractor(TopicRank),
                "weighting_params": {}
            },
            "mprank": {
                "instance": terms.PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": terms.PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }

    # run every PKE-based extractor with its merged filtering parameters
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor = pke_factory["extractors"][name]["instance"]

        # extractor-specific filtering parameters override the common ones
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]

        extractor.extract(
            core_nlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]["weighting_params"],
            output_file=os.path.join(EXTRACTED_DIR, f"{name}.csv"),
            auto_term_file=f"data/annotations/automatic/terms/{name}.jsonl")

    # EmbedRank
    log.info("Begin Extraction with EmbedRank extractor")
    embedrank_extractor = terms.EmbedRankTermsExtractor(
        emdib_model_path="pretrain_models/torontobooks_unigrams.bin")
    embedrank_extractor.extract(
        core_nlp_folder,
        n,
        grammar=r"""
            NALL: {<NN|NNP|NNS|NNPS>}
            NP: {<NALL|CD|JJ>*<NALL>}
        """,
        considered_tags={'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'CD'},
        output_file=os.path.join(EXTRACTED_DIR, "torontobooks_unigrams.csv"))
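# Hedged usage sketch (not part of the original script): extract_terms expects
# a folder of CoreNLP-annotated documents and writes one CSV per extractor;
# the folder path below is hypothetical.
#
#   if __name__ == "__main__":
#       extract_terms("data/processed/corenlp_annotations")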