class FromDatasetsNamedEntitiesPredictor(Predictor):
    """Predicts named entities of a text by looking up terms in datasets.

    Each configured dataset maps an entity code to a location string. All
    loaded terms are kept in one DataFrame (columns: term, entity_code,
    parent_terms) and mirrored into a flashtext KeywordProcessor that does
    the actual inline annotation in predict_inline_annotations.
    """

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        # Initialize all mutable state per instance. These used to be
        # class-level attributes, which silently shared state between every
        # predictor instance (classic mutable-class-attribute bug).
        self.location_strings = {}  # entity_code -> location string
        self.dataset = pd.DataFrame(
            columns=["term", "entity_code", "parent_terms"], dtype=str)
        self.flashtext = None
        self.marked_for_removal = []  # list of (term, entity_code) tuples
        self.load_datasets(predictor_config["datasets"])

    @property
    def config_validation_schema_custom_part(self):
        """Schema fragment validating the 'datasets' list in the config."""
        return yaml.load(
            """
            datasets:
                type: list
                schema:
                    type: dict
                    schema:
                        code:
                            type: string
                            required: True
                        location:
                            type: string
                            regex: "^.+?:.+"
                            required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_datasets(self, entity_code_location_string_dict):
        """Load all configured datasets and (re)build the flashtext index.

        Args:
            entity_code_location_string_dict: iterable of dicts with keys
                'code' (entity code) and 'location' (location string).
        """
        for entry in entity_code_location_string_dict:
            entity_code = entry["code"]
            location_string = entry["location"]
            # remember location string so the dataset can be saved back later
            self.location_strings[entity_code] = location_string
            # load entities into dataset
            new_data = DatasetManager.load_dataset_from_location_string(
                location_string,
                {"term": str, "entity_code": str, "parent_terms": str})[0]
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent
            self.dataset = pd.concat([self.dataset, new_data])
        # rebuild flashtext from the complete dataset; terms with parent
        # terms are encoded as `term``PN``code``parents`´, standalone terms
        # as `term``SN``code`´
        self.flashtext = KeywordProcessor()
        data_for_flashtext = pd.DataFrame({
            "against": [
                "`{}``SN``{}`´".format(row["term"], row["entity_code"])
                if not row["parent_terms"] else
                "`{}``PN``{}``{}`´".format(row["term"], row["entity_code"],
                                           row["parent_terms"])
                for _, row in self.dataset.iterrows()
            ],
            "replace": self.dataset["term"],
        })
        dict_for_flashtext = data_for_flashtext.set_index(
            "against").T.to_dict("list")
        self.flashtext.add_keywords_from_dict(dict_for_flashtext)

    def add_named_entity_term_to_dataset(self, term, entity_code,
                                         parent_terms):
        """Add a single named entity term to the dataset and the index."""
        new_row = pd.DataFrame({
            "term": [term],
            "entity_code": [entity_code],
            "parent_terms": [parent_terms],
        })
        # pd.concat replaces the removed DataFrame.append
        self.dataset = pd.concat([self.dataset, new_row])
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict({
                "`{}``PN``{}``{}`´".format(term, entity_code, parent_terms):
                [term]
            })
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SN``{}`´".format(term, entity_code): [term]})

    def remove_named_entity_term_from_dataset(self, term, entity_code):
        """Remove a term/entity-code pair from the dataset and the index."""
        self.dataset = self.dataset[~(
            (self.dataset["term"] == term)
            & (self.dataset["entity_code"] == entity_code))]
        self.flashtext.remove_keyword(term)

    def save_dataset(self, location_string, entity_code):
        """Persist all terms of the given entity code to location_string."""
        # get the named entities with the specified entity code
        filtered_named_entities = self.dataset[
            self.dataset["entity_code"] == entity_code].copy()
        # sort the filtered named entities for convenience
        filtered_named_entities["sort"] = \
            filtered_named_entities["term"].str.lower()
        filtered_named_entities = filtered_named_entities.sort_values(
            by=["sort"])
        del filtered_named_entities["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(
            filtered_named_entities, location_string)

    def mark_named_entity_term_for_removal(self, term, entity_code):
        """Queue a (term, entity_code) pair for removal on the next learn."""
        if (term, entity_code) not in self.marked_for_removal:
            self.marked_for_removal.append((term, entity_code))

    def reset_marked_for_removal(self):
        """Clear the removal queue."""
        self.marked_for_removal = []

    def get_parent_terms_for_named_entity(self, term, entity_code):
        """Return the stored parent terms for the pair, or None if absent."""
        # check if we have corresponding parent terms in the dataset
        dataset_query_result = list(
            self.dataset[(self.dataset["entity_code"] == entity_code)
                         & (self.dataset["term"] == term)]["parent_terms"])
        if not dataset_query_result:
            # no parent terms found in dataset
            return None
        # return either the parent terms or None depending on the stored value
        parent_terms = dataset_query_result[0]
        return (None if parent_terms is None or pd.isnull(parent_terms)
                else parent_terms)

    def learn_from_annotated_text(self, annotated_text, language):
        """Update dataset, flashtext, and persisted files from annotations.

        Note: a "term" within this method is a tuple (term, entity_code).
        """
        # get terms to add/update
        terms_to_add = {}
        parented_terms_to_update = []
        affected_entity_codes = []
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=[
                    "standalone_named_entity", "parented_named_entity"
                ],
        ):
            # hoist the dataset lookup; both branches need the same rows
            matching_rows = self.dataset[
                (self.dataset["term"] == annotation["term"])
                & (self.dataset["entity_code"] == annotation["entity_code"])]
            if len(matching_rows) == 0:
                # term does not exist yet
                terms_to_add = merge_dict(
                    terms_to_add,
                    {
                        (annotation["term"], annotation["entity_code"]):
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
                affected_entity_codes.append(annotation["entity_code"])
            elif "parent_terms" in annotation:
                # term exists but may need update due to different parents
                currently_stored_parent_terms = list(
                    matching_rows["parent_terms"])[0]
                if currently_stored_parent_terms != annotation["parent_terms"]:
                    # needs update: re-add with the new parent terms
                    terms_to_add = merge_dict(
                        terms_to_add,
                        {
                            (annotation["term"], annotation["entity_code"]):
                            annotation["parent_terms"]
                            if "parent_terms" in annotation else ""
                        },
                    )
                    parented_terms_to_update.append(
                        (annotation["term"], annotation["entity_code"]))
                    affected_entity_codes.append(annotation["entity_code"])
        # get total terms to remove (terms queued for removal that are not
        # being re-added, plus terms that get updated parent terms)
        terms_to_remove = []
        for term in self.marked_for_removal:
            if term in terms_to_add:
                continue
            terms_to_remove.append(term)
            affected_entity_codes.append(term[1])
        terms_to_remove.extend(parented_terms_to_update)
        # update named entities dataset (incl. flashtext)
        for term in terms_to_remove:
            self.remove_named_entity_term_from_dataset(term[0], term[1])
        for term in terms_to_add:
            self.add_named_entity_term_to_dataset(term[0], term[1],
                                                  terms_to_add[term])
        # save each affected dataset exactly once (codes may repeat above)
        for affected_entity_code in set(affected_entity_codes):
            if affected_entity_code in self.location_strings:
                self.save_dataset(self.location_strings[affected_entity_code],
                                  affected_entity_code)

    def predict_inline_annotations(self, text, language="en-US"):
        """Annotate known terms inline; pass text through if no index yet."""
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform') # output ['product management', 'java'] # 删除关键词 keyword_processor = KeywordProcessor() keyword_dict = { "java": ["java_2e", "java programing"], "product management": ["PM", "product manager"] } keyword_processor.add_keywords_from_dict(keyword_dict) print( keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform')) # output ['product management', 'java'] keyword_processor.remove_keyword('java_2e') print( keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform')) # ['product management'] # you can also remove keywords from a list/ dictionary keyword_processor.remove_keywords_from_dict({"product management": ["PM"]}) keyword_processor.remove_keywords_from_list(["java programing"]) keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform') # output ['product management'] # 查询添加关键词的个数 keyword_processor = KeywordProcessor() # 字典格式的关键词,其对应的key为最终匹配出的词,但key不记入关键词搜索的范围
class FromDatasetKeyTermsPredictor(Predictor):
    """Predicts key terms of a text by looking up terms in a dataset.

    The dataset (columns: term, parent_terms) is kept in a DataFrame and
    mirrored into a flashtext KeywordProcessor that performs the actual
    inline annotation in predict_inline_annotations.
    """

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        # Per-instance state. These were class-level attributes before,
        # which shared mutable state across all predictor instances.
        self.location_string = None
        self.dataset = pd.DataFrame(columns=["term", "parent_terms"],
                                    dtype=str)
        self.flashtext = None
        self.key_terms_marked_for_removal = []
        self.load_dataset(predictor_config["location"])

    @property
    def config_validation_schema_custom_part(self):
        """Schema fragment validating the 'location' config entry."""
        return yaml.load(
            """
            location:
                type: string
                regex: "^.+?:.+"
                required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_dataset(self, location_string):
        """Load the key terms dataset and build the flashtext index.

        Terms with parent terms are encoded as `term``PK``parents`´,
        standalone terms as `term``SK`´.
        """
        # update location_string
        self.location_string = location_string
        # load data
        self.dataset = DatasetManager.load_dataset_from_location_string(
            location_string, {"term": str, "parent_terms": str})[0]
        # setup flashtext for later string replacements
        replace_against = self.dataset.copy()
        replace_against["replace"] = replace_against["term"]
        replace_against["against"] = replace_against["replace"]
        replace_against.loc[replace_against["parent_terms"] != "",
                            "against"] = ("`" + replace_against["term"] +
                                          "``PK``" +
                                          replace_against["parent_terms"] +
                                          "`´")
        replace_against.loc[replace_against["parent_terms"] == "",
                            "against"] = ("`" + replace_against["term"] +
                                          "``SK`´")
        replace_against = replace_against[["replace", "against"]]
        replace_against_as_dict = {
            row["against"]: [row["replace"]]
            for _, row in replace_against.iterrows()
        }
        self.flashtext = KeywordProcessor()
        self.flashtext.add_keywords_from_dict(replace_against_as_dict)

    def add_key_term_to_dataset(self, key_term, parent_terms):
        """Add a single key term to the dataset and the flashtext index."""
        new_row = pd.DataFrame({
            "term": [key_term],
            "parent_terms": [parent_terms]
        })
        # DataFrame.append was removed in pandas 2.0; use concat instead
        self.dataset = pd.concat([self.dataset, new_row])
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict(
                {"`{}``PK``{}`´".format(key_term, parent_terms): [key_term]})
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SK`´".format(key_term): [key_term]})

    def remove_key_term_from_dataset(self, key_term):
        """Remove a key term from the dataset and the flashtext index."""
        self.dataset = self.dataset[self.dataset.term != key_term]
        self.flashtext.remove_keyword(key_term)

    def save_dataset(self, location_string):
        """Sort the dataset by term and persist it to location_string."""
        # sort the key terms dataset for convenience
        self.dataset["sort"] = self.dataset["term"].str.lower()
        self.dataset = self.dataset.sort_values(by=["sort"])
        del self.dataset["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(
            self.dataset, location_string)

    def mark_key_term_for_removal(self, key_term):
        """Queue a key term for removal on the next learn pass."""
        if key_term not in self.key_terms_marked_for_removal:
            self.key_terms_marked_for_removal.append(key_term)

    def reset_key_terms_marked_for_removal(self):
        """Clear the removal queue."""
        self.key_terms_marked_for_removal = []

    def learn_from_annotated_text(self, annotated_text, language):
        """Update the dataset, flashtext, and persisted file from annotations."""
        # get terms to add/update
        key_terms_to_add = {}
        parented_terms_to_update = []
        existing_terms_list = list(self.dataset["term"])
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=["standalone_key_term", "parented_key_term"],
        ):
            if annotation["term"] not in existing_terms_list:
                # term does not exist yet
                key_terms_to_add = merge_dict(
                    key_terms_to_add,
                    {
                        annotation["term"]:
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
            elif "parent_terms" in annotation:
                # term exists but may need update due to different parents
                currently_stored_parent_terms = list(
                    self.dataset[self.dataset["term"] ==
                                 annotation["term"]]["parent_terms"])[0]
                if currently_stored_parent_terms != annotation["parent_terms"]:
                    # needs update: re-add with the new parent terms
                    key_terms_to_add = merge_dict(
                        key_terms_to_add,
                        {
                            annotation["term"]:
                            annotation["parent_terms"]
                            if "parent_terms" in annotation else ""
                        },
                    )
                    parented_terms_to_update.append(annotation["term"])
        # get total terms to remove (queued removals that are not re-added,
        # plus terms that get updated parent terms)
        key_terms_to_remove = [
            key_term for key_term in self.key_terms_marked_for_removal
            if key_term not in key_terms_to_add
        ]
        key_terms_to_remove.extend(parented_terms_to_update)
        # update key terms dataset (incl. flashtext)
        for key_term in key_terms_to_remove:
            self.remove_key_term_from_dataset(key_term)
        for key_term in key_terms_to_add:
            self.add_key_term_to_dataset(key_term,
                                         key_terms_to_add[key_term])
        # save
        self.save_dataset(self.location_string)

    def predict_inline_annotations(self, text, language="en-US"):
        """Annotate known terms inline; pass text through if no index yet."""
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
class PrejudiceSubPopulation(SubPopulation):
    r"""
    Filter samples based on gender bias

    for example in mode 'man'::

        sample 1: "There is a boy.", score: 1
        sample 2: "There is a girl.", score: 1
        sample 3: "There are boys and girls.", score: 0

    """

    def __init__(self, mode='man'):
        super().__init__()
        # validate before storing so an invalid mode never becomes state
        assert mode in ['man', 'woman', 'both', 'none'], \
            "Mode should be one in ['man', 'woman', 'both', 'none']"
        self.mode = mode
        man_phrases, woman_phrases = self.get_data(
            download_if_needed(PREJUDICE_PATH))
        man_phrases.extend(self.get_words(MAN_WORDS))
        woman_phrases.extend(self.get_words(WOMAN_WORDS))
        self.processor = KeywordProcessor(case_sensitive=True)
        self.processor.add_keywords_from_dict({"man": man_phrases})
        self.processor.add_keywords_from_dict({"woman": woman_phrases})
        # TODO
        self.processor.remove_keyword('My')

    def __repr__(self):
        return "PrejudiceSubpopulation" + "-" + self.mode

    @staticmethod
    def get_data(path):
        """Return (men, women) phrase lists from the name dictionary file."""
        # get the name dictionary; keep only the payload of the last record
        for dic in read_json(path):
            _, dic = dic
        return dic['men'], dic['women']

    @staticmethod
    def get_words(words):
        """Return words plus their UPPER and Title case variants."""
        tokens = []
        tokens.extend(words)
        tokens.extend([token.upper() for token in words])
        tokens.extend([token.title() for token in words])
        return tokens

    def word_match(self, texts, type):
        """Return True if any text contains a phrase of the given group.

        ``type`` shadows the builtin but is kept for interface compatibility.
        Any value other than 'man' is treated as 'woman' (as before).
        """
        keyword = 'man' if type == 'man' else 'woman'
        return any(keyword in self.processor.extract_keywords(text)
                   for text in texts)

    def _score(self, sample, fields, **kwargs):
        r"""
        1 or 0 indicate whether sample fields match mode and prejudice words

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample
        """
        texts = [sample.get_text(field) for field in fields]
        man_match = self.word_match(texts, type='man')
        woman_match = self.word_match(texts, type='woman')
        if self.mode == 'man':
            return man_match and not woman_match
        elif self.mode == 'woman':
            return woman_match and not man_match
        elif self.mode == 'both':
            return woman_match and man_match
        else:
            return not woman_match and not man_match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups and mode
        """
        return [
            sample for score, sample in zip(scores, dataset) if score
        ]
keyword_processor.add_keywords_from_dict(keyword_dict) # Or add keywords from a list: keyword_processor.add_keywords_from_list(["java", "python"]) keyword_processor.extract_keywords('I am a product manager for a java_2e platform') # output ['product management', 'java'] # 删除关键词 keyword_processor = KeywordProcessor() keyword_dict = { "java": ["java_2e", "java programing"], "product management": ["PM", "product manager"] } keyword_processor.add_keywords_from_dict(keyword_dict) print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform')) # output ['product management', 'java'] keyword_processor.remove_keyword('java_2e') print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform')) # ['product management'] # you can also remove keywords from a list/ dictionary keyword_processor.remove_keywords_from_dict({"product management": ["PM"]}) keyword_processor.remove_keywords_from_list(["java programing"]) keyword_processor.extract_keywords('I am a product manager for a java_2e platform') # output ['product management'] # 查询添加关键词的个数 keyword_processor = KeywordProcessor() # 字典格式的关键词,其对应的key为最终匹配出的词,但key不记入关键词搜索的范围 keyword_dict = { "java": ["java_2e", "java programing"], "product management": ["PM", "product manager"]