def evaluate_spacy(self, eval_data_container: GoldDataContainer):

    assert self.get_textcat_pipeline().cfg['exclusive_classes'] is not None

    scorer = self.nlp.evaluate(
        eval_data_container.get_in_spacy_format(),
        verbose=False,
    )

    # Manual alternative, kept for reference:
    # scorer = spacy.scorer.Scorer(pipeline=self.nlp.pipeline)
    #
    # for ed in eval_data_container.gold_data_item_list:
    #
    #     doc_with_cats = self.nlp(ed.text)  # tokenization + predictions
    #
    #     gold = spacy.gold.GoldParse(
    #         self.nlp.make_doc(ed.text),  # tokenization only, no predictions
    #         cats=ed.cats  # correct categories
    #     )
    #
    #     scorer.score(doc_with_cats, gold, verbose=True)

    self.log_trainer(
        "Spacy's scores: {\n"
        + f"    'textcat_score': {scorer.scores['textcat_score']}\n"
        + "    'textcats_per_cat': {\n"
        + ''.join([
            f"        '{name}': {value}\n"
            for name, value in scorer.scores['textcats_per_cat'].items()
        ])
        + "    }\n"
        + "}")

    return scorer.scores
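# A minimal usage sketch for evaluate_spacy, assuming the load/init helpers
# from main (load_gold_data, init_trainer) used in the run() scripts below;
# the function name and config parameters here are illustrative, not part of
# the original code.
def example_evaluate_spacy(config_load_model, config_eval_data):
    eval_data_container = main.load_gold_data(config_eval_data)
    trainer = main.init_trainer(config_load_model)
    scores = trainer.evaluate_spacy(eval_data_container)
    # scores['textcat_score'] is spaCy's overall textcat score;
    # scores['textcats_per_cat'] holds per-category precision/recall/F values.
    return scores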
def run():

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)

    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)

    main.run_training(config=ConfigTrain, trainer=trainer, gold_data_container=gdc)

    embed()
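# The load/transform/merge sequence above repeats once per config. A sketch of
# a loop-based equivalent, assuming every config can be handled uniformly by
# merge_assuming_identical_categories; Config1_1 needs its extra Config1_2
# transformation first, so it stays outside the loop. Illustrative only, not
# the original flow.
def build_merged_container():
    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)

    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    for config in [Config2, Config3, Config4, Config5, Config6]:
        gdc_next = main.load_gold_data(config)
        gdc_next = main.transform_gold_data(config, gdc_next)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_next)

    return gdc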
def create_cats_overview(
    gold_data_container: GoldDataContainer,
    root_coding_node: CodingNode,
    article_annotated_list: List[ArticleAnnotated]) -> GoldDataContainer:

    # TODO : remove this once sure it's not needed anymore
    def save_cats_hierarchy_into_dict(current_coding_node: CodingNode):
        cats_dict = {}
        for c in current_coding_node.children:
            cats_dict.update(save_cats_hierarchy_into_dict(c))
        return {current_coding_node.coding_value: cats_dict}

    # TODO : remove this once sure it's not needed anymore
    def save_cats_dict_into_list(cat_dict):
        current_leafs_list = []
        for k, v in cat_dict.items():
            if v != {}:
                current_leafs_list.extend(save_cats_dict_into_list(v))
            else:
                current_leafs_list.append(k)
        return current_leafs_list

    def filter_out_unused_cats(
        root_coding_node: CodingNode,
        article_annotated_list: List[ArticleAnnotated],
    ) -> List[str]:
        len_all_aa_list = len(article_annotated_list)
        all_cats_used_list = []
        for cn in root_coding_node.get_all_subnodes():
            len_aa_set = len(cn.article_annotated_set)
            if len_aa_set != 0 and len_aa_set != len_all_aa_list:
                if cn.coding_value in all_cats_used_list:
                    raise Exception(
                        "Category was already added to this list. Such redundancies could interfere later with training "
                        "where categories are used as keys for dictionaries.")
                all_cats_used_list.append(cn.coding_value)
        return all_cats_used_list

    gold_data_container.cats_list = filter_out_unused_cats(
        root_coding_node, article_annotated_list)

    return gold_data_container
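# A toy illustration of the helpers marked for removal above, assuming their
# logic were hoisted to module level: save_cats_hierarchy_into_dict turns a
# CodingNode tree into nested dicts, and save_cats_dict_into_list flattens
# such a dict back into its leaf category names. The function below restates
# the flattening logic so the example is self-contained.
def flatten_cats_dict_example():
    def save_cats_dict_into_list(cat_dict):
        # same recursion as the nested helper above
        leafs = []
        for k, v in cat_dict.items():
            if v != {}:
                leafs.extend(save_cats_dict_into_list(v))
            else:
                leafs.append(k)
        return leafs

    return save_cats_dict_into_list({"root": {"A": {}, "B": {"B1": {}}}})
    # -> ["A", "B1"]: inner nodes ("root", "B") are dropped, only leaves kept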
def transform_to_gold_data_articles(
    root_coding_node: CodingNode,
    article_annotated_list: List[ArticleAnnotated],
) -> GoldDataContainer:

    gold_data_container = GoldDataContainer()

    gold_data_container = create_cats_overview(gold_data_container,
                                               root_coding_node,
                                               article_annotated_list)

    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ) -> GoldDataContainer:

        gold_data_container.gold_data_item_list = []

        def get_cats_assigned(article_annotated, cats_list):
            article_cats_dict = {}
            relevant_cats = set(
                coding_dict["coding_node"].coding_value
                for coding_dict in article_annotated.coding_list)
            for cat in cats_list:
                if cat in relevant_cats:
                    article_cats_dict[cat] = 1
                else:
                    article_cats_dict[cat] = 0
            return article_cats_dict

        for article_annotated in article_annotated_list:
            gold_data_container.gold_data_item_list.append(
                GoldDataItem(
                    article_id=article_annotated.article_id,
                    text=article_annotated.article_file_content_cleaned,
                    cats=get_cats_assigned(article_annotated,
                                           gold_data_container.cats_list)))

        return gold_data_container

    gold_data_container = save_gold_data_into_container(
        gold_data_container=gold_data_container,
        article_annotated_list=article_annotated_list,
    )

    return gold_data_container
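# get_cats_assigned above builds a one-hot dict over all categories. A
# behavior-equivalent sketch as a dict comprehension (int(...) yields the same
# 0/1 values); illustrative only, not the original helper.
def get_cats_assigned_compact(article_annotated, cats_list):
    relevant_cats = set(
        coding_dict["coding_node"].coding_value
        for coding_dict in article_annotated.coding_list)
    return {cat: int(cat in relevant_cats) for cat in cats_list}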
def persist_gold_data(
    config: Type[ConfigRoot],
    gold_data_container: GoldDataContainer,
):

    log_manager.info_global(
        "--------------------------------"
        "\nPersisting transformed data into json structured for training\n")

    if config.should_do_dummy_run:
        config.gold_data_json_path = config.gold_data_json_path.replace(
            ".json", "__dummy.json")
        gold_data_container.gold_data_item_list = \
            gold_data_container.gold_data_item_list[:40]

    gold_data_manager.persist_to_json(config.gold_data_json_path,
                                      gold_data_container)
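# A minimal usage sketch, assuming a config class in the style used elsewhere
# in this repo (the class name and path are illustrative). With
# should_do_dummy_run set, only the first 40 items are written and the output
# file gets a "__dummy.json" suffix, so a test run never overwrites real gold
# data.
class ConfigPersistExample(ConfigRoot):
    gold_data_json_path = "../data/gold_data/example.json"
    should_do_dummy_run = True

# persist_gold_data(ConfigPersistExample, gdc)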
def train(self, train_data: GoldDataContainer, eval_data: GoldDataContainer,
          iteration_limit: int):

    start = datetime.now()

    self.log_trainer("--------------------------------"
                     "\nSTART TRAINING\n")
    self.log_trainer(f"model_path: {self.model_path}")
    self.log_trainer(f"train_data_json_path: {self.train_data_json_path}")
    self.log_trainer(f"should_create_model: {self.should_create_model}")
    self.log_trainer(f"should_load_model: {self.should_load_model}")
    self.log_trainer(f"should_persist_model: {self.should_persist_model}")
    self.log_trainer(f"cats: {self.cats}")
    self.log_trainer(f"spacy.prefer_gpu(): {spacy.prefer_gpu()}")
    self.log_trainer(f"iteration_limit: {iteration_limit}")
    self.log_trainer(
        f"len(train_data): {len(train_data.gold_data_item_list)}")
    self.log_trainer(
        f"len(eval_data): {len(eval_data.gold_data_item_list)}")

    # TODO : add hashing of assigned cats
    # TODO : Write cats_list to log too
    hash_texts_train_data = self.get_hash_of_texts(
        [gdi.text for gdi in train_data.gold_data_item_list])
    self.log_trainer(
        f"hash of texts in train_data: {hash_texts_train_data}")
    hash_texts_eval_data = self.get_hash_of_texts(
        [gdi.text for gdi in eval_data.gold_data_item_list])
    self.log_trainer(f"hash of texts in eval_data: {hash_texts_eval_data}")

    textcat = self.get_textcat_pipeline()
    self.log_trainer(
        f"textcat.cfg.get('exclusive_classes', None): {textcat.cfg.get('exclusive_classes', None)}"
    )

    dropout = 0.2
    self.log_trainer(f"dropout: {dropout}")

    other_pipes = [
        pipe for pipe in self.nlp.pipe_names
        if pipe not in ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    ]

    with self.nlp.disable_pipes(*other_pipes):

        optimizer = self.nlp.begin_training()

        for iteration in range(1, iteration_limit + 1):

            losses = {}
            start_iteration = datetime.now()
            self.log_trainer(f"Start iteration: {iteration}")

            for text, annotations in train_data.get_in_spacy_format():
                self.nlp.update([text], [annotations],
                                sgd=optimizer,
                                drop=dropout,
                                losses=losses)

            end_iteration = datetime.now()
            self.log_trainer(f"End iteration: {iteration}")
            self.log_trainer(
                f"duration iteration: {end_iteration - start_iteration}")
            self.log_trainer(f"losses: {losses['textcat']}")

            if len(eval_data.gold_data_item_list) > 0:
                scores, _ = self.evaluate(eval_data)
                self.log_trainer(
                    f"overall score: {scores['textcat_score']}")
                # https://github.com/explosion/spaCy/blob/26a90f011b8c21dfc06940579479aaff8006ff74/spacy/scorer.py#L164
                for cat in scores['textcats_per_cat']:
                    self.log_trainer(
                        f"scores for '{cat}': {scores['textcats_per_cat'][cat]}"
                    )

            if self.should_persist_model:
                self.persist_model()

    end = datetime.now()
    self.log_trainer("END TRAINING")
    self.log_trainer(f"DURATION TRAINING: {end - start}")
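# The inner loop above updates on one example at a time. spaCy 2.x examples
# commonly batch updates with minibatch and a compounding batch size; below is
# a sketch of that variant (not the method used here), assuming the same
# (text, annotations) tuples from get_in_spacy_format(). The function name and
# parameters are illustrative.
from spacy.util import minibatch, compounding

def train_one_iteration_batched(nlp, train_data, optimizer, dropout, losses):
    # compounding(4.0, 32.0, 1.001) grows the batch size from 4 towards 32
    batches = minibatch(train_data.get_in_spacy_format(),
                        size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        # nlp.update accepts parallel lists of texts and gold annotations
        nlp.update(list(texts), list(annotations),
                   sgd=optimizer, drop=dropout, losses=losses)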
def run():

    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # Only look at those examples that mo9 predicts as either AF=SM or AF=SC.
    modelAF = main.init_trainer(ConfigLoadAFModel)

    gdis_to_keep = []
    for gdi in eval_data_container.gold_data_item_list:
        doc = modelAF.nlp(gdi.text)
        for cat in ['AF: Social Companions', 'AF: Soziale Medien']:
            if doc.cats[cat] > 0.5:
                gdis_to_keep.append(gdi)
                break

    eval_data_container2 = GoldDataContainer()
    eval_data_container2.cats_list = eval_data_container.cats_list
    eval_data_container2.gold_data_item_list = gdis_to_keep

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # Only look at those examples that were annotated as AF=SM or AF=SC.
    # We need to reload the data to undo the transformation that removes AF.
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    gdis_to_keep = []
    for gdi in eval_data_container.gold_data_item_list:
        for cat in ['AF: Social Companions', 'AF: Soziale Medien']:
            if gdi.cats[cat] == 1:
                gdis_to_keep.append(gdi)
                break

    eval_data_container3 = GoldDataContainer()
    eval_data_container3.cats_list = eval_data_container.cats_list
    eval_data_container3.gold_data_item_list = gdis_to_keep

    # Now apply the transformation that removes all categories except VR.
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    embed()
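# The prediction filter above calls modelAF.nlp on one text at a time. A
# sketch of the same filter using batched prediction via nlp.pipe, which is
# usually faster on long item lists; the function name and parameters are
# illustrative.
def filter_by_predicted_cats(nlp, gold_data_item_list, cat_names, threshold=0.5):
    texts = [gdi.text for gdi in gold_data_item_list]
    kept = []
    # nlp.pipe streams docs in batches instead of one call per text
    for gdi, doc in zip(gold_data_item_list, nlp.pipe(texts)):
        if any(doc.cats[cat] > threshold for cat in cat_names):
            kept.append(gdi)
    return kept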
def run_2():

    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted
        # as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)

    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    gdc_new = gdc

    pair_differences = []
    for i, gdi_o in enumerate(gdc_old.gold_data_item_list):
        found = False
        for gdi_n in gdc_new.gold_data_item_list:
            if gdi_o.article_id == gdi_n.article_id:
                if gdi_o.cats != gdi_n.cats:
                    texts_equal = gdi_o.text == gdi_n.text
                    pair_differences.append({"gdi_o": gdi_o, "gdi_n": gdi_n})
                else:
                    print(i)
                found = True
                break
        if not found:
            print(i)

    gdc_d = GoldDataContainer(cats_list=gdc.cats_list)
    for p in pair_differences:
        gdc_d.gold_data_item_list.append(p["gdi_o"])
        gdc_d.gold_data_item_list.append(p["gdi_n"])

    ConfigRoot.gold_data_json_path = "../data/gold_data/differences.json"
    main.persist_gold_data(ConfigRoot, gdc_d)

    embed()
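# The pairwise comparison above rescans gdc_new for every old item, which is
# O(n*m). A sketch of the same diff with a dict keyed by article_id, assuming
# article_id values are unique within each container; the function name is
# illustrative.
def diff_by_article_id(gdc_old, gdc_new):
    new_by_id = {gdi.article_id: gdi for gdi in gdc_new.gold_data_item_list}
    pair_differences = []
    for gdi_o in gdc_old.gold_data_item_list:
        gdi_n = new_by_id.get(gdi_o.article_id)
        # only pairs that exist in both containers but disagree on cats
        if gdi_n is not None and gdi_o.cats != gdi_n.cats:
            pair_differences.append({"gdi_o": gdi_o, "gdi_n": gdi_n})
    return pair_differences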
def run():

    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted
        # as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    for gdi in gdc_1.gold_data_item_list:
        gdi.source = "g1"  # TODO: For this to work, GoldDataItem and gold_data_manager need to be adapted.

    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    for gdi in gdc_2.gold_data_item_list:
        gdi.source = "g4"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    for gdi in gdc_3.gold_data_item_list:
        gdi.source = "g5"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    for gdi in gdc_4.gold_data_item_list:
        gdi.source = "g6"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    for gdi in gdc_5.gold_data_item_list:
        gdi.source = "g7"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    for gdi in gdc_6.gold_data_item_list:
        gdi.source = "g8"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    get_redundancies_by_id(gdc)
def transform_to_gold_data(prodigy_data, db_config, ske_config) -> GoldDataContainer:

    cats_list = [cats_dict["text"] for cats_dict in prodigy_data[0]["options"]]
    gold_data_item_list = []

    # We open this here in case we need it to convert URLs to doc.ids (p2, p3, p4).
    db_connection, db_cursor = open_db_connection(db_config, None, None)

    for row in prodigy_data:

        if row['answer'] != 'accept':
            continue

        answers = row['accept']
        options = row['options']

        cats_assigned = {}
        for cat in cats_list:
            idx = [opt['id'] for opt in options if opt['text'] == cat]
            if len(idx) == 0 or idx[0] not in answers:
                cats_assigned[cat] = 0
            else:
                cats_assigned[cat] = 1

        article_id = None
        # p1: doc.id = row['label'] as well as row['meta']['article_id']
        # p2, p3, p4: row['meta']['url'] -> transform to doc.id via SKE or DB
        # p5: doc.id = row['meta']['docid']
        if 'article_id' in row['meta']:
            # p1
            article_id = row['meta']['article_id']
        elif 'docid' in row['meta']:
            # p5
            article_id = row['meta']['docid']
        elif 'url' in row['meta']:
            # p2, p3, p4
            # First we check whether there is an ID translation in the DB.
            db_cursor.execute(
                sql.SQL("""
                    SELECT {col_docid}
                    FROM {tbl_ids}
                    WHERE {col_url} = %(url)s
                """).format(
                    col_docid=sql.Identifier('docid'),
                    tbl_ids=sql.Identifier('ske_docid_pos'),
                    col_url=sql.Identifier('url_index1')
                ),
                {'url': row['meta']['url']}
            )
            result = db_cursor.fetchone()
            if result:
                article_id = result['docid']
            else:
                # If that fails, we query the SKE.
                pos = ske_manager.get_pos_from_url(row['meta']['url'])
                article_id = ske_manager.get_docid_from_pos(ske_config, pos)
                # TODO: Ideally we would then insert this new ID translation into the DB.
        else:
            raise Exception("Couldn't locate the annotation's text ID.")

        # TODO : Maybe add a text clean-up here to remove the abundant whitespace?
        # Does it make a difference for spacy however?
        gold_data_item_list.append(
            GoldDataItem(
                article_id=article_id,
                text=row["text"] if 'text' in row else row['html'],
                cats=cats_assigned
            )
        )

    close_db_connection(db_connection, db_cursor)

    log_manager.info_global(f"Keeping {len(gold_data_item_list)} data items.")

    return GoldDataContainer(cats_list=cats_list, gold_data_item_list=gold_data_item_list)
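# The per-category scan of row['options'] above rescans the options list for
# every category. A sketch of the same 0/1 assignment using a single
# text -> id lookup table per row, assuming the same Prodigy row structure;
# the function name is illustrative.
def get_cats_assigned_from_row(row, cats_list):
    option_id_by_text = {opt['text']: opt['id'] for opt in row['options']}
    answers = set(row['accept'])
    # a category counts as assigned only if its option id was accepted;
    # .get() returns None for unknown categories, which is never in answers
    return {
        cat: int(option_id_by_text.get(cat) in answers)
        for cat in cats_list
    }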
def save_gold_data_into_container(
    gold_data_container: GoldDataContainer,
    article_annotated_list: List[ArticleAnnotated],
):

    cats_list = gold_data_container.cats_list
    gold_data_container.gold_data_item_list = []

    log_manager.info_global(
        "Starting to transform articles with annotations to sentences with annotations.\n"
        "This will take a while.")

    len_article_annotated_list = len(article_annotated_list)

    for i, article_annotated in enumerate(article_annotated_list, start=1):

        if i % 100 == 0 or i == len_article_annotated_list:
            log_manager.info_global(
                f"at article number: {i}, out of {len_article_annotated_list}"
            )

        sentence_cats_dict = OrderedDict()
        sentence_article_list = sentence_split_func(
            article_annotated.article_file_content_cleaned)
        for sentence in sentence_article_list:
            sentence_cats_dict[str(sentence)] = {cat: 0 for cat in cats_list}

        for coding in article_annotated.coding_list:
            segment = coding["Segment"]
            segment = re.sub("<.*?>", "", segment)
            sentence_segment_list = sentence_split_func(segment)
            for sentence_segment in sentence_segment_list:
                for sentence_article in sentence_cats_dict.keys():
                    if str(sentence_segment) in sentence_article:
                        cat = coding["coding_node"].coding_value
                        cat_used_dict = sentence_cats_dict[sentence_article]
                        if cat in cat_used_dict:
                            cat_used_dict[cat] = 1
                        break

        for sentence, cats in sentence_cats_dict.items():
            gold_data_container.gold_data_item_list.append(
                GoldDataItem(article_id=article_annotated.article_id,
                             text=sentence,
                             cats=cats))

    log_manager.info_global(
        f"Transformed {len(article_annotated_list)} articles into {len(gold_data_container.gold_data_item_list)} sentences."
    )

    return gold_data_container
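# A toy illustration of the matching above, self-contained and independent of
# sentence_split_func: each sentence of a coded segment is searched as a
# substring of the article's sentences, and the first hit gets the segment's
# category set to 1. All names and strings here are illustrative.
def toy_sentence_matching():
    article_sentences = ["Robots are everywhere.", "Social media is growing."]
    cats = {s: {"AF: Soziale Medien": 0} for s in article_sentences}

    segment_sentence = "Social media is growing."
    for s in cats:
        if segment_sentence in s:
            cats[s]["AF: Soziale Medien"] = 1
            break  # only the first matching article sentence is tagged

    return cats  # the second sentence now carries the category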