def load_nes(datasets: list) -> dict:
    documents = {}
    doc_alphabet = {}
    for dataset in datasets:
        dataset_name = dataset.split('/')[-1]
        if dataset_name not in ['covid-19', 'us_election_2020']:
            print(f"Skipping {dataset_name}")
            continue
        documents[dataset_name] = {}
        doc_alphabet[dataset_name] = defaultdict(dict)
        langs, _ = list_dir(f'{dataset}/predicted')
        for lang in langs:
            if lang.lower() not in RELEVANT_LANGS:
                logger.info(f"Skipping {dataset_name}/{lang}")
                continue
            documents[dataset_name][lang] = {}
            logger.info(f'Extracting from: {dataset}/{lang}')
            ne_path = f'{dataset}/predicted/{lang}'
            _, files = list_dir(ne_path)
            for file in files:
                df = pd.read_csv(
                    f'{ne_path}/{file}',
                    dtype={
                        'docId': str, 'sentenceId': str, 'tokenId': str,
                        'clID': str, 'text': str, 'lemma': str, 'calcLemma': str,
                        'upos': str, 'xpos': str, 'ner': str
                    }
                )
                df['lang'] = lang
                df = df.fillna('N/A')
                # keep only named-entity tokens and merge multi-token entities
                records = merge_nes(df.loc[~(df[NER_FIELD] == 'O')].to_dict(orient='records'))
                for item in records:
                    dkey = f"{lang};{item['docId']};{item['sentenceId']};{item['tokenId']};{item['text']}"
                    fchar = item['text'][0].upper()
                    if dkey in doc_alphabet[dataset_name][fchar]:
                        raise Exception(f"[doc_alphabet] COLLISION!!! {dkey}")
                    doc_alphabet[dataset_name][fchar][dkey] = item
                    if dkey in documents[dataset_name][lang]:
                        raise Exception(f"[documents] COLLISION!!! {dkey}")
                    documents[dataset_name][lang][dkey] = item
    return {
        "normal": documents,
        "alphabetized": doc_alphabet,
    }
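# Illustration only (toy values, not repo data): the composite key scheme shared by
# `documents` and `doc_alphabet` above. Each named-entity occurrence is addressed by
# language, document, sentence, token and surface form, which is what the collision
# checks rely on; `doc_alphabet` buckets the same record by its first letter.
example_item = {'docId': '42', 'sentenceId': '3', 'tokenId': '7', 'text': 'Ljubljana'}
example_key = (f"sl;{example_item['docId']};{example_item['sentenceId']};"
               f"{example_item['tokenId']};{example_item['text']}")
# example_key == 'sl;42;3;7;Ljubljana', filed under doc_alphabet['<dataset>']['L']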
def main():
    args = parse_args()
    run_path = args.run_path if args.run_path is not None else "./data/models/"
    lang = args.lang
    year = args.year
    merge_misc = args.merge_misc
    print(f"Run path: {run_path}")
    print(f"Langs: {lang}")
    print(f"Year: {year}")
    print(f"Merge misc: {merge_misc}")
    models, _ = list_dir(f'{run_path}/models')
    logger.info(f"Models to predict: {json.dumps(models, indent=4)}")
    predictions = []
    doc_scores = {}
    for model in tqdm.tqdm(models, desc="Model"):
        logger.info(f"Model: {model}")
        preds, scores = looper(run_path, lang, model, year, merge_misc)
        predictions.append(preds)
        doc_scores[model] = scores
    with open(f'{run_path}/all_predictions.json', 'w') as f:
        json.dump(predictions, f)
    with open(f'{run_path}/all_scores.json', 'w') as f:
        json.dump(doc_scores, f)
    logger.info("Done.")
def main():
    global pred_path, cluster_path
    args = parse_args()
    pred_path = args.pred_path if args.pred_path is not None else pred_path
    cluster_path = args.cluster_path if args.cluster_path is not None else cluster_path
    year = args.year
    lang = args.lang
    logger.info(f"Predictions path: {pred_path}")
    logger.info(f"Clusters path: {cluster_path}")
    logger.info(f"Year: {year}")
    logger.info(f"Language: {lang}")
    path = pathlib.Path(pred_path)
    if not path.exists() or not path.is_dir():
        raise Exception(f"Path does not exist or is not a directory: `{pred_path}`")
    path = pathlib.Path(cluster_path)
    if not path.exists() or not path.is_dir():
        raise Exception(f"Path does not exist or is not a directory: `{cluster_path}`")
    logger.info("Loading the clusters...")
    clusters, ne_map = load_clusters(cluster_path)
    models, _ = list_dir(f'{pred_path}/predictions/bsnlp')
    for model in models:
        logger.info(f"Loading the documents for model `{model}`...")
        data = LoadBSNLPDocuments(
            year='test_2021',
            lang=lang,
            path=f'{pred_path}/predictions/bsnlp/{model}'
        ).load_predicted()
        logger.info(f"[{model}] Merging the cluster data into the prediction data")
        updated = update_clusters(data, ne_map)
        logger.info(f"[{model}] Persisting the changes...")
        UpdateBSNLPDocuments(
            year='test_2021',
            lang=lang,
            path=f'{pred_path}/predictions/bsnlp/{model}'
        ).update_clustered(updated)
    logger.info("Done.")
def convert_files(
    run_path: str,
    lang: str = 'sl',
    year: str = '2021',
) -> None:
    dirs, _ = list_dir(f'{run_path}/predictions/bsnlp')
    for model_dir in dirs:
        print(f"Working on {model_dir}")
        loader = LoadBSNLPDocuments(year=year, lang=lang, path=f'{run_path}/predictions/bsnlp/{model_dir}')
        updater = UpdateBSNLPDocuments(year=year, lang=lang, path=f'{run_path}/out/{model_dir}')
        data = loader.load_predicted(folder='clustered')
        updater.update_predicted(data)
def load_data(clear_cache: bool = False) -> dict:
    cache_path = f'{RUN_BASE_FNAME}/cached_data.json'
    cached_file = pathlib.Path(cache_path)
    if not clear_cache and cached_file.exists() and cached_file.is_file():
        mod_time = datetime.fromtimestamp(cached_file.stat().st_mtime)
        logger.info(f"Using cached data from `{cache_path}`, last modified at: `{mod_time.isoformat()}`")
        with open(cache_path) as f:
            return json.load(f)
    datasets, _ = list_dir(DATA_PATH)
    datasets = [f'{DATA_PATH}/{dataset}' for dataset in datasets]
    data = load_nes(datasets)
    with open(cache_path, 'w') as f:
        logger.info(f"Storing cached data at: {cache_path}")
        json.dump(data, f)
    return data
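# Hedged sketch (hypothetical helper, not part of the repo): load_data() reuses
# `cached_data.json` whenever it exists, so a stale cache has to be removed or bypassed
# explicitly (clear_cache=True). Assumes the module-level RUN_BASE_FNAME used above.
def drop_stale_cache(max_age_days: int = 1) -> None:
    from datetime import timedelta  # local import; only needed for the age check
    cached_file = pathlib.Path(f'{RUN_BASE_FNAME}/cached_data.json')
    if cached_file.exists() and cached_file.is_file():
        mod_time = datetime.fromtimestamp(cached_file.stat().st_mtime)
        if datetime.now() - mod_time > timedelta(days=max_age_days):
            cached_file.unlink()  # the next load_data() call re-runs load_nes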
def test(self, test_data: pd.DataFrame) -> (float, float, float):
    if not (os.path.exists(self.output_model_path) and os.path.isdir(self.output_model_path)):
        raise Exception(
            f"A model with the given parameters has not been trained yet,"
            f" or is not located at `{self.output_model_path}`."
        )
    models, _ = list_dir(self.output_model_path)
    models = [
        model_fname for model_fname in models
        if model_fname.startswith(self.output_model_fname)
    ]
    print("Models:", models)
    if not models:
        raise Exception(
            f"There are no trained models with the given criteria: `{self.output_model_fname}`"
        )
    logger.info("Loading the testing data...")
    test_data = self.convert_input(test_data)
    avg_acc, avg_f1, avg_p, avg_r = [], [], [], []
    for model_fname in models:
        logger.info(f"Loading {model_fname}...")
        model = AutoModelForTokenClassification.from_pretrained(
            f"{self.output_model_path}/{model_fname}",
            num_labels=len(self.tag2code),
            label2id=self.tag2code,
            id2label=self.code2tag,
            output_attentions=False,
            output_hidden_states=False
        )
        model = model.to(self.device)
        _, acc, f1, p, r, report = self.__test(model, test_data)
        avg_acc.append(acc)
        avg_f1.append(f1)
        avg_p.append(p)
        avg_r.append(r)
        logger.info(f"Testing P: {p:.4f}, R: {r:.4f}, F1: {f1:.4f}")
        logger.info(f"Testing classification report:\n{report}")
    logger.info(f"Average accuracy: {np.mean(avg_acc):.4f}")
    f1 = np.mean(avg_f1)
    p = np.mean(avg_p)
    r = np.mean(avg_r)
    logger.info(f"Average P: {p:.4f}, R: {r:.4f}, F1: {f1:.4f}")
    return p, r, f1
def load(self, dset: str) -> pd.DataFrame:
    dirs, _ = list_dir(self.base_fname)
    data = pd.DataFrame()
    for dataset in dirs:
        if dataset not in self.data_set:
            continue
        for lang in self.langs:
            fname = f"{self.base_fname}/{dataset}/splits/{lang}/{dset}_{lang}.csv"
            try:
                df = pd.read_csv(fname)
            except Exception:
                if self.print_debug:
                    print(f"[{dataset}] skipping {lang}.")
                continue
            # make sentence ids unique across documents
            df['sentenceId'] = df['docId'].astype(str) + ';' + df['sentenceId'].astype(str)
            if self.merge_misc:
                df['ner'] = df['ner'].map(lambda x: x.replace("PRO", "MISC").replace("EVT", "MISC"))
            if self.misc_data_only:
                df['ner'] = df['ner'].map(lambda x: "O" if x[2:] in ["PER", "LOC", "ORG"] else x)
            data = pd.concat([data, df])
    return data
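# Illustration only (toy labels, not repo data): the two optional relabelings that
# load() applies to the `ner` column.
example_ner = pd.Series(['B-PRO', 'I-EVT', 'B-PER', 'O'])
# merge_misc: collapse PRO/EVT into MISC -> ['B-MISC', 'I-MISC', 'B-PER', 'O']
merged_example = example_ner.map(lambda x: x.replace("PRO", "MISC").replace("EVT", "MISC"))
# misc_data_only: blank out PER/LOC/ORG -> ['B-PRO', 'I-EVT', 'O', 'O']
misc_only_example = example_ner.map(lambda x: "O" if x[2:] in ["PER", "LOC", "ORG"] else x)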
def create_split(
    dataset_dir: str,
    lang: str,
    docs: list,
    split_path: str,
) -> None:
    path = f"{dataset_dir}/merged/{lang}"
    out_path = f"{dataset_dir}/splits/{lang}/"
    dataset_name = dataset_dir.split('/')[-1]
    print(path)
    _, files = list_dir(path)
    joined = join_files(files, docs)
    train_docs, test_docs = train_test_split(
        joined,
        train_size=TRAIN_SIZE,
        random_state=random_state,
    )
    # carve the validation set out of the training documents so the three splits stay disjoint
    train_docs, val_docs = train_test_split(
        train_docs,
        test_size=TRAIN_SIZE * 0.1,
        random_state=random_state,
    )
    train_data = join_docs(path, train_docs)
    val_data = join_docs(path, val_docs)
    test_data = join_docs(path, test_docs)
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    print(f"Saving to: {out_path}")
    train_data.to_csv(f'{out_path}/train_{lang}.csv', index=False)
    val_data.to_csv(f'{out_path}/dev_{lang}.csv', index=False)
    test_data.to_csv(f'{out_path}/test_{lang}.csv', index=False)
    copy_annotations(train_docs, f'{split_path}/train/{dataset_name}/{lang}')
    copy_annotations(val_docs, f'{split_path}/dev/{dataset_name}/{lang}')
    copy_annotations(test_docs, f'{split_path}/test/{dataset_name}/{lang}')
def looper(
    run_path: str,
    clang: str,
    model: str,
    year: str,
    categorize_misc: bool = False,
) -> (dict, list):
    loader = LoadBSNLPDocuments(lang=clang, year=year)
    model_name = model.split('/')[-1]
    logger.info(f"Predicting for {model_name}")
    model_path = f'{run_path}/models/{model}'
    tag2code, code2tag = get_label_dicts(model_path)
    misctag2code, misccode2tag = {}, {}
    logger.info(f"tag2code: {tag2code}")
    logger.info(f"code2tag: {code2tag}")
    misc_model, _ = list_dir(f'{run_path}/misc_models')
    if categorize_misc:
        logger.info(f"Using misc model: {misc_model[0]}")
        misctag2code, misccode2tag = get_label_dicts(f'{run_path}/misc_models/{misc_model[0]}')
        logger.info(f"misctag2code: {misctag2code}")
        logger.info(f"misccode2tag: {misccode2tag}")
    predictor = ExtractPredictions(model_path=model_path, tag2code=tag2code, code2tag=code2tag)
    pred_misc = None if not categorize_misc else ExtractPredictions(
        model_path=f'{run_path}/misc_models/{misc_model[0]}',
        tag2code=misctag2code,
        code2tag=misccode2tag
    )
    updater = UpdateBSNLPDocuments(
        lang=clang,
        year=year,
        path=f'{run_path}/predictions/bsnlp/{model_name}'
    )
    predictions = {}
    data = loader.load_merged()
    tdset = tqdm.tqdm(data.items(), desc="Dataset")
    scores = []
    for dataset, langs in tdset:
        tdset.set_description(f'Dataset: {dataset}')
        tlang = tqdm.tqdm(langs.items(), desc="Language")
        predictions[dataset] = {}
        for lang, docs in tlang:
            predictions[dataset][lang] = {}
            tlang.set_description(f'Lang: {lang}')
            for docId, doc in tqdm.tqdm(docs.items(), desc="Docs"):
                to_pred = pd.DataFrame(doc['content'])
                if categorize_misc:
                    # collapse PRO and EVT into MISC, as the main model only knows MISC
                    to_pred.loc[to_pred['ner'].isin(['B-PRO', 'B-EVT']), 'ner'] = 'B-MISC'
                    to_pred.loc[to_pred['ner'].isin(['I-PRO', 'I-EVT']), 'ner'] = 'I-MISC'
                doc_scores, pred_data = predictor.predict(to_pred, tag2code, code2tag)
                doc_scores['id'] = f'{lang};{docId}'
                scores.append(doc_scores)
                if categorize_misc and len(pred_data.loc[pred_data['calcNER'].isin(['B-MISC', 'I-MISC'])]) > 0:
                    misc_data = pd.DataFrame(doc['content'])
                    if len(misc_data.loc[~(misc_data['ner'].isin(['B-MISC', 'I-MISC']))]) > 0:
                        # randomly choose a category for the (B|I)-MISC tags
                        cat = random.choice(['PRO', 'EVT'])
                        misc_data.loc[(misc_data['ner'] == 'B-MISC'), 'ner'] = f'B-{cat}'
                        misc_data.loc[(misc_data['ner'] == 'I-MISC'), 'ner'] = f'I-{cat}'
                        misc_data.loc[~(misc_data['ner'].isin(['B-PRO', 'B-EVT', 'I-PRO', 'I-EVT'])), 'ner'] = 'O'
                    _, misc_pred = pred_misc.predict(misc_data, misctag2code, misccode2tag)
                    # update wherever there is MISC in the original prediction
                    misc_mask = pred_data['calcNER'].isin(['B-MISC', 'I-MISC'])
                    pred_data.loc[misc_mask, 'calcNER'] = misc_pred.loc[misc_mask, 'calcNER']
                    # update wherever the misc predictor made a prediction
                    new_mask = misc_pred['calcNER'].isin(['B-PRO', 'B-EVT', 'I-PRO', 'I-EVT'])
                    pred_data.loc[new_mask, 'calcNER'] = misc_pred.loc[new_mask, 'calcNER']
                doc['content'] = pred_data.to_dict(orient='records')
                miscs = [
                    r['calcNER'] for r in doc['content']
                    if r['calcNER'] in ['B-MISC', 'I-MISC']
                ]
                if len(miscs) > 0:
                    raise Exception(f"STILL MORE MISCS??? {docId}, {miscs}")
                predictions[dataset][lang][docId] = pred_data.loc[~(pred_data['calcNER'] == 'O')].to_dict(orient='records')
    updater.update_merged(data)
    logger.info(f"Done predicting for {model_name}")
    return {
        'model': model_name,
        'preds': predictions,
    }, scores
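# Illustration only (toy frame, not repo data): the PRO/EVT -> MISC collapse applied in
# looper() before the main model predicts, mirroring the `to_pred.loc[...]` lines above.
toy = pd.DataFrame({'text': ['iPhone', 'Olympics', 'Ljubljana'],
                    'ner': ['B-PRO', 'B-EVT', 'B-LOC']})
toy.loc[toy['ner'].isin(['B-PRO', 'B-EVT']), 'ner'] = 'B-MISC'
toy.loc[toy['ner'].isin(['I-PRO', 'I-EVT']), 'ner'] = 'I-MISC'
# toy['ner'] is now ['B-MISC', 'B-MISC', 'B-LOC']; the misc model later re-assigns
# predicted MISC spans back to PRO or EVT.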