Example #1
def load_nes(datasets: list) -> dict:
    """Extract named entities from each dataset, indexed both per language and by first letter."""
    documents = {}
    doc_alphabet = {}
    for dataset in datasets:
        dataset_name = dataset.split('/')[-1]
        if dataset_name not in ['covid-19', 'us_election_2020']:
            print(f"Skipping {dataset_name}")
            continue
        documents[dataset_name] = {}
        doc_alphabet[dataset_name] = defaultdict(dict)
        langs, _ = list_dir(f'{dataset}/predicted')
        for lang in langs:
            if lang.lower() not in RELEVANT_LANGS:
                logger.info(f"Skipping {dataset_name}/{lang}")
                continue
            documents[dataset_name][lang] = {}
            logger.info(f'Extracting from: {dataset}/{lang}')
            ne_path = f'{dataset}/predicted/{lang}'
            _, files = list_dir(ne_path)
            for file in files:
                df = pd.read_csv(f'{ne_path}/{file}',
                                 dtype={
                                     'docId': str,
                                     'sentenceId': str,
                                     'tokenId': str,
                                     'clID': str,
                                     'text': str,
                                     'lemma': str,
                                     'calcLemma': str,
                                     'upos': str,
                                     'xpos': str,
                                     'ner': str
                                 })
                df['lang'] = lang
                df = df.fillna('N/A')
                records = merge_nes(
                    df.loc[~(df[NER_FIELD] == 'O')].to_dict(orient='records'))
                for item in records:
                    dkey = f"{lang};{item['docId']};{item['sentenceId']};{item['tokenId']};{item['text']}"
                    fchar = item['text'][0].upper()
                    if dkey in doc_alphabet[dataset_name][fchar]:
                        raise Exception(f"[doc_alphabet] COLLISION!!! {dkey}")
                    doc_alphabet[dataset_name][fchar][dkey] = item
                    if dkey in documents[dataset_name][lang]:
                        raise Exception(f"[documents] COLLISION!!! {dkey}")
                    documents[dataset_name][lang][dkey] = item
    return {
        "normal": documents,
        "alphabetized": doc_alphabet,
    }
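
For context, a minimal usage sketch of load_nes (the dataset paths are hypothetical; `list_dir`, `merge_nes`, `RELEVANT_LANGS`, and `NER_FIELD` are assumed to come from the surrounding module):

data = load_nes(['./data/datasets/covid-19', './data/datasets/us_election_2020'])  # hypothetical paths
sl_entities = data['normal']['covid-19'].get('sl', {})      # entities keyed by language
a_entities = data['alphabetized']['covid-19'].get('A', {})  # same entities bucketed by first letter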
Example #2
def main():
    args = parse_args()
    run_path = args.run_path if args.run_path is not None else "./data/models/"
    lang = args.lang
    year = args.year
    merge_misc = args.merge_misc

    print(f"Run path: {run_path}")
    print(f"Langs: {lang}")
    print(f"Year: {year}")
    print(f"Merge misc: {merge_misc}")

    models, _ = list_dir(f'{run_path}/models')
    logger.info(f"Models to predict: {json.dumps(models, indent=4)}")

    predictions = []
    doc_scores = {}
    for model in tqdm.tqdm(models, desc="Model"):
        logger.info(f"Model: {model}")
        preds, scores = looper(run_path, lang, model, year, merge_misc)
        predictions.append(preds)
        doc_scores[model] = scores

    with open(f'{run_path}/all_predictions.json', 'w') as f:
        json.dump(predictions, f)
    with open(f'{run_path}/all_scores.json', 'w') as f:
        json.dump(doc_scores, f)
    logger.info("Done.")
Example #3
def main():
    global pred_path, cluster_path
    args = parse_args()
    pred_path = args.pred_path if args.pred_path is not None else pred_path
    cluster_path = args.cluster_path if args.cluster_path is not None else cluster_path
    year = args.year
    lang = args.lang

    logger.info(f"Predictions path: {pred_path}")
    logger.info(f"Clusters path: {pred_path}")
    logger.info(f"Year: {year}")
    logger.info(f"Language: {lang}")

    path = pathlib.Path(pred_path)
    if not path.exists() or not path.is_dir():
        raise Exception(f"Path does not exist or is not a directory: `{pred_path}`")
    path = pathlib.Path(cluster_path)
    if not path.exists() or not path.is_dir():
        raise Exception(f"Path does not exist or is not a directory: `{cluster_path}`")

    logger.info("Loading the clusters...")
    clusters, ne_map = load_clusters(cluster_path)

    models, _ = list_dir(f'{pred_path}/predictions/bsnlp')
    for model in models:
        logger.info(f"Loading the documents for model `{model}`...")
        data = LoadBSNLPDocuments(year='test_2021', lang=lang, path=f'{pred_path}/predictions/bsnlp/{model}').load_predicted()

        logger.info(f"[{model}] Merging the cluster data into the prediction data")
        updated = update_clusters(data, ne_map)

        logger.info(f"[{model}] Persisting the changes...")
        UpdateBSNLPDocuments(year='test_2021', lang=lang, path=f'{pred_path}/predictions/bsnlp/{model}').update_clustered(updated)

    logger.info("Done.")
Example #4
def convert_files(
    run_path: str,
    lang: str = 'sl',
    year: str = '2021',
) -> None:
    dirs, _ = list_dir(f'{run_path}/predictions/bsnlp')
    for dir_name in dirs:  # avoid shadowing the `dir` builtin
        print(f"Working on {dir_name}")
        loader = LoadBSNLPDocuments(year=year,
                                    lang=lang,
                                    path=f'{run_path}/predictions/bsnlp/{dir_name}')
        updater = UpdateBSNLPDocuments(year=year,
                                       lang=lang,
                                       path=f'{run_path}/out/{dir_name}')
        data = loader.load_predicted(folder='clustered')
        updater.update_predicted(data)
Example #5
def load_data(clear_cache: bool = False) -> dict:
    cache_path = f'{RUN_BASE_FNAME}/cached_data.json'
    cached_file = pathlib.Path(cache_path)
    if not clear_cache and cached_file.exists() and cached_file.is_file():
        mod_time = datetime.fromtimestamp(cached_file.stat().st_mtime)
        logger.info(
            f"Using cached data from `{cache_path}`, last modified at: `{mod_time.isoformat()}`"
        )
        with open(cache_path) as f:
            return json.load(f)
    datasets, _ = list_dir(DATA_PATH)
    datasets = [f'{DATA_PATH}/{dataset}' for dataset in datasets]
    data = load_nes(datasets)
    with open(cache_path, 'w') as f:
        logger.info(f"Storing cached data at: {cache_path}")
        json.dump(data, f)
    return data
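
A short usage note (RUN_BASE_FNAME and DATA_PATH are module-level constants assumed by the function above):

data = load_data()                  # reuses {RUN_BASE_FNAME}/cached_data.json when present
data = load_data(clear_cache=True)  # ignores the cache and re-extracts via load_nes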
Example #6
    # NOTE: requires `from typing import Tuple` at module level
    def test(self, test_data: pd.DataFrame) -> Tuple[float, float, float]:
        if not (os.path.exists(self.output_model_path)
                and os.path.isdir(self.output_model_path)):
            raise Exception(
                f"A model with the given parameters has not been trained yet,"
                f" or is not located at `{self.output_model_path}`.")
        models, _ = list_dir(self.output_model_path)
        models = [
            model_fname for model_fname in models
            if model_fname.startswith(self.output_model_fname)
        ]
        print("Models:", models)
        if not models:
            raise Exception(
                f"There are no trained models with the given criteria: `{self.output_model_fname}`"
            )

        logger.info("Loading the testing data...")
        test_data = self.convert_input(test_data)
        avg_acc, avg_f1, avg_p, avg_r = [], [], [], []
        for model_fname in models:
            logger.info(f"Loading {model_fname}...")
            model = AutoModelForTokenClassification.from_pretrained(
                f"{self.output_model_path}/{model_fname}",
                num_labels=len(self.tag2code),
                label2id=self.tag2code,
                id2label=self.code2tag,
                output_attentions=False,
                output_hidden_states=False)
            model = model.to(self.device)
            _, acc, f1, p, r, report = self.__test(model, test_data)
            avg_acc.append(acc)
            avg_f1.append(f1)
            avg_p.append(p)
            avg_r.append(r)
            logger.info(f"Testing P: {p:.4f}, R: {r:.4f}, F1: {f1:.4f}")
            logger.info(f"Testing classification report:\n{report}")
        logger.info(f"Average accuracy: {np.mean(avg_acc):.4f}")
        f1 = np.mean(avg_f1)
        p = np.mean(avg_p)
        r = np.mean(avg_r)
        logger.info(f"Average P: {p:.4f}, R: {r:.4f}, F1: {f1:.4f}")
        return p, r, f1
Example #7
    def load(self, dset: str) -> pd.DataFrame:
        dirs, _ = list_dir(self.base_fname)
        data = pd.DataFrame()
        for dataset in dirs:
            if dataset not in self.data_set:
                continue
            for lang in self.langs:
                fname = f"{self.base_fname}/{dataset}/splits/{lang}/{dset}_{lang}.csv"
                try:
                    df = pd.read_csv(fname)
                except (FileNotFoundError, pd.errors.EmptyDataError):
                    if self.print_debug:
                        print(f"[{dataset}] skipping {lang}.")
                    continue
                # make sentence ids unique across documents
                df['sentenceId'] = df['docId'].astype(str) + ';' + df['sentenceId'].astype(str)
                if self.merge_misc:
                    df['ner'] = df['ner'].map(lambda x: x.replace("PRO", "MISC").replace("EVT", "MISC"))
                if self.misc_data_only:
                    df['ner'] = df['ner'].map(lambda x: "O" if x[2:] in ["PER", "LOC", "ORG"] else x)
                data = pd.concat([data, df])
        return data
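
This loader reads the per-split CSVs written by create_split in the next example, i.e. files at {base_fname}/{dataset}/splits/{lang}/{dset}_{lang}.csv. Assuming `loader` is an instance of the enclosing class, a hypothetical call per split:

train_df = loader.load('train')
dev_df = loader.load('dev')
test_df = loader.load('test')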
Example #8
def create_split(
    dataset_dir: str,
    lang: str,
    docs: list,
    split_path: str,
) -> None:
    path = f"{dataset_dir}/merged/{lang}"
    out_path = f"{dataset_dir}/splits/{lang}/"
    dataset_name = dataset_dir.split('/')[-1]
    print(path)
    _, files = list_dir(path)
    joined = join_files(files, docs)
    train_docs, test_docs = train_test_split(
        joined,
        train_size=TRAIN_SIZE,
        random_state=random_state,
    )
    # carve the validation set out of the training portion so it cannot overlap the test set
    train_docs, val_docs = train_test_split(
        train_docs,
        test_size=TRAIN_SIZE * 0.1,
        random_state=random_state,
    )
    # print(len(files), len(train_docs), len(val_docs), len(test_docs))
    train_data = join_docs(path, train_docs)
    val_data = join_docs(path, val_docs)
    test_data = join_docs(path, test_docs)

    os.makedirs(out_path, exist_ok=True)
    print(f"Saving to: {out_path}")
    train_data.to_csv(f'{out_path}/train_{lang}.csv', index=False)
    val_data.to_csv(f'{out_path}/dev_{lang}.csv', index=False)
    test_data.to_csv(f'{out_path}/test_{lang}.csv', index=False)

    copy_annotations(train_docs, f'{split_path}/train/{dataset_name}/{lang}')
    copy_annotations(val_docs, f'{split_path}/dev/{dataset_name}/{lang}')
    copy_annotations(test_docs, f'{split_path}/test/{dataset_name}/{lang}')
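
As a worked example of the resulting proportions (TRAIN_SIZE is defined elsewhere in the module; 0.8 is an assumed value): the first split keeps TRAIN_SIZE = 0.8 of the documents for training and 0.2 for testing; the second split then takes test_size = 0.8 * 0.1 = 0.08 of the training portion for validation, i.e. 0.8 * 0.08 = 6.4% of all documents, leaving roughly 73.6% / 6.4% / 20% for train / dev / test.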
Example #9
# NOTE: requires `from typing import Tuple` at module level
def looper(
    run_path: str,
    clang: str,
    model: str,
    year: str,
    categorize_misc: bool = False,
) -> Tuple[dict, list]:
    loader = LoadBSNLPDocuments(lang=clang, year=year)

    model_name = model.split('/')[-1]
    logger.info(f"Predicting for {model_name}")
    model_path = f'{run_path}/models/{model}'

    tag2code, code2tag = get_label_dicts(model_path)
    misctag2code, misccode2tag = {}, {}

    logger.info(f"tag2code: {tag2code}")
    logger.info(f"code2tag: {code2tag}")

    misc_model, _ = list_dir(f'{run_path}/misc_models')
    if categorize_misc:
        logger.info(f"Using misc model: {misc_model[0]}")
        misctag2code, misccode2tag = get_label_dicts(
            f'{run_path}/misc_models/{misc_model[0]}')
        logger.info(f"misctag2code: {misctag2code}")
        logger.info(f"misccode2tag: {misccode2tag}")

    predictor = ExtractPredictions(model_path=model_path,
                                   tag2code=tag2code,
                                   code2tag=code2tag)
    pred_misc = None if not categorize_misc else ExtractPredictions(
        model_path=f'{run_path}/misc_models/{misc_model[0]}',
        tag2code=misctag2code,
        code2tag=misccode2tag)

    updater = UpdateBSNLPDocuments(
        lang=clang,
        year=year,
        path=f'{run_path}/predictions/bsnlp/{model_name}')
    predictions = {}
    data = loader.load_merged()
    tdset = tqdm.tqdm(data.items(), desc="Dataset")
    scores = []
    for dataset, langs in tdset:
        tdset.set_description(f'Dataset: {dataset}')
        tlang = tqdm.tqdm(langs.items(), desc="Language")
        predictions[dataset] = {}
        for lang, docs in tlang:
            predictions[dataset][lang] = {}
            tlang.set_description(f'Lang: {lang}')
            for docId, doc in tqdm.tqdm(docs.items(), desc="Docs"):
                to_pred = pd.DataFrame(doc['content'])
                if categorize_misc:
                    # categorize the PRO and EVT to MISC, as the model only knows about it
                    to_pred.loc[to_pred['ner'].isin(['B-PRO', 'B-EVT']),
                                'ner'] = 'B-MISC'
                    to_pred.loc[to_pred['ner'].isin(['I-PRO', 'I-EVT']),
                                'ner'] = 'I-MISC'
                doc_scores, pred_data = predictor.predict(
                    to_pred, tag2code, code2tag)
                doc_scores['id'] = f'{lang};{docId}'
                scores.append(doc_scores)
                if categorize_misc and len(
                        pred_data.loc[pred_data['calcNER'].isin(
                            ['B-MISC', 'I-MISC'])]) > 0:
                    misc_data = pd.DataFrame(doc['content'])
                    # only needed if the gold data actually contains MISC tags
                    if len(misc_data.loc[misc_data['ner'].isin(
                            ['B-MISC', 'I-MISC'])]) > 0:
                        # randomly map each (B|I)-MISC tag to a concrete category
                        cat = random.choice(['PRO', 'EVT'])
                        misc_data.loc[(misc_data['ner'] == 'B-MISC'),
                                      'ner'] = f'B-{cat}'
                        misc_data.loc[(misc_data['ner'] == 'I-MISC'),
                                      'ner'] = f'I-{cat}'
                    misc_data.loc[~(misc_data['ner'].isin(
                        ['B-PRO', 'B-EVT', 'I-PRO', 'I-EVT'])), 'ner'] = 'O'
                    _, misc_pred = pred_misc.predict(misc_data, misctag2code,
                                                     misccode2tag)
                    # update the entries
                    # update wherever there is misc in the original prediction
                    pred_data.loc[
                        pred_data['calcNER'].isin(['B-MISC', 'I-MISC']),
                        'calcNER'] = misc_pred.loc[
                            pred_data['calcNER'].isin(['B-MISC', 'I-MISC']),
                            'calcNER']
                    # update wherever the new predictor made a prediction
                    pred_data.loc[
                        misc_pred['calcNER'].
                        isin(['B-PRO', 'B-EVT', 'I-PRO', 'I-EVT']),
                        'calcNER'] = misc_pred.loc[misc_pred['calcNER'].isin(
                            ['B-PRO', 'B-EVT', 'I-PRO', 'I-EVT']), 'calcNER']
                doc['content'] = pred_data.to_dict(orient='records')
                miscs = [
                    r['calcNER'] for r in doc['content']
                    if r['calcNER'] in ['B-MISC', 'I-MISC']
                ]
                if len(miscs) > 0:
                    raise Exception(f"STILL MORE MISCS??? {docId}, {miscs}")
                predictions[dataset][lang][docId] = pred_data.loc[~(
                    pred_data['calcNER'] == 'O')].to_dict(orient='records')
    updater.update_merged(data)
    logger.info(f"Done predicting for {model_name}")
    return {
        'model': model_name,
        'preds': predictions,
    }, scores
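
Mirroring the driver in Example #2, a single-model invocation might look like this (the run path and model directory name are hypothetical):

preds, scores = looper('./data/models', 'sl', 'some-model-dir', '2021', categorize_misc=True)
print(preds['model'], len(scores))  # model name and number of per-document score records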