Example #1
def main(args):
    set_logging(args.log_dir)
    logger.setLevel(logging.INFO)
    logger.info(f"Parameters: {args}")

    logger.info('Reading data...')

    data_dir = r'./data/'
    data_name = f'{args.partition}.txt'
    articles = load_conll_2003_data(os.path.join(data_dir, data_name))

    nlp = spacy.load('en_core_web_md')
    docs = []
    for article in tqdm(articles):
        sents = article['sent_list']
        doc = construct_spacy_doc(sents, nlp)
        docs.append(doc)

    united_annotator = CoNLL2003Annotator().add_all()

    logger.info('Labeling articles...')
    docs = list(united_annotator.pipe(docs))

    standardiser = ConLL2003Standardiser()
    docs = [standardiser(doc) for doc in docs]

    label_srcs = sorted(docs[0].spans.keys())

    data_list = list()

    logger.info('Converting labels...')
    for article, doc in tqdm(zip(articles, docs), total=len(articles)):
        sent_list = article['sent_list']
        lbs_list = article['labels_list']
        sent_annos = annotate_doc_with_spacy(sent_list, doc)
        assert len(sent_list) == len(lbs_list) == len(sent_annos)

        for sent, lbs, annos in zip(sent_list, lbs_list, sent_annos):
            data_inst = dict()
            anno_list = list()
            for src in label_srcs:
                anno_list.append(span_to_label(sent, annos[src]))
            data_inst['label'] = lbs
            data_inst['data'] = {'text': sent}
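            # anno_list has shape (num_sources, num_tokens); transpose so each
            # row holds one token's labels across all sources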
            data_inst['weak_labels'] = np.asarray(anno_list).T.tolist()

            data_list.append(data_inst)

    data_dict = {i: inst for i, inst in enumerate(data_list)}

    meta_dict = dict()
    meta_dict['lf'] = label_srcs
    meta_dict['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']

    logger.info('Saving results...')
    with open(os.path.join(args.save_loc, f"{args.partition}.json"),
              'w',
              encoding='utf-8') as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=2)

    with open(os.path.join(args.save_loc, "meta.json"), 'w',
              encoding='utf-8') as f:
        json.dump(meta_dict, f, ensure_ascii=False, indent=2)

    logger.info('Exiting with no error')
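
The helper span_to_label is not shown in these listings; the code only assumes it maps a tokenized sentence and a span annotation to a token-level tag sequence. A minimal sketch under that assumption (the {(start, end): entity_type} token-offset span format is hypothetical), producing IOB2 tags:

def span_to_label(tokens, spans):
    """Hypothetical sketch: turn {(start, end): entity_type} spans, given as
    token offsets with exclusive ends, into an IOB2 tag sequence."""
    labels = ['O'] * len(tokens)
    for (start, end), entity_type in spans.items():
        labels[start] = f'B-{entity_type}'
        for i in range(start + 1, end):
            labels[i] = f'I-{entity_type}'
    return labels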
Example #2
def main(args):
    set_seed_everywhere(args.seed)

    set_logging(args.log_dir)
    logger.setLevel(logging.INFO)
    logger.info(f"Parameters: {args}")

    logger.info('Reading data...')
    reader = LaptopsDatasetReader()

    data_path = {
        'train': './data/Laptop_Train_v2.xml',
        'test': './data/Laptops_Test_Data_phaseB.xml'
    }

    src_path = data_path[args.partition]
    src_data = reader.read(src_path)
    docs = list(src_data)

    logger.info("Annotating data...")
    docs = laptop_annotators(docs=docs)

    root = ElementTree.parse(src_path).getroot()
    xml_sents = root.findall("./sentence")

    sentences = list()
    for xml_sent in xml_sents:
        text = xml_sent.find("text").text
        sentences.append(text)

    results = annotate_sent_with_wiser_allennlp(sentences,
                                                docs,
                                                token_suffix='-TERM')
    src_token_list, src_anno_list, tagging_anno_list, linking_anno_list = results

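    # convert the linking annotations into tagging-style annotations so both
    # kinds can be consumed as token-level weak sources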
    normalized_link_anno_list = linking_to_tagging_annos(
        tagging_anno_list, linking_anno_list)

    # combine link annotations and tag annotations
    combined_anno_list = list()
    for tag_anno, link_anno in zip(tagging_anno_list,
                                   normalized_link_anno_list):
        comb_anno = dict()
        for k, v in tag_anno.items():
            comb_anno[f'tag-{k}'] = v
        for k, v in link_anno.items():
            comb_anno[f'link-{k}'] = v
        combined_anno_list.append(comb_anno)

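    # the train partition is shuffled and split 80/20 into train and validation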
    if args.partition == 'train':
        indices = np.arange(len(src_token_list))
        np.random.shuffle(indices)
        train_partition = len(src_token_list) * 4 // 5

        label_srcs = list(combined_anno_list[0].keys())
        data_list = list()

        train_token_list = list()
        train_anno_list = list()
        train_lb_list = list()
        for i in indices[:train_partition]:
            train_token_list.append(src_token_list[i])
            train_anno_list.append(combined_anno_list[i])
            train_lb_list.append(src_anno_list[i])

        for sent, true_spans, anno_spans in zip(train_token_list,
                                                train_lb_list,
                                                train_anno_list):

            data_inst = dict()
            anno_list = list()
            lbs = span_to_label(sent, true_spans)
            for src in label_srcs:
                anno_list.append(span_to_label(sent, anno_spans[src]))
            data_inst['label'] = lbs
            data_inst['data'] = {'text': sent}
            data_inst['weak_labels'] = np.asarray(anno_list).T.tolist()

            data_list.append(data_inst)

        data_dict = {i: inst for i, inst in enumerate(data_list)}

        logger.info('Saving training results...')
        with open(os.path.join(args.save_loc, "train.json"),
                  'w',
                  encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=2)

        data_list = list()

        valid_token_list = list()
        valid_anno_list = list()
        valid_lb_list = list()
        for i in indices[train_partition:]:
            valid_token_list.append(src_token_list[i])
            valid_anno_list.append(combined_anno_list[i])
            valid_lb_list.append(src_anno_list[i])

        for sent, true_spans, anno_spans in zip(valid_token_list,
                                                valid_lb_list,
                                                valid_anno_list):

            data_inst = dict()
            anno_list = list()
            lbs = span_to_label(sent, true_spans)
            for src in label_srcs:
                anno_list.append(span_to_label(sent, anno_spans[src]))
            data_inst['label'] = lbs
            data_inst['data'] = {'text': sent}
            data_inst['weak_labels'] = np.asarray(anno_list).T.tolist()

            data_list.append(data_inst)

        data_dict = {i: inst for i, inst in enumerate(data_list)}

        logger.info('Saving validation results...')
        with open(os.path.join(args.save_loc, "valid.json"),
                  'w',
                  encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=2)

    else:

        label_srcs = list(combined_anno_list[0].keys())
        data_list = list()

        for sent, true_spans, anno_spans in zip(src_token_list, src_anno_list,
                                                combined_anno_list):

            data_inst = dict()
            anno_list = list()
            lbs = span_to_label(sent, true_spans)
            for src in label_srcs:
                anno_list.append(span_to_label(sent, anno_spans[src]))
            data_inst['label'] = lbs
            data_inst['data'] = {'text': sent}
            data_inst['weak_labels'] = np.asarray(anno_list).T.tolist()

            data_list.append(data_inst)

        data_dict = {i: inst for i, inst in enumerate(data_list)}

        logger.info('Saving test results...')
        with open(os.path.join(args.save_loc, f"{args.partition}.json"),
                  'w',
                  encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=2)

    meta_dict = dict()
    meta_dict['lf'] = label_srcs
    meta_dict['entity_types'] = ['TERM']

    with open(os.path.join(args.save_loc, "meta.json"), 'w',
              encoding='utf-8') as f:
        json.dump(meta_dict, f, ensure_ascii=False, indent=2)

    logger.info('Exiting with no error')
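
These scripts share the same CLI surface (args.partition, args.save_loc, args.log_dir, and, in Example #2, args.seed), but the entry point is omitted from the listings. A minimal sketch of what it might look like (argument defaults are hypothetical):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--partition', default='train',
                        help='dataset partition to process')
    parser.add_argument('--save_loc', default='./output',
                        help='directory for the generated JSON files')
    parser.add_argument('--log_dir', default='./logs',
                        help='directory for log files')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (used by Example #2)')
    main(parser.parse_args())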
Example #3
def main(args):
    set_logging(args.log_dir)
    logger.setLevel(logging.INFO)
    logger.info(f"Parameters: {args}")

    logger.info('Reading data...')
    reader = NCBIDiseaseDatasetReader()

    data_path = {
        'train': './data/NCBItrainset_corpus.txt',
        'valid': './data/NCBIdevelopset_corpus.txt',
        'test': './data/NCBItestset_corpus.txt'
    }

    src_path = data_path[args.partition]
    src_data = reader.read(src_path)
    docs = list(src_data)

    logger.info("Annotating data...")
    docs = ncbi_annotators(docs=docs)

    sents = load_ncbi_sentences(src_path)

    assert len(sents) == len(docs)

    results = annotate_sent_with_wiser_allennlp(sents, docs, token_suffix='-Disease')
    src_token_list, src_anno_list, tagging_anno_list, linking_anno_list = results

    normalized_link_anno_list = linking_to_tagging_annos(tagging_anno_list, linking_anno_list)

    # combine link annotations and tag annotations
    combined_anno_list = list()
    for tag_anno, link_anno in zip(tagging_anno_list, normalized_link_anno_list):
        comb_anno = dict()
        for k, v in tag_anno.items():
            comb_anno[f'tag-{k}'] = v
        for k, v in link_anno.items():
            comb_anno[f'link-{k}'] = v
        combined_anno_list.append(comb_anno)

    label_srcs = list(combined_anno_list[0].keys())
    data_list = list()

    for sent, true_spans, anno_spans in zip(src_token_list, src_anno_list, combined_anno_list):

        data_inst = dict()
        anno_list = list()
        lbs = span_to_label(sent, true_spans)
        for src in label_srcs:
            anno_list.append(span_to_label(sent, anno_spans[src]))
        data_inst['label'] = lbs
        data_inst['data'] = {'text': sent}
        data_inst['weak_labels'] = np.asarray(anno_list).T.tolist()

        data_list.append(data_inst)

    data_dict = {i: inst for i, inst in enumerate(data_list)}

    meta_dict = dict()
    meta_dict['lf'] = label_srcs
    meta_dict['entity_types'] = ['Disease']

    logger.info('Saving results...')
    with open(os.path.join(args.save_loc, f"{args.partition}.json"), 'w', encoding='utf-8') as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=2)

    with open(os.path.join(args.save_loc, "meta.json"), 'w', encoding='utf-8') as f:
        json.dump(meta_dict, f, ensure_ascii=False, indent=2)

    logger.info('Exiting with no error')
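
All partitions are written with the same record schema: each key is a sentence index, and each value carries the gold label sequence, the tokenized text, and a weak-label matrix with one row per token and one column per source (column order matches meta['lf']). A hypothetical record, with tokens and sources invented for illustration:

{
  "0": {
    "label": ["O", "B-Disease", "I-Disease"],
    "data": {"text": ["A", "colon", "cancer"]},
    "weak_labels": [["O", "O"],
                    ["B-Disease", "O"],
                    ["I-Disease", "B-Disease"]]
  }
}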
Example #4
def main(args):
    set_logging(args.log_dir)
    logger.setLevel(logging.INFO)
    logger.info(f"Parameters: {args}")

    logger.info('Reading data...')
    with open(os.path.join(args.save_loc, "train.json"),
              'r',
              encoding='utf-8') as f:
        train_data = json.load(f)
    with open(os.path.join(args.save_loc, "valid.json"),
              'r',
              encoding='utf-8') as f:
        valid_data = json.load(f)
    with open(os.path.join(args.save_loc, "test.json"), 'r',
              encoding='utf-8') as f:
        test_data = json.load(f)

    logger.info('Reading metadata...')
    with open(os.path.join(args.save_loc, "meta.json"), 'r',
              encoding='utf-8') as f:
        meta = json.load(f)

    logger.info('Computing new metadata...')

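    # max_length is measured in tokens: v['data']['text'] is a token list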
    max_length = 0
    for data in [train_data, valid_data, test_data]:
        for v in data.values():
            max_length = max(max_length, len(v['data']['text']))

    meta['train_size'] = len(train_data)
    meta['valid_size'] = len(valid_data)
    meta['test_size'] = len(test_data)

    meta['max_length'] = max_length
    meta['num_lf'] = len(meta['lf'])
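    # IOB2 tag set: one B- and one I- tag per entity type, plus the O tag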
    meta['num_labels'] = 2 * len(meta['entity_types']) + 1

    # get the performance of each source
    t_lbs = list()
    w_lbs = [[] for _ in range(meta['num_lf'])]
    for data in [train_data, valid_data, test_data]:
        for v in data.values():
            t_lbs.append(v['label'])
            for i, w_lb in enumerate(np.asarray(v['weak_labels']).T):
                w_lbs[i].append(w_lb.tolist())

    rec_src = list()
    logger.info('Source performance (F1 score):')
    for i, src_name in enumerate(meta['lf']):
        f1 = metrics.f1_score(t_lbs, w_lbs[i], mode='strict', scheme=IOB2)
        p = metrics.precision_score(t_lbs,
                                    w_lbs[i],
                                    mode='strict',
                                    scheme=IOB2)
        r = metrics.recall_score(t_lbs, w_lbs[i], mode='strict', scheme=IOB2)
        logger.info(f'[{src_name}] F1: {f1}; Precision: {p}; Recall: {r}.')
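        # recommend only sources whose strict span-level F1 is non-negligible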
        if f1 > 0.05:
            rec_src.append(src_name)

    logger.info(
        f'The following sources are recommended for model evaluation:\n'
        f'\t{rec_src}')

    meta['lf_rec'] = rec_src
    meta['num_lf_rec'] = len(rec_src)

    logger.info('Saving results...')

    with open(os.path.join(args.save_loc, "meta.json"), 'w',
              encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    logger.info('Exiting with no error')
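
Downstream code can use meta['lf_rec'] to drop the weak sources that fall below the threshold before training. A minimal sketch, assuming the file layout produced above (the save_loc value and the filtering step itself are hypothetical):

import json
import os

import numpy as np

save_loc = './output'  # hypothetical; should match args.save_loc above

with open(os.path.join(save_loc, 'meta.json'), encoding='utf-8') as f:
    meta = json.load(f)
keep_cols = [meta['lf'].index(name) for name in meta['lf_rec']]

with open(os.path.join(save_loc, 'train.json'), encoding='utf-8') as f:
    train_data = json.load(f)

# keep only the columns of the recommended labeling functions
for inst in train_data.values():
    weak = np.asarray(inst['weak_labels'])  # shape: (num_tokens, num_lf)
    inst['weak_labels'] = weak[:, keep_cols].tolist()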