Example #1
def generate_submission(args):
    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.json"
    reviews = json.load(open(reviews_file, 'r'))

    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.json.txt"
    with open(submission_file, 'w') as wt:
        for guid, json_data in reviews.items():
            output_data = {'originalText': json_data['text'], 'entities': []}
            for json_entity in json_data['entities']:
                output_data['entities'].append({
                    'label_type': json_entity['category'],
                    'overlap': 0,
                    'start_pos': json_entity['start'],
                    # The reviews file stores inclusive end offsets; the
                    # submission format expects an exclusive end_pos.
                    'end_pos': json_entity['end'] + 1,
                })
            output_data['entities'].sort(key=lambda x: x['start_pos'])
            output_string = json.dumps(output_data, ensure_ascii=False)
            wt.write(f"{output_string}\n")

    logger.info(f"Saved {len(reviews)} lines in {submission_file}")

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
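
For orientation, here is a minimal, self-contained sketch of the per-review transformation above. The record and its offsets are made up; only the field names (text, entities, category, start, end) follow the reviews file this example reads.

import json

# Hypothetical review record in the shape the example expects.
review = {
    "text": "Patient shows a mass in the left lung.",
    "entities": [{"category": "BodyPart", "start": 28, "end": 36}],
}

entities = [{
    "label_type": e["category"],
    "overlap": 0,
    "start_pos": e["start"],
    "end_pos": e["end"] + 1,  # inclusive end offset -> exclusive end_pos
} for e in review["entities"]]
entities.sort(key=lambda x: x["start_pos"])

# One JSONL line of the submission file.
print(json.dumps({"originalText": review["text"], "entities": entities},
                 ensure_ascii=False))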
Example #2
def generate_submission(args):
    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.json"
    with open(reviews_file, 'r') as f:
        reviews = json.load(f)

    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.json"
    D = []
    for guid, json_data in tqdm(reviews.items(), desc="events"):
        text = json_data['text']
        id_ = json_data['guid']
        # Collect (category, mention) pairs; the start/end offsets are not
        # needed for the pairwise SPO output below.
        entities = [(json_entity['category'], json_entity['mention'])
                    for json_entity in json_data['entities']]
        D.append((text, id_, entities))

    rel_list = []
    with open(submission_file, 'w', encoding='utf-8') as f:
        for texti, idxi, entities in D:
            for j, (subject_type, subject) in enumerate(entities):
                for k, (object_type, object_) in enumerate(entities):
                    # Emit each ordered pair once (j < k); skip pairs whose
                    # mentions are identical.
                    if j >= k or subject == object_:
                        continue
                    rel_list.append({
                        "id": idxi,
                        "text": texti,
                        "spo_list": [{
                            "subject": subject,
                            "subject-type": subject_type,
                            "object": object_,
                            "object-type": object_type,
                        }],
                    })
        json.dump(rel_list, f, ensure_ascii=False, indent=4)

    logger.info(f"Saved {len(reviews)} lines in {submission_file}")

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
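
The core of this example is the pairwise subject/object pairing. A minimal sketch with made-up entities (the real code collects them from the reviews file):

entities = [("Company", "ACME"), ("Person", "Alice"), ("Person", "Bob")]

spo_list = []
for j, (subject_type, subject) in enumerate(entities):
    for k, (object_type, object_) in enumerate(entities):
        # One ordered pair per j < k; identical mentions are skipped.
        if j >= k or subject == object_:
            continue
        spo_list.append({
            "subject": subject, "subject-type": subject_type,
            "object": object_, "object-type": object_type,
        })

# Three entities with distinct mentions -> three pairs.
print(len(spo_list))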
Example #3
def generate_submission(args):
    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.xlsx"
    import xlwt
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet(f"{args.dataset_name}")

    # Column headers: 原文 (original text), 肿瘤原发部位 (primary tumor site),
    # 原发病灶大小 (primary lesion size), 转移部位 (metastasis site).
    worksheet.write(0, 0, label='原文')
    worksheet.write(0, 1, label='肿瘤原发部位')
    worksheet.write(0, 2, label='原发病灶大小')
    worksheet.write(0, 3, label='转移部位')

    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.json"
    with open(reviews_file, 'r') as f:
        reviews = json.load(f)

    idx = 1
    for guid, json_data in reviews.items():
        text = json_data['text']
        entities = json_data['entities']
        label_entities = {}
        for entity in entities:
            c = entity['category']
            s = entity['start']
            e = entity['end'] + 1  # inclusive end offset -> exclusive slice end

            # Skip spans that fall outside the text.
            if s > len(text) or e > len(text):
                continue
            entity_text = text[s:e]
            # Skip empty, overlong, or multi-valued mentions.
            if len(entity_text) == 0 or len(entity_text) > 16:
                continue
            if ';' in entity_text or '、' in entity_text:
                continue

            label_entities.setdefault(c, []).append(entity_text)

        worksheet.write(idx, 0, label=text)
        # Entity categories (肿瘤部位, 病灶大小, 转移部位) map onto columns 1-3.
        if '肿瘤部位' in label_entities:
            worksheet.write(idx, 1, ','.join(label_entities['肿瘤部位']))
        if '病灶大小' in label_entities:
            worksheet.write(idx, 2, ','.join(label_entities['病灶大小']))
        if '转移部位' in label_entities:
            worksheet.write(idx, 3, ','.join(label_entities['转移部位']))

        idx += 1

    workbook.save(submission_file)

    if args.do_experiment:
        mlflow.log_param("submission_file", submission_file)
        mlflow.log_artifact(submission_file)

    logger.info(f"Saved {submission_file}")

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
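
A minimal, self-contained sketch of the xlwt layout used above (requires pip install xlwt; the sheet name, file name, and cell values here are made up):

import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('demo')

# Header row, then one hypothetical data row.
for col, header in enumerate(['原文', '肿瘤原发部位', '原发病灶大小', '转移部位']):
    worksheet.write(0, col, label=header)
worksheet.write(1, 0, label='示例文本')
worksheet.write(1, 1, ','.join(['左肺']))

workbook.save('demo_submission.xlsx')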
Example #4
def generate_submission(args):
    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.tsv"
    df_reviews = pd.read_csv(reviews_file, sep='\t')

    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.txt"
    with open(submission_file, 'w') as fw:
        for _, row in tqdm(df_reviews.iterrows(), total=df_reviews.shape[0]):
            text = row.text_a
            label = row.label
            fw.write(f"{text}\t{label}\n")
    logger.info(f"Saved {df_reviews.shape[0]} lines in {submission_file}")

    #  ----- Tracking -----
    if args.do_experiment:
        mlflow.log_param("submission_file", submission_file)
        mlflow.log_artifact(submission_file)

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
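
A minimal sketch of the same TSV-to-text conversion with an in-memory DataFrame instead of the reviews file (the column names follow the example; the rows are made up):

import pandas as pd

df_reviews = pd.DataFrame({
    "text_a": ["good product", "arrived broken"],
    "label": ["positive", "negative"],
})

with open("demo_submission.txt", "w") as fw:
    for _, row in df_reviews.iterrows():
        fw.write(f"{row.text_a}\t{row.label}\n")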
Example #5
def generate_submission(args):
    reviews_file = f"{args.latest_dir}/{args.dataset_name}_reviews_{args.local_id}.json"
    with open(reviews_file, 'r') as f:
        reviews = json.load(f)

    submission_file = f"{args.submissions_dir}/{args.dataset_name}_submission_{args.local_id}.json.txt"
    results = []
    for guid, json_data in tqdm(reviews.items(), desc="events"):
        events = []
        json_entities = json_data['entities']

        for json_entity in json_entities:
            category = json_entity['category']
            mention = json_entity['mention']

            # Entity categories are encoded as "<event_type>_<role>[_...]";
            # split them back into an event type and a role->mention pair.
            tokens = category.split('_')
            assert len(tokens) >= 2
            event_type = tokens[0]
            event_role = "".join(tokens[1:])
            events.append({'event_type': event_type, event_role: mention})

        # Fall back through progressively looser merging strategies until
        # one succeeds.
        done, final_events = process_only_once_events(events)
        if not done:
            done, final_events = merge_events_by_subject(events)
        if not done:
            done, final_events = merge_events_last(events)
        if not done:
            final_events = merge_events(events)

        final_events = keep_events_completeness(final_events)
        result = {'doc_id': guid, 'events': final_events}
        results.append(result)

    with open(submission_file, 'w') as wt:
        for result in tqdm(results, desc="results"):
            line = json.dumps(result, ensure_ascii=False)
            wt.write(f"{line}\n")

    logger.info(f"Saved {len(reviews)} lines in {submission_file}")

    from theta.modeling import archive_local_model
    archive_local_model(args, submission_file)
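
The category-to-event split is the one piece of this example that is fully visible here; the merge helpers (process_only_once_events and friends) are project-specific and not shown. A minimal sketch of the split, with a made-up category:

def entity_to_event(category, mention):
    # Categories are encoded as "<event_type>_<role>[_...]".
    tokens = category.split('_')
    assert len(tokens) >= 2
    return {'event_type': tokens[0], "".join(tokens[1:]): mention}

print(entity_to_event('质押_质押方', 'ACME'))
# {'event_type': '质押', '质押方': 'ACME'}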