예제 #1
0
def train_data_generator(train_file):
    """Yield one labelled example per record loaded from ``train_file``.

    Each record is expected to follow this JSON layout::

        {
            "text": "万通地产设计总监刘克峰;",
            "label": {
                "name":     {"刘克峰": [[8, 10]]},
                "company":  {"万通地产": [[0, 3]]},
                "position": {"设计总监": [[4, 7]]}
            }
        }

    i.e. ``label`` maps a category to a dict of mention -> list of
    ``[start, end]`` spans.

    Args:
        train_file: Path handed to ``load_json_file``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the record's
        running index as a string, ``text`` is the cleaned text and
        ``entities`` is whatever the ``LabeledText`` collected.
    """
    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        for category, mentions in x['label'].items():
            for spans in mentions.values():
                # A mention can occur several times in one text; register
                # every listed span instead of only the first one (an
                # empty span list is simply skipped).
                for x0, x1 in spans:
                    sl.add_entity(category, x0, x1)

        yield guid, text, None, sl.entities
예제 #2
0
def train_data_from_last_generator(train_file):
    """Yield labelled pages cut out of previously tagged export files.

    NOTE(review): the ``train_file`` argument is not used; the hard-coded
    paths in ``train_files`` are read instead. This mirrors the existing
    behaviour and is kept for backward compatibility - confirm intent.

    Each export file is a JSON document with ``labelCategories`` (id/text
    pairs), ``labels`` (entities with ``startIndex``, ``endIndex`` and
    ``categoryId``) and ``content`` (one long string). ``content`` is split
    into pages using the ``Begin`` / ``End`` delimiter lines, and every
    entity that lies completely inside a page is re-based to page-relative
    offsets.

    Args:
        train_file: Ignored (see note above).

    Yields:
        ``(guid, text, None, entities)`` per page; ``guid`` is the page
        index within its file, formatted as a string.

    Raises:
        AssertionError: if an entity's end offset precedes its start.
    """
    train_files = [
        './data/event_element_sampling_0713.json',
    ]

    # Raw strings keep the patterns identical while avoiding the invalid
    # "\d" escape that a normal string literal would contain.
    re_b = r'(\n[-]+ [\d]+ Begin [-]+\n\n)'
    re_e = r'(\n[-]+ [\d]+ End [-]+\n\n)'

    for path in train_files:
        # Close the file deterministically instead of leaking the handle.
        with open(path, 'r') as fr:
            tagged_train_json_data = json.load(fr)

        all_labels = tagged_train_json_data['labelCategories']
        id2label = {x['id']: x['text'] for x in all_labels}

        all_entities = tagged_train_json_data['labels']

        content = tagged_train_json_data['content']

        b_list = [(m.start(), m.end()) for m in re.finditer(re_b, content)]
        e_list = [(m.start(), m.end()) for m in re.finditer(re_e, content)]

        # Pair the n-th Begin marker with the n-th End marker.
        pages = [(x_b[0], x_b[1], x_e[0], x_e[1])
                 for x_b, x_e in zip(b_list, e_list)]

        logger.warning(f"pages: {pages}")

        for i, page in enumerate(pages):
            head_x0, head_x1, tail_x0, tail_x1 = page

            guid = f"{i}"
            # Page body = text between the end of the Begin marker and the
            # start of the End marker.
            text = content[head_x1:tail_x0]
            sl = LabeledText(guid, text)

            for entity in all_entities:
                s = entity['startIndex']
                # endIndex is decremented so that ``e`` is the last index
                # covered by the entity.
                e = entity['endIndex'] - 1
                assert e >= s
                if s >= head_x1 and e < tail_x0:
                    sl.add_entity(id2label[entity['categoryId']], s - head_x1,
                                  e - head_x1)
            yield guid, text, None, sl.entities
예제 #3
0
def train_data_generator(train_file):
    """Stream labelled examples from the records of ``train_file``.

    Every record supplies its text under ``originalText`` and a list under
    ``entities``; each entity's ``end_pos`` is decremented by one before
    the entity is registered on the ``LabeledText``.

    Args:
        train_file: Path handed to ``load_json_file``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the record's
        running index rendered as a string.
    """
    records = load_json_file(train_file)

    for idx, record in enumerate(tqdm(records)):
        guid = str(idx)
        text = clean_text(record['originalText'])
        labeled = LabeledText(guid, text)

        for ent in record['entities']:
            first = ent['start_pos']
            last = ent['end_pos'] - 1
            labeled.add_entity(ent['label_type'], first, last)

        yield guid, text, None, labeled.entities
예제 #4
0
def train_data_generator(train_file):
    """Debug helper: print the first example of ``train_file`` and stop.

    Despite its name this function yields nothing. It builds the
    ``LabeledText`` for the first record only, prints it and returns.
    For each mention only the first listed span is registered.

    Args:
        train_file: Path handed to ``load_json_file``.
    """
    records = load_json_file(train_file)

    for idx, record in enumerate(tqdm(records)):
        guid = str(idx)
        text = clean_text(record['text'])
        labeled = LabeledText(guid, text)

        for category, mentions in record['label'].items():
            for spans in mentions.values():
                start, end = spans[0]
                labeled.add_entity(category, start, end)

        print("index: ", guid, ", text: ", text, ", entities: ",
              labeled.entities)
        # Only the first record is inspected.
        break
예제 #5
0
def train_data_generator(train_file):
    """Stream labelled examples from ``load_ner_train_data(train_file)``.

    Each record is indexed positionally: ``[0]`` is the guid, ``[1]`` the
    raw text and ``[2]`` a sequence of entities, each of which is indexed
    as ``(category, start, end)``; start and end are converted with
    ``int``.

    Args:
        train_file: Path handed to ``load_ner_train_data``.

    Yields:
        ``(guid, text, None, entities)`` using the record's own guid.
    """
    records = load_ner_train_data(train_file)

    for record in tqdm(records):
        guid = record[0]
        text = clean_text(record[1])
        labeled = LabeledText(guid, text)

        for ent in record[2]:
            labeled.add_entity(ent[0], int(ent[1]), int(ent[2]))

        yield guid, text, None, labeled.entities
예제 #6
0
def train_data_generator_0(train_file):
    """Stream labelled examples from a JSON-lines event file.

    Each line is parsed as JSON with ``doc_id``, ``content`` and
    ``events``. For every truthy event field other than ``event_id`` and
    ``event_type``, all non-overlapping occurrences of the field value in
    the cleaned text are labelled ``"<event_type>_<field>"``.
    Single-character values are searched for but never labelled.

    Args:
        train_file: Path of the file to read, one JSON document per line.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the ``doc_id``.
    """
    skipped_keys = ('event_id', 'event_type')

    with open(train_file, 'r') as fr:
        rows = fr.readlines()
        for row in tqdm(rows, desc="train & eval"):
            doc = json.loads(row)
            guid = doc['doc_id']
            text = clean_text(doc['content'])

            spans = []
            for event in doc['events']:
                event_type = event['event_type']
                for role, value in event.items():
                    if not value or role in skipped_keys:
                        continue

                    label = '_'.join((event_type, role))

                    pos = text.find(value)
                    width = len(value)
                    while pos >= 0:
                        # One-character values are deliberately ignored.
                        if width != 1:
                            spans.append((label, pos, pos + width - 1))
                        # Resume after this hit so matches never overlap.
                        pos = text.find(value, pos + width)

            labeled = LabeledText(guid, text)
            for category, start_char, end_char in spans:
                labeled.add_entity(category, start_char, end_char)

            yield guid, text, None, labeled.entities
예제 #7
0
def train_data_generator(train_text_file, train_bio_file):
    """Stream labelled examples from parallel text and label files.

    Texts come from ``load_texts(train_text_file)``. The second element
    returned by ``load_bioattr_labels(train_bio_file)`` is indexed by the
    text's position and provides that text's entities, each indexed as
    ``(category, start, end)``; start and end are converted with ``int``.
    The first element returned by ``load_bioattr_labels`` is not used.

    Args:
        train_text_file: Path handed to ``load_texts``.
        train_bio_file: Path handed to ``load_bioattr_labels``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the running
        index rendered as a string.
    """
    texts = load_texts(train_text_file)
    _, labels = load_bioattr_labels(train_bio_file)

    for idx, raw in enumerate(tqdm(texts)):
        guid = str(idx)
        text = clean_text(raw)
        labeled = LabeledText(guid, text)

        for ent in labels[idx]:
            labeled.add_entity(ent[0], int(ent[1]), int(ent[2]))

        yield guid, text, None, labeled.entities
예제 #8
0
def train_data_generator(train_file):
    """Stream labelled examples with three-level hierarchical categories.

    Only the first element returned by ``load_data(train_file)`` is used.
    Each record is indexed positionally: ``[0]`` raw text, ``[1]`` guid,
    ``[2]``-``[4]`` the three level names and ``[5]`` a sequence of
    entities, each indexed as ``(start, end, name)``. The entity category
    is ``"<level1>_<level2>_<level3>_<name>"``.

    NOTE(review): the ``LabeledText`` is built with the record's own guid,
    but the yielded identifier is the running index - confirm this
    mismatch is intended.

    Args:
        train_file: Path handed to ``load_data``.

    Yields:
        ``(index, text, None, entities)`` where ``index`` is the running
        index rendered as a string.
    """
    data, _ = load_data(train_file)

    for idx, row in enumerate(tqdm(data)):
        guid = row[1]
        text = clean_text(row[0])
        level1, level2, level3 = row[2], row[3], row[4]
        labeled = LabeledText(guid, text)

        for ent in row[5]:
            category = level1 + "_" + level2 + "_" + level3 + "_" + ent[2]
            labeled.add_entity(category, int(ent[0]), int(ent[1]))

        yield str(idx), text, None, labeled.entities
예제 #9
0
def train_data_generator(train_file):
    """Stream labelled event-argument examples from ``load_data(train_file)``.

    Each record is indexed positionally: ``[0]`` is the raw text and ``[1]``
    a mapping of argument string -> ``(event_type, role, start_index)``.
    The category is ``"<event_type>_<role>"`` and the span runs from
    ``start_index`` to ``start_index + len(argument) - 1``.

    Args:
        train_file: Path handed to ``load_data``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the running
        index rendered as a string.
    """
    data = load_data(train_file)

    for idx, row in enumerate(tqdm(data)):
        guid = str(idx)
        text = clean_text(row[0])
        arguments = row[1]
        labeled = LabeledText(guid, text)

        for argument, info in arguments.items():
            event_type, role = info[0], info[1]
            start = int(info[2])
            category = event_type + "_" + role
            labeled.add_entity(category, start, start + len(argument) - 1)

        yield guid, text, None, labeled.entities