Example #1
def generateConfusionMatrices(self):
    counter = 0
    if self.options['LOCAL']:
        PRINT_THRESHOLD = 10
    else:
        PRINT_THRESHOLD = 10000
    with open(self.options['DATA_FILE']) as f:
        for line in f:
            line = line.strip().split('\t')
            if self.options['USE_CONTEXT']:
                # TODO: fill this out
                pass
            else:
                _label = eval(line[3])
                if self.options['PR_CURVE']:
                    self.gold_labels.append(_label)
                for method in self.options['EMBEDDING_METHODS']:
                    self.updateConfusionMatrix(method,
                                               my_tokenize(line[1]),
                                               my_tokenize(line[2]),
                                               _label)
                counter += 1
                if (counter % PRINT_THRESHOLD) == 0:
                    print(counter, 'DONE')
                    sys.stdout.flush()
    if self.options['PR_CURVE']:
        self.generateProbabilities()
Example #2
def text_tokenize(txt, sent_start):
    # yield (token, start, end) triples; offsets are absolute document offsets
    tokens = my_utils.my_tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset + sent_start, offset + len(token) + sent_start
        offset += len(token)
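
A minimal usage sketch for the generator above (the sample sentence and the sent_start value are made up, and my_utils.my_tokenize is assumed to split on whitespace and punctuation):

# Hypothetical driver for Example #2: each yielded tuple carries the token and
# its absolute character span, because sent_start is added to the local offsets.
sentence = "Aspirin reduces fever."
for token, start, end in text_tokenize(sentence, sent_start=100):
    print(token, start, end)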
Example #3
def text_tokenize_and_postagging(txt, sent_start):
    tokens = my_utils.my_tokenize(txt)
    pos_tags = nltk.pos_tag(tokens)

    offset = 0
    for token, pos_tag in pos_tags:
        offset = txt.find(token, offset)
        yield token, pos_tag, offset + sent_start, offset + len(token) + sent_start
        offset += len(token)
Example #4
def to_conll(all_news, source_filter, type_filter, out_file):
    out_f = codecs.open(out_file, 'w', 'utf-8')
    sent_num = 0

    for news in all_news:
        if news['source'] not in source_filter:
            continue
        if news['category'] not in type_filter:
            continue

        tokens = my_tokenize(news['title'])
        for token in tokens:
            out_f.write(token + "\n")

        out_f.write("\n")
        sent_num += 1

        all_sents_inds = []
        generator = sent_tokenizer.span_tokenize(news['description'])
        for t in generator:
            all_sents_inds.append(t)

        for ind in range(len(all_sents_inds)):
            t_start = all_sents_inds[ind][0]
            t_end = all_sents_inds[ind][1]
            sent_text = news['description'][t_start:t_end]

            tokens = my_tokenize(sent_text)
            for token in tokens:
                out_f.write(token + "\n")

            out_f.write("\n")
            sent_num += 1

    out_f.close()
    print("write {} into {}".format(sent_num, out_file))
Example #5
def pubmed_to_conll(dir, out_file):
    sent_num = 0
    out_f = codecs.open(out_file, 'w', 'utf-8')

    for input_file_name in os.listdir(dir):
        if input_file_name.find(".txt") == -1:
            continue

        with codecs.open(os.path.join(dir, input_file_name), 'r',
                         'utf-8') as f:
            text = ''
            for line in f:
                line = line.strip()
                if line.find("|t|") != -1:
                    p = line.find("|t|")
                    text += line[p + len("|t|"):] + " "
                elif line.find("|a|") != -1:
                    p = line.find("|a|")
                    text += line[p + len("|a|"):]

            all_sents_inds = []
            generator = sent_tokenizer.span_tokenize(text)
            for t in generator:
                all_sents_inds.append(t)

            for ind in range(len(all_sents_inds)):
                t_start = all_sents_inds[ind][0]
                t_end = all_sents_inds[ind][1]
                sent_text = text[t_start:t_end]

                tokens = my_tokenize(sent_text)
                for token in tokens:
                    out_f.write(token + "\n")

                out_f.write("\n")
                sent_num += 1

    out_f.close()
    print("write {} into {}".format(sent_num, out_file))
Example #6
def getRelationInstanceForOneDoc(doc_token, entities, doc_name, data):
    X = []
    other = []

    row_num = len(entities)

    for latter_idx in range(row_num):

        for former_idx in range(row_num):

            if former_idx < latter_idx:

                former = entities[former_idx]
                latter = entities[latter_idx]


                if math.fabs(latter.sent_idx-former.sent_idx) >= data.sent_window:
                    continue

                # for double annotation, we don't generate instances
                if former.start==latter.start and former.end==latter.end:
                    continue

                #type_constraint = relationConstraint(former['type'], latter['type'])
                type_constraint = relationConstraint_chapman(former.type, latter.type)
                if type_constraint == 0:
                    continue

                # retrieve all sentences between the two entities: the sentence of former, any sentences in between, and the sentence of latter
                sent_idx = former.sent_idx
                context_token = pd.DataFrame(columns=doc_token.columns)
                base = 0
                former_tf_start, former_tf_end = -1, -1
                latter_tf_start, latter_tf_end = -1, -1
                while sent_idx <= latter.sent_idx:
                    sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                    if former.sent_idx == sent_idx:
                        former_tf_start, former_tf_end = base+former.tf_start, base+former.tf_end
                    if latter.sent_idx == sent_idx:
                        latter_tf_start, latter_tf_end = base+latter.tf_start, base+latter.tf_end

                    context_token = context_token.append(sentence, ignore_index=True)

                    base += len(sentence['text'])  # number of tokens in this sentence
                    sent_idx += 1

                if context_token.shape[0] > data.max_seq_len:
                    # truncate
                    logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                    context_token = context_token.iloc[:data.max_seq_len]


                words = []
                postags = []
                cap = []
                chars = []
                positions1 = []
                positions2 = []
                former_token = []
                latter_token = []
                i = 0
                for _, token in context_token.iterrows():
                    if data.number_normalized:
                        word = utils.functions.normalize_word(token['text'])
                    else:
                        word = token['text']
                    entity_word = my_utils1.normalizeWord(token['text'])
                    words.append(data.word_alphabet.get_index(word))
                    postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                    cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                        str(my_utils.featureCapital(token['text']))))
                    char_for1word = []
                    for char in word:
                        char_for1word.append(data.char_alphabet.get_index(char))
                    chars.append(char_for1word)

                    # position of token i relative to the former entity (0 inside the span)
                    if i < former_tf_start:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_start - i))
                    elif i > former_tf_end:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_end - i))
                    else:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        former_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    # position of token i relative to the latter entity (0 inside the span)
                    if i < latter_tf_start:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_start - i))
                    elif i > latter_tf_end:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_end - i))
                    else:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        latter_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    i += 1

                if len(former_token) == 0:  # the entity fell into the truncated part, so use the text in doc_entity
                    # splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", former['text'])
                    splitted = my_utils.my_tokenize(former.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                if len(latter_token) == 0:
                    #splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", latter['text'])
                    splitted = my_utils.my_tokenize(latter.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                assert len(former_token)>0
                assert len(latter_token)>0


                features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars, 'positions1': positions1, 'positions2': positions2}
                if type_constraint == 1:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token
                else:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e1_token'] = latter_token
                    features['e2_token'] = former_token

                features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(latter.tf_start-former.tf_end)

                entity_between = getEntitiesBetween(former, latter, entities)
                features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(len(entity_between))

                X.append(features)

                other.append((former, latter))

    return X, other
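
To make the position features above concrete, here is a self-contained toy sketch of the same indexing scheme, stripped of the alphabet lookups (the values shown are what get passed to get_index): tokens before an entity get positive distances, tokens after it get negative distances, and tokens inside it get 0.

def relative_positions(n_tokens, tf_start, tf_end):
    # mirrors the positions1/positions2 logic in Example #6, before alphabet lookup
    pos = []
    for i in range(n_tokens):
        if i < tf_start:
            pos.append(tf_start - i)   # token before the entity: positive distance
        elif i > tf_end:
            pos.append(tf_end - i)     # token after the entity: negative distance
        else:
            pos.append(0)              # token inside the entity span
    return pos

# a 6-token context where the entity covers tokens 2..3
print(relative_positions(6, 2, 3))  # [2, 1, 0, 0, -1, -2]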
Example #7
def getRelationInstance2(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):

        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):

            for former_idx in range(row_num):

                if former_idx < latter_idx:

                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]


                    if math.fabs(latter['sent_idx']-former['sent_idx']) >= data.sent_window:
                        continue

                    # for double annotation, we don't generate instances
                    if former['start']==latter['start'] and former['end']==latter['end']:
                        continue

                    #type_constraint = relationConstraint(former['type'], latter['type'])
                    type_constraint = relationConstraint_chapman(former['type'], latter['type'])
                    if type_constraint == 0:
                        continue

                    gold_relations = doc_relation[
                        (
                                ((doc_relation['entity1_id'] == former['id']) & (
                                            doc_relation['entity2_id'] == latter['id']))
                                |
                                ((doc_relation['entity1_id'] == latter['id']) & (
                                            doc_relation['entity2_id'] == former['id']))
                        )
                    ]
                    if gold_relations.shape[0] > 1:
                        # raise RuntimeError("the same entity pair has more than one relation")
                        logging.debug("entities {} and {} have more than one relation".format(former['id'], latter['id']))
                        continue

                    # retrieve all sentences between the two entities: the sentence of former, any sentences in between, and the sentence of latter
                    sent_idx = former['sent_idx']
                    context_token = pd.DataFrame(columns=doc_token.columns)
                    base = 0
                    former_tf_start, former_tf_end = -1, -1
                    latter_tf_start, latter_tf_end = -1, -1
                    while sent_idx <= latter['sent_idx']:
                        sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                        if former['sent_idx'] == sent_idx:
                            former_tf_start, former_tf_end = base+former['tf_start'], base+former['tf_end']
                        if latter['sent_idx'] == sent_idx:
                            latter_tf_start, latter_tf_end = base+latter['tf_start'], base+latter['tf_end']

                        context_token = context_token.append(sentence, ignore_index=True)

                        base += len(sentence['text'])  # number of tokens in this sentence
                        sent_idx += 1

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]


                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = utils.functions.normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        # position of token i relative to the former entity (0 inside the span)
                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        # position of token i relative to the latter entity (0 inside the span)
                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:  # the entity fell into the truncated part, so use the text in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token)>0
                    assert len(latter_token)>0


                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars, 'positions1': positions1, 'positions2': positions2}
                    if type_constraint == 1:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e1_token'] = former_token
                        features['e2_token'] = latter_token
                    else:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e1_token'] = latter_token
                        features['e2_token'] = former_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(latter['tf_start']-former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start']>=former['end']) & (doc_entity['end']<=latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    if gold_relations.shape[0] == 0:
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                        cnt_neg += 1
                    else:
                        gold_answer = gold_relations.iloc[0]['type']
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))


    neg = 100.0*cnt_neg/len(Y)

    logging.info("positive instance {}%, negative instance {}%".format(100-neg, neg))
    return X, Y
Example #8
def load_data(data_dir, mode='train'):

    doc_num = 0
    sent_num = 0
    max_sent_length = 0
    min_sent_length = 9999
    total_sent_length = 0

    documents = []

    alphabet_category = Alphabet('category', True)
    print(os.listdir(data_dir))
    for input_file_name in os.listdir(data_dir):

        alphabet_category.add(input_file_name)

        wb = load_workbook(os.path.join(data_dir, input_file_name))

        sheetnames = wb.get_sheet_names()
        ws = wb.get_sheet_by_name(sheetnames[0])

        for row_idx, row in enumerate(ws.rows):
            if row_idx == 0:
                continue  # head
            document = Document()
            document.pmid = row[0].value
            document.title = row[1].value
            document.abstract = row[2].value
            if mode == 'train':
                document.relevant_sentences = parseReleventFromExcel(
                    row[3].value)
            document.category = input_file_name

            all_sents_inds = []
            generator = nlp_tool.span_tokenize(document.abstract)
            for t in generator:
                all_sents_inds.append(t)

            for ind in range(len(all_sents_inds)):
                sentence = Sentence()
                sentence.start = all_sents_inds[ind][0]
                sentence.end = all_sents_inds[ind][1]

                offset = 0
                sentence.text = document.abstract[sentence.start:sentence.end]
                if len(document.relevant_sentences) != 0:
                    if sentence.text in document.relevant_sentences:
                        sentence.label = 'yes'
                    else:
                        sentence.label = 'no'
                else:
                    sentence.label = 'no'

                # replace quotes because nltk maps " to other characters, see https://github.com/nltk/nltk/issues/1630
                sentence.text = sentence.text.replace('"', " ")
                sentence.text = sentence.text.replace('\'', " ")
                for token_txt in my_tokenize(sentence.text):
                    token = {}
                    offset = sentence.text.find(token_txt, offset)
                    if offset == -1:
                        raise RuntimeError("can't find {} in '{}'".format(
                            token_txt, sentence.text))

                    token['text'] = token_txt
                    token['start'] = sentence.start + offset
                    token['end'] = sentence.start + offset + len(token_txt)
                    token['wp'] = wp_tokenizer.tokenize(token_txt)
                    if len(token['wp']) == 0:
                        # for some OOV tokens (e.g., \x99), wp_tokenizer returns an empty list
                        token['wp'] = ['[UNK]']

                    sentence.tokens.append(token)
                    offset += len(token_txt)

                document.sentences.append(sentence)
                sent_num += 1
                total_sent_length += len(sentence.tokens)
                if len(sentence.tokens) > max_sent_length:
                    max_sent_length = len(sentence.tokens)
                if len(sentence.tokens) < min_sent_length:
                    min_sent_length = len(sentence.tokens)

            documents.append(document)
            doc_num += 1

    logging.info("{} statistics".format(data_dir))
    logging.info("doc number {}, sent number {}".format(doc_num, sent_num))
    logging.info(
        "avg sent length {}, max sent length {}, min sent length {}".format(
            total_sent_length // sent_num, max_sent_length, min_sent_length))

    return documents, alphabet_category
Example #9
def getRelationInstance(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):

        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):

            for former_idx in range(row_num):

                if former_idx < latter_idx:

                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) &
                         (doc_relation['entity2_text'] == latter['text']))
                        | ((doc_relation['entity1_text'] == latter['text']) &
                           (doc_relation['entity2_text'] == former['text'])))]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    context_token = doc_token
                    former_tf_start, former_tf_end = former[
                        'tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter[
                        'tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(
                            doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[
                            data.feature_name2id['[POS]']].get_index(
                                token['postag']))
                        cap.append(data.feature_alphabets[
                            data.feature_name2id['[Cap]']].get_index(
                                str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(
                                data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(former_tf_start - i))

                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(former_tf_end - i))

                        else:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(0))
                            former_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(
                                    entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(latter_tf_start - i))

                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(latter_tf_end - i))

                        else:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(0))
                            latter_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(
                                    entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the entity fell into the truncated part, so use the text in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[
                                    data.
                                    re_feature_name2id['[ENTITY]']].get_index(
                                        my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[
                                    data.
                                    re_feature_name2id['[ENTITY]']].get_index(
                                        my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {
                        'tokens': words,
                        'postag': postags,
                        'cap': cap,
                        'char': chars,
                        'positions1': positions1,
                        'positions2': positions2
                    }
                    features['e1_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            former['type'])
                    features['e2_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                            latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[(
                        (doc_entity['start'] >= former['end']) &
                        (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_NUM]']].get_index(
                            entity_between.shape[0])

                    X.append(features)

                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break

                    Y.append(data.re_feature_alphabets[data.re_feature_name2id[
                        '[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type']=='None' and gold_relations.iloc[1]['type']=='None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type']!='None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)

    logging.info("positive instance {}%, negative instance {}%".format(
        100 - neg, neg))
    return X, Y

def load_data(data_dir):

    doc_num = 0
    sent_num = 0
    entity_num = 0
    max_sent_length = 0
    min_sent_length = 9999
    total_sent_length = 0

    documents = []
    for input_file_name in os.listdir(data_dir):
        if input_file_name.find(".txt") != -1:
            document = Document()
            document.name = input_file_name

            ann_file_name = input_file_name.replace(".txt", '.ann')
            if os.path.isfile(os.path.join(data_dir, ann_file_name)):
                with codecs.open(os.path.join(data_dir, ann_file_name), 'r',
                                 'UTF-8') as fp:

                    for line in fp:
                        line = line.strip()
                        if line == '':
                            continue
                        if line[0] == '#':  # annotations for task 2 are ignored
                            continue
                        entity = {}
                        columns = line.split('\t')
                        entity['id'] = columns[0]
                        columns_1 = columns[1].split(" ")
                        entity['type'] = columns_1[0]
                        entity['start'] = int(columns_1[1])
                        entity['end'] = int(columns_1[2])
                        entity['text'] = columns[2]

                        document.entities.append(entity)
                        entity_num += 1

            with codecs.open(os.path.join(data_dir, input_file_name), 'r',
                             'UTF-8') as fp:
                document.text = fp.read()

            all_sents_inds = []
            generator = nlp_tool.span_tokenize(document.text)
            for t in generator:
                all_sents_inds.append(t)

            for ind in range(len(all_sents_inds)):
                sentence = Sentence()
                sentence.start = all_sents_inds[ind][0]
                sentence.end = all_sents_inds[ind][1]

                offset = 0
                sentence_txt = document.text[sentence.start:sentence.end]
                # replace quotes because nltk maps " to other characters, see https://github.com/nltk/nltk/issues/1630
                sentence_txt = sentence_txt.replace('"', " ")
                sentence_txt = sentence_txt.replace('\'', " ")
                for token_txt in my_tokenize(sentence_txt):
                    token = {}
                    offset = sentence_txt.find(token_txt, offset)
                    if offset == -1:
                        raise RuntimeError("can't find {} in '{}'".format(
                            token_txt, sentence_txt))

                    token['text'] = token_txt
                    token['start'] = sentence.start + offset
                    token['end'] = sentence.start + offset + len(token_txt)
                    token['wp'] = wp_tokenizer.tokenize(token_txt)
                    if len(token['wp']) == 0:
                        # for some OOV tokens (e.g., \x99), wp_tokenizer returns an empty list
                        token['wp'] = ['[UNK]']
                    # if len(document.entities) != 0:
                    #     token['label'] = getLabel_BIO(token['start'], token['end'], document.entities)
                    token['label'] = getLabel_BIO(token['start'], token['end'],
                                                  document.entities)

                    sentence.tokens.append(token)
                    offset += len(token_txt)

                document.sentences.append(sentence)
                sent_num += 1
                total_sent_length += len(sentence.tokens)
                if len(sentence.tokens) > max_sent_length:
                    max_sent_length = len(sentence.tokens)
                if len(sentence.tokens) < min_sent_length:
                    min_sent_length = len(sentence.tokens)

            documents.append(document)
            doc_num += 1

    logging.info("{} statistics".format(data_dir))
    logging.info("doc number {}, sent number {}, entity number {}".format(
        doc_num, sent_num, entity_num))
    logging.info(
        "avg sent length {}, max sent length {}, min sent length {}".format(
            total_sent_length // sent_num, max_sent_length, min_sent_length))
    return documents
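
For reference, the .ann parsing above expects brat-style standoff lines: a tab-separated annotation id, a "Type start end" field, and the surface text. A small sketch with made-up ids, types, and offsets:

# Hypothetical .ann content matching the parser in load_data; '#' lines
# (annotations for task 2) are skipped, as above.
ann_lines = [
    "T1\tDrug 0 7\tAspirin",
    "T2\tADE 23 31\theadache",
    "#1\tAnnotatorNotes T1\tchecked",
]
for line in ann_lines:
    if line == '' or line[0] == '#':
        continue
    entity_id, type_span, text = line.split('\t')
    etype, start, end = type_span.split(' ')
    print(entity_id, etype, int(start), int(end), text)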