Example #1
def span_tokenize(self, s):
    if self._blanklines == 'keep':
        for span in string_span_tokenize(s, r'\n'):
            yield span
    else:
        for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
            yield span
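These first examples appear to come from NLTK's line tokenizer (nltk/tokenize/simple.py, as example #2's attribution suggests). A minimal sketch of what the second branch yields, calling regexp_span_tokenize from nltk.tokenize.util on an illustrative string:

from nltk.tokenize.util import regexp_span_tokenize

text = "first line\n\n  \nsecond line"
# Splitting at runs of blank lines yields (start, end) offsets into the
# original string; the blank lines themselves are skipped.
for start, end in regexp_span_tokenize(text, r'\n(\s+\n)*'):
    print((start, end), repr(text[start:end]))
# prints (0, 10) 'first line' and (15, 26) 'second line'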
Example #2
File: simple.py  Project: prz3m/kind2anki
def span_tokenize(self, s):
    if self._blanklines == 'keep':
        for span in string_span_tokenize(s, r'\n'):
            yield span
    else:
        for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
            yield span
Example #3

def split_by_sentence(start=0, end=63):
    """
    Split the documents into sentences (needed to build the end-to-end system).
    :param start: index of the first document to process
    :param end: index one past the last document to process
    :return: None; the sentence spans are saved to JSON
    """
    raw_text_dir = read.read_from_json('raw_data_dir')  # in folder data/
    raw_dir_simple = read.read_from_json('raw_dir_simple')  # in folder data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)

        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            sent_spans = list(
                regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json(
            "training_sentence/sentences/" + raw_dir_simple[data_id],
            sent_span_list)
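The read and spans helpers above are project-local and not shown on this page. The sketch below is a hypothetical reconstruction of spans(), assuming it aligns each sentence returned by sent_tokenize back to a (sentence_text, start_offset) pair in raw_text, which matches how sent_tokenize_span[0] and sent_tokenize_span[1] are used in the loop:

def spans(sentences, raw_text):
    # Hypothetical helper (not taken from the project): locate each sentence
    # in raw_text, in order, and yield (sentence_text, start_offset) so that
    # line spans found inside a sentence can be shifted back to document
    # coordinates by adding the offset.
    pointer = 0
    for sent in sentences:
        start = raw_text.find(sent, pointer)
        pointer = start + len(sent)
        yield sent, start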
Example #4

def split_by_sentence(raw_text, char_vocab):
    """
    Variant that works on an in-memory string: split raw_text into sentences,
    update per-character counts in char_vocab, and re-split any sentence of
    350 or more characters with process.rule_based_tokenizer.
    """
    sent_tokenize_list = sent_tokenize(raw_text)
    sent_tokenize_span_list = process.spans(sent_tokenize_list, raw_text)
    sent_span_list = list()
    max_len = list()
    for sent_tokenize_span in sent_tokenize_span_list:
        sent_spans = list(regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
        for sent_span in sent_spans:
            sent_span = (sent_span[0] + sent_tokenize_span[1],
                         sent_span[1] + sent_tokenize_span[1])
            sent = raw_text[sent_span[0]:sent_span[1]]
            for char in sent:
                char_vocab[char] += 1
            if len(sent) >= 350:
                #print sent
                multi_sent_span, multi_sent_len = process.rule_based_tokenizer(
                    sent, sent_span)
                sent_span_list += multi_sent_span
                max_len += multi_sent_len
                if max(multi_sent_len) > 350:
                    print(sent)
            elif len(list(set(sent))) >= 2:
                sent_span_list.append([sent, sent_span[0], sent_span[1]])
                max_len.append(len(sent))
    return sent_span_list, max_len, char_vocab
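A hedged usage sketch for this variant. char_vocab only needs to support indexed +=, so a collections.Counter is assumed here, and the input path is purely illustrative:

from collections import Counter

with open("some_document.txt") as f:  # assumed path, for illustration only
    raw_text = f.read()

sent_span_list, max_len, char_vocab = split_by_sentence(raw_text, Counter())
# sent_span_list holds [sentence, start, end] triples, max_len the
# per-sentence character lengths used to check the 350-character cap,
# and char_vocab the accumulated character counts.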
Example #5
def span_tokenize(self, s):
    if self._blanklines == "keep":
        for span in string_span_tokenize(s, r"\n"):
            yield span
    else:
        for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
            yield span
Example #6
def span_tokenize(self, s):
    if self._blanklines == "keep":
        for span in string_span_tokenize(s, r"\n"):
            yield span
    else:
        for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
            yield span
Example #7
def span_tokenize(self, text):
    if self._gaps:
        for left, right in regexp_span_tokenize(text, self._regexp):
            if not (self._discard_empty and left == right):
                yield left, right
    else:
        for m in finditer(self._regexp, text):
            yield m.span()
Example #8
def span_tokenize(self, text):
    if self._gaps:
        for left, right in regexp_span_tokenize(text, self._regexp):
            if not (self._discard_empty and left == right):
                yield left, right
    else:
        for m in re.finditer(self._regexp, text):
            yield m.span()
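Examples #7 and #8 appear to be the two branches of NLTK's RegexpTokenizer.span_tokenize: with gaps=True the pattern describes the separators, otherwise it describes the tokens themselves. A short sketch on an illustrative sentence:

from nltk.tokenize import RegexpTokenizer

text = "Good muffins cost $3.88"

# gaps=True: spans cover the material between whitespace runs.
print(list(RegexpTokenizer(r"\s+", gaps=True).span_tokenize(text)))

# gaps=False (the default): spans cover the regex matches themselves.
print(list(RegexpTokenizer(r"\w+").span_tokenize(text)))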
Example #9
def gap_split(self, text, return_spans=False):
    if len(text) == 0:
        return []
    self._check_regex()

    spans = [
        (left, right)
        for left, right in regexp_span_tokenize(text, self._regex['gap'])
        if not (left == right)
    ]
    tokens = [text[span[0]:span[1]] for span in spans]

    if return_spans:
        return spans
    else:
        return tokens
Example #10
def span_tokenize(self, s):
    if self._blanklines == "keep":
        yield from string_span_tokenize(s, r"\n")
    else:
        yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
Example #11

def use_context_words(ann_file_tbf, dataset="_train", window_size=4):
    """
    Build two files, one with positive examples and one with negative
    examples. Each event pair, surrounded by its context words, sits on one
    line separated by whitespace, e.g.: c1 c2 e1 c3 c4 c5 c6 e2 c7 c8
    """

    events, corefs, afters, parents = read_annotations(ann_file_tbf)

    data_folder = os.path.join("data", "LDC2016E130_V5", "data", "all")
    positives, negatives = [], []
    for doc_id in events:
        for event_id in events[doc_id]:
            for ind, to_event_id in enumerate(events[doc_id]):
                if event_id == to_event_id:
                    continue
                linked_event_ids = [event_id, to_event_id]
                is_positive = linked_event_ids in afters[doc_id].values()
                if not is_positive and ind % 30 != 0:
                    continue
                with open(os.path.join(data_folder, doc_id+".txt")) as file:
                    text = file.read()
                    replacements = [(" author=", "_author="),
                                    (" datetime=", "_datetime="),
                                    (" id=", "_id="),
                                    (" alt=", "_alt="),
                                    ("doc id", "doc_id"),
                                    ("img src", "img_src"),
                                    ("a href", "a_href"),
                                    ("\n" , " "),
                                    #(" <", "_<"),
                                    #("> ", ">_"),
                    ]
                    for r in replacements:
                        text = text.replace(r[0], r[1])

                token_list = list(regexp_span_tokenize(text, r'\s'))
                ctx_word_list = [] # [doc_id,event_id, to_event_id]
                for i in range(2):
                    e_id = linked_event_ids[i]
                    event_offsets = tuple([int(a) for a in events[doc_id][e_id]["offsets"].split(",")])
                    try:
                        nugget_ind = token_list.index(event_offsets)
                    except ValueError:
                        try:
                            new_nugget_ind = [ind for ind,off in enumerate(token_list) if
                                              off[0] == event_offsets[0] or
                                              off[1] == event_offsets[1] or
                                              off[0]-1 == event_offsets[0] or
                                              off[0]+1 == event_offsets[0]
                                              ][0]
                            nugget_ind = new_nugget_ind
                        except IndexError:
                            print(is_positive, doc_id, events[doc_id][e_id]["nugget"],
                                  text[event_offsets[0]-5:event_offsets[1]+5])
                            if is_positive:
                                import ipdb; ipdb.set_trace()
                            continue
                    # found the nugget in the tokenized text
                    # index of the nugget in token list is nugget_ind
                    if i == 0:
                        ctx_word_list.append(doc_id)
                        ctx_word_list.append(event_id)
                        ctx_word_list.append(to_event_id)
                    for t_ind in range(nugget_ind-window_size, nugget_ind + window_size + 1):
                        if 0 > t_ind or t_ind >= len(token_list):
                            context_word = "pad"
                        else:
                            context_word = text[token_list[t_ind][0]:token_list[t_ind][1]]
                            # remove <post> <a href.. > etc
                            context_word = re.sub(r"<.*>", "", context_word)
                        ctx_word_list.append(context_word.strip('"\',.:“”'))

                if is_positive:
                    positives.append(" ".join(ctx_word_list))
                else:
                    negatives.append(" ".join(ctx_word_list))
    with open("seq_positives%s_%s.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(positives))
    with open("seq_negatives%s_%s.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(negatives[:len(positives)*2]))
    with open("seq_negatives%s_%s_all.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(negatives))
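A hypothetical invocation sketch; the .tbf annotation path is an assumption, while the output file names follow from the format strings above:

# "train.tbf" is an assumed annotation file path.
use_context_words("train.tbf", dataset="_train", window_size=4)
# With these arguments the function writes seq_positives_train_4.txt,
# seq_negatives_train_4.txt (negatives capped at twice the number of
# positives) and seq_negatives_train_4_all.txt, one context-word line
# per event pair.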