def span_tokenize(self, s):
    if self._blanklines == 'keep':
        for span in string_span_tokenize(s, r'\n'):
            yield span
    else:
        for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
            yield span
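# Minimal usage sketch (not from the snippets in this file, assuming NLTK is installed
# and that regexp_span_tokenize is nltk.tokenize.util.regexp_span_tokenize): the helper
# yields (start, end) character offsets of the text between successive matches of the
# pattern, which is what the span_tokenize variants here rely on.
from nltk.tokenize.util import regexp_span_tokenize

text = "First line.\nSecond line."
for start, end in regexp_span_tokenize(text, r"\n"):
    print((start, end), repr(text[start:end]))
# Expected: (0, 11) 'First line.' and (12, 24) 'Second line.'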
def split_by_sentence(start=0, end=63):
    """
    Split the documents into sentences (needed to build the end-to-end system).
    :param start: index of the first document to process
    :param end: index one past the last document to process
    :return:
    """
    raw_text_dir = read.read_from_json('raw_data_dir')      # in folder data/
    raw_dir_simple = read.read_from_json('raw_dir_simple')  # in folder data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)
        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            # Re-split each sentence on newlines; the resulting spans are relative to
            # the sentence, so shift them back to document-level character offsets.
            sent_spans = list(regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json("training_sentence/sentences/" + raw_dir_simple[data_id],
                          sent_span_list)
def split_by_sentence(raw_text, char_vocab):
    sent_tokenize_list = sent_tokenize(raw_text)
    sent_tokenize_span_list = process.spans(sent_tokenize_list, raw_text)
    sent_span_list = list()
    max_len = list()
    for sent_tokenize_span in sent_tokenize_span_list:
        # Re-split each sentence on newlines and shift the sentence-relative spans
        # back to document-level character offsets.
        sent_spans = list(regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
        for sent_span in sent_spans:
            sent_span = (sent_span[0] + sent_tokenize_span[1],
                         sent_span[1] + sent_tokenize_span[1])
            sent = raw_text[sent_span[0]:sent_span[1]]
            for char in sent:
                char_vocab[char] += 1
            if len(sent) >= 350:
                # Overly long sentences are split further by the rule-based tokenizer.
                multi_sent_span, multi_sent_len = process.rule_based_tokenizer(sent, sent_span)
                sent_span_list += multi_sent_span
                max_len += multi_sent_len
                if max(multi_sent_len) > 350:
                    print(sent)
            elif len(list(set(sent))) >= 2:
                # Skip fragments that consist of a single repeated character.
                sent_span_list.append([sent, sent_span[0], sent_span[1]])
                max_len.append(len(sent))
    return sent_span_list, max_len, char_vocab
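# Self-contained sketch (not from the repository) of the offset-shifting idea used by
# the split_by_sentence functions above. The str.find-based offset lookup stands in for
# the repository's spans()/process.spans() helper, and NLTK's punkt model must be
# available for sent_tokenize.
from nltk.tokenize import sent_tokenize
from nltk.tokenize.util import regexp_span_tokenize

def sentence_spans(raw_text):
    results = []
    cursor = 0
    for sent in sent_tokenize(raw_text):
        start = raw_text.find(sent, cursor)  # locate the sentence inside the document
        cursor = start + len(sent)
        for left, right in regexp_span_tokenize(sent, r'\n'):
            # shift sentence-relative offsets to document-level offsets
            results.append((sent[left:right], start + left, start + right))
    return results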
def span_tokenize(self, s):
    if self._blanklines == "keep":
        for span in string_span_tokenize(s, r"\n"):
            yield span
    else:
        for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
            yield span
def span_tokenize(self, text):
    # Assumes `finditer` is imported from the re module at module level.
    if self._gaps:
        for left, right in regexp_span_tokenize(text, self._regexp):
            if not (self._discard_empty and left == right):
                yield left, right
    else:
        for m in finditer(self._regexp, text):
            yield m.span()
def span_tokenize(self, text):
    if self._gaps:
        for left, right in regexp_span_tokenize(text, self._regexp):
            if not (self._discard_empty and left == right):
                yield left, right
    else:
        for m in re.finditer(self._regexp, text):
            yield m.span()
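# Hedged usage sketch: the two branches above appear to correspond to NLTK's
# RegexpTokenizer, where gaps=True treats the pattern as a separator (spans come from
# regexp_span_tokenize) and gaps=False treats it as the token pattern itself (spans
# come from re.finditer).
from nltk.tokenize import RegexpTokenizer

text = "Good muffins cost $3.88"
print(list(RegexpTokenizer(r"\s+", gaps=True).span_tokenize(text)))
# [(0, 4), (5, 12), (13, 17), (18, 23)]
print(list(RegexpTokenizer(r"\w+").span_tokenize(text)))
# [(0, 4), (5, 12), (13, 17), (19, 20), (21, 23)]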
def gap_split(self, text, return_spans=False):
    if len(text) == 0:
        return []
    self._check_regex()
    # Split on the gap pattern, discarding empty spans.
    spans = [
        (left, right)
        for left, right in regexp_span_tokenize(text, self._regex['gap'])
        if not (left == right)
    ]
    tokens = [text[span[0]:span[1]] for span in spans]
    if return_spans:
        return spans
    else:
        return tokens
def span_tokenize(self, s):
    if self._blanklines == "keep":
        yield from string_span_tokenize(s, r"\n")
    else:
        yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
def use_context_words(ann_file_tbf, dataset="_train", window_size=4):
    """
    Builds two files, one with positive examples and one with negative examples.
    Each event pair, surrounded by its context words, lies on one line separated by
    whitespace. E.g.: c1 c2 e1 c3 c4 c5 c6 e2 c7 c8
    """
    events, corefs, afters, parents = read_annotations(ann_file_tbf)
    data_folder = os.path.join("data", "LDC2016E130_V5", "data", "all")
    positives, negatives = [], []
    for doc_id in events:
        for event_id in events[doc_id]:
            for ind, to_event_id in enumerate(events[doc_id]):
                if event_id == to_event_id:
                    continue
                linked_event_ids = [event_id, to_event_id]
                is_positive = linked_event_ids in afters[doc_id].values()
                # Subsample negative pairs: keep only every 30th candidate.
                if not is_positive and ind % 30 != 0:
                    continue
                with open(os.path.join(data_folder, doc_id + ".txt")) as file:
                    text = file.read()
                # Glue attribute names to their tags so whitespace tokenization does
                # not split markup into separate tokens.
                replacements = [
                    (" author=", "_author="),
                    (" datetime=", "_datetime="),
                    (" id=", "_id="),
                    (" alt=", "_alt="),
                    ("doc id", "doc_id"),
                    ("img src", "img_src"),
                    ("a href", "a_href"),
                    ("\n", " "),
                    # (" <", "_<"),
                    # ("> ", ">_"),
                ]
                for r in replacements:
                    text = text.replace(r[0], r[1])
                token_list = list(regexp_span_tokenize(text, r'\s'))
                ctx_word_list = []  # [doc_id, event_id, to_event_id, context words...]
                for i in range(2):
                    e_id = linked_event_ids[i]
                    event_offsets = tuple(
                        [int(a) for a in events[doc_id][e_id]["offsets"].split(",")]
                    )
                    try:
                        nugget_ind = token_list.index(event_offsets)
                    except ValueError:
                        # Fall back to a fuzzy match when the annotated offsets do not
                        # line up exactly with a token span.
                        try:
                            new_nugget_ind = [
                                ind
                                for ind, off in enumerate(token_list)
                                if off[0] == event_offsets[0]
                                or off[1] == event_offsets[1]
                                or off[0] - 1 == event_offsets[0]
                                or off[0] + 1 == event_offsets[0]
                            ][0]
                            nugget_ind = new_nugget_ind
                        except IndexError:
                            print(is_positive, doc_id,
                                  events[doc_id][e_id]["nugget"],
                                  text[event_offsets[0] - 5:event_offsets[1] + 5])
                            if is_positive:
                                # debugging aid kept from the original code
                                import ipdb; ipdb.set_trace()
                            continue
                    # Found the nugget in the tokenized text; its index in token_list
                    # is nugget_ind.
                    if i == 0:
                        ctx_word_list.append(doc_id)
                        ctx_word_list.append(event_id)
                        ctx_word_list.append(to_event_id)
                    for t_ind in range(nugget_ind - window_size, nugget_ind + window_size + 1):
                        if 0 > t_ind or t_ind >= len(token_list):
                            context_word = "pad"
                        else:
                            context_word = text[token_list[t_ind][0]:token_list[t_ind][1]]
                        # remove <post>, <a href ...> etc.
                        context_word = re.sub(r"<.*>", "", context_word)
                        ctx_word_list.append(context_word.strip('"\',.:“”'))
                if is_positive:
                    positives.append(" ".join(ctx_word_list))
                else:
                    negatives.append(" ".join(ctx_word_list))
    with open("seq_positives%s_%s.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(positives))
    with open("seq_negatives%s_%s.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(negatives[:len(positives) * 2]))
    with open("seq_negatives%s_%s_all.txt" % (dataset, window_size), "w") as file:
        file.write("\n".join(negatives))
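# Toy sketch (not from the repository) of the context-window extraction that
# use_context_words performs: whitespace-delimited token spans come from
# regexp_span_tokenize, and a window of +/- window_size tokens around the event nugget
# is collected, padded with "pad" at the document edges. The sentence and nugget index
# below are illustrative.
from nltk.tokenize.util import regexp_span_tokenize

text = "The attack happened on Tuesday near the border"
token_list = list(regexp_span_tokenize(text, r"\s"))
nugget_ind = 1          # pretend the nugget is the token "attack"
window_size = 4
window = []
for t_ind in range(nugget_ind - window_size, nugget_ind + window_size + 1):
    if 0 > t_ind or t_ind >= len(token_list):
        window.append("pad")
    else:
        left, right = token_list[t_ind]
        window.append(text[left:right])
print(" ".join(window))
# pad pad pad The attack happened on Tuesday near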