from nltk.tokenize import TreebankWordTokenizer


def tokenize(documents):
    """Tokenize each document and align every token with the gold tag spans."""
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        real_tokens = []  # reset per document so tokens do not leak across documents
        text = doc["text"]
        file = doc["id"]
        # Normalize characters that otherwise glue tokens together.
        text = text.replace("\"", "'")
        # text = text.replace("/", " ")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        for start, end in tbw.span_tokenize(text):
            token_txt = text[start:end]
            token_tag = "O"
            token_tag_type = "O"
            for tag in doc["tags"]:
                # A token takes the label of any annotation that fully covers it.
                if int(tag["start"]) <= start and int(tag["end"]) >= end:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
            real_tokens.append({"token": token_txt, "start": start, "end": end,
                                "tag": token_tag, "tag_type": token_tag_type})
        documents2.append({"id": file, "text": text,
                           "tags": doc["tags"], "tokens": real_tokens})
    return documents2
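# A minimal sketch of how tokenize() might be called. The document/tag keys
# below mirror what the function reads ("id", "text", "tags" with "start",
# "end", "tag", "type"); the concrete values are illustrative assumptions.
docs = [{
    "id": "doc-001",
    "text": "Aspirin 81 mg was given daily.",
    "tags": [{"start": 0, "end": 7, "tag": "B-DRUG", "type": "DRUG"}],
}]
for d in tokenize(docs):
    for tok in d["tokens"]:
        print(tok["token"], tok["start"], tok["end"], tok["tag"], tok["tag_type"])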
class NLTKWordTokenizer(PackProcessor): r"""A wrapper of NLTK word tokenizer. """ def __init__(self): super().__init__() self.tokenizer = TreebankWordTokenizer() def _process(self, input_pack: DataPack): for begin, end in self.tokenizer.span_tokenize(input_pack.text): Token(input_pack, begin, end)
from nltk.tokenize import TreebankWordTokenizer


class DocumentTokenizer(object):
    """Splits a document into sentences and tokens.

    `tokenize_doc` returns a list of sentences, where each sentence is a list
    of (token, (start, end)) tuples with document-level character offsets.
    """

    def __init__(self, sent_tokenizer=None, word_tokenizer=None):
        # Fall back to the defaults when no tokenizers are supplied.
        # self.sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
        self.sent_tokenizer = sent_tokenizer if sent_tokenizer else DefaultSentenceTokenizer()
        self.word_tokenizer = word_tokenizer if word_tokenizer else TreebankWordTokenizer()

    def tokenize_doc(self, doc):
        """Takes a raw string.

        Returns a list of lists where each inner list is a sentence containing
        two-tuples of tokens and spans.
        """
        tokenized_sents_and_spans = []
        # sentence_spans is a list of (start, end) tuples.
        sentence_spans = self.sent_tokenizer.tokenize_sents(doc)
        for start, end in sentence_spans:
            sentence = doc[start:end]
            tokenized_sents_and_spans.append(
                self.tokenize_sent(sentence, start))
        return tokenized_sents_and_spans

    def tokenize_sent(self, sentence, offset):
        try:
            tokens = self.word_tokenizer.tokenize(sentence)
        except Exception:
            print("Word tokenizing failed")
            print(sentence)
            raise
        try:
            spans = self.word_tokenizer.span_tokenize(sentence)
        except Exception:
            print("Span tokenizing failed")
            print(sentence)
            raise
        tokens_and_spans = []
        for token, (start, end) in zip(tokens, spans):
            # Shift sentence-relative spans to document-level offsets.
            tokens_and_spans.append((token, (start + offset, end + offset)))
        return tokens_and_spans
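# A minimal usage sketch for DocumentTokenizer. DefaultSentenceTokenizer above
# is project-specific and not shown, so PunktSentenceSpanTokenizer is a
# hypothetical shim that exposes the tokenize_sents() interface the class
# expects (a list of (start, end) sentence spans), backed here by NLTK's Punkt.
from nltk.tokenize import PunktSentenceTokenizer


class PunktSentenceSpanTokenizer:
    def __init__(self):
        self._punkt = PunktSentenceTokenizer()

    def tokenize_sents(self, text):
        return list(self._punkt.span_tokenize(text))


doc = "The patient denies chest pain. She reports mild nausea."
tokenizer = DocumentTokenizer(sent_tokenizer=PunktSentenceSpanTokenizer())
for sentence in tokenizer.tokenize_doc(doc):
    for token, (start, end) in sentence:
        print(token, (start, end))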
from nltk.tokenize import TreebankWordTokenizer


def tokenize_en(text):
    """Receive a text string and return tokens and their character spans."""
    tokenizer = TreebankWordTokenizer()
    tokens = []
    tokens_span = []
    for start, end in tokenizer.span_tokenize(text):
        token = text[start:end]
        # Split a single trailing "." off the token (e.g. "etc." -> "etc", ".").
        if len(token) > 1 and token[-1] == "." and token.count(".") == 1:
            end_resize = end - 1
            tokens.append(text[start:end_resize])
            tokens_span.append((start, end_resize))
            tokens.append(text[end_resize:end])
            tokens_span.append((end_resize, end))
        else:
            tokens.append(token)
            tokens_span.append((start, end))
    return tokens, tokens_span
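# Quick illustrative call to tokenize_en(); the input sentence is an assumption.
tokens, spans = tokenize_en("Dr. Smith arrived at the clinic today.")
for token, (start, end) in zip(tokens, spans):
    print(repr(token), start, end)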
class NLTKWordTokenizer(PackProcessor): r"""A wrapper of NLTK word tokenizer.""" def __init__(self): super().__init__() self.tokenizer = TreebankWordTokenizer() def _process(self, input_pack: DataPack): for begin, end in self.tokenizer.span_tokenize(input_pack.text): Token(input_pack, begin, end) def record(self, record_meta: Dict[str, Set[str]]): r"""Method to add output type record of `NLTKWordTokenizer`, which is `ft.onto.base_ontology.Token`, to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. """ record_meta["ft.onto.base_ontology.Token"] = set()
import json
import os
from collections import defaultdict

from nltk.tokenize import TreebankWordTokenizer


class SrlDataReader:
    def __init__(self, vocab):
        self.vocab = vocab
        self.tokenizer = TreebankWordTokenizer()

    def read_data(self, data_dir):
        # Walk the data directory and yield training examples from every JSON file.
        for root, dirs, files in os.walk(data_dir):
            for name in files:
                if not name.endswith(".json"):
                    continue
                full_path = os.path.join(root, name)
                with open(full_path) as fin:
                    doc = json.load(fin)
                for data in self.parse_doc(doc):
                    yield data

    def parse_doc(self, doc):
        text = doc["text"]
        events = doc["events"]
        fillers = doc["fillers"]
        entities = doc["entities"]

        # Index every mention by the character offsets where it begins and ends.
        begin_map = defaultdict(list)
        end_map = defaultdict(list)
        for f in fillers:
            begin_map[f["begin"]].append((f["id"], f["type"]))
            end_map[f["end"]].append((f["id"], f["type"]))
        for entity in entities:
            for em in entity["mentions"]:
                begin_map[em["begin"]].append((em["id"], em["type"]))
                end_map[em["end"]].append((em["id"], em["type"]))
        for event in events:
            for evm in event["mentions"]:
                begin_map[evm["begin"]].append((evm["id"], evm["type"]))
                end_map[evm["end"]].append((evm["id"], evm["type"]))

        indexed_doc = []
        sent_offset = 0
        on_types = set()
        for sentence in text.split("\n"):
            word_spans = self.tokenizer.span_tokenize(sentence)
            tokens = []
            tags = []  # reset per sentence so tokens and tags stay aligned
            for b, e in word_spans:
                token_text = sentence[b:e]
                indexed_doc.append(self.vocab.get(token_text, 0))
                # Map sentence-relative offsets to document-level offsets.
                begin = sent_offset + b
                end = sent_offset + e
                token_tags = []
                for obj_id, obj_type in begin_map[begin]:
                    token_tags.append("B_" + obj_type)
                    on_types.add(obj_type)
                for obj_id, obj_type in end_map[end]:
                    token_tags.append("I_" + obj_type)
                    on_types.discard(obj_type)
                if on_types:
                    for t in on_types:
                        token_tags.append("I_" + t)
                else:
                    token_tags.append("O")
                tags.append(token_tags)
                tokens.append(self.vocab.get(token_text, 0))  # 0 for OOV, as in indexed_doc
            sent_offset += len(sentence) + 1
            yield tokens, tags
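# An illustrative call to parse_doc() on a tiny in-memory document. The field
# names mirror the keys the reader accesses ("text", "events", "fillers",
# "entities" with mention "id"/"type"/"begin"/"end"); the vocabulary and the
# document contents are assumptions for demonstration only.
reader = SrlDataReader(vocab={"John": 1, "sold": 2, "the": 3, "car": 4, ".": 5})
tiny_doc = {
    "text": "John sold the car .",
    "events": [{"mentions": [{"id": "evm-1", "type": "Transaction", "begin": 5, "end": 9}]}],
    "fillers": [],
    "entities": [{"mentions": [{"id": "em-1", "type": "Person", "begin": 0, "end": 4}]}],
}
for sent_tokens, sent_tags in reader.parse_doc(tiny_doc):
    print(sent_tokens, sent_tags)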