def tag(self, sent):
    times = self.find_time(sent)
    intervals = dict([(time[0], time[1]) for time in times])
    tag_dict = dict([(time[2], time[3]) for time in times])

    tokenizer = WordPunctTokenizer()
    # for a in [time[2] for time in times]:
    #     tokenizer.add_mwe(a.split())

    # --- FIXED ---
    original_tokens = tokenizer.tokenize(sent)
    original_tags = pos_tag(original_tokens)
    # --- END FIXED ---

    tokens = []
    current = 0
    for span in tokenizer.span_tokenize(sent):
        if span[0] < current:
            continue
        if span[0] in intervals:
            tokens.append(f'__{sent[span[0]: intervals[span[0]]]}')
            current = intervals[span[0]]
        else:
            tokens.append(sent[span[0]:span[1]])
            current = span[1]

    tags = pos_tag(tokens)

    new_tags = []
    for word, tag in tags:
        if word[:2] == '__':
            new_tags.append((word[2:], tag_dict[word[2:]]))
        else:
            tag = [t[1] for t in original_tags if t[0] == word][0]  # FIXED
            new_tags.append((word, tag))

    return new_tags
from nltk import pos_tag
from nltk.tokenize import WordPunctTokenizer


def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))

    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]

    # Tag parts-of-speech.
    tags = pos_tag(tokens)

    return [
        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )
        for (c1, c2), token, (_, pos) in zip(spans, tokens, tags)
    ]
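A quick usage sketch of the `tokenize` helper above (assuming NLTK and its `averaged_perceptron_tagger` data are installed); the sample sentence is arbitrary and the exact POS tags depend on the tagger model:

from pprint import pprint

# Hypothetical input; the output shape follows tokenize() above.
pprint(tokenize("Dogs bark loudly."))
# Roughly (POS tags depend on the tagger model):
# [{'char1': 0, 'char2': 4, 'pos': 'NNS', 'token': 'dogs'},
#  {'char1': 5, 'char2': 9, 'pos': 'VBP', 'token': 'bark'},
#  {'char1': 10, 'char2': 16, 'pos': 'RB', 'token': 'loudly'},
#  {'char1': 16, 'char2': 17, 'pos': '.', 'token': '.'}]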
def tokens(self):
    """Tokenize the text.
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(self.text))

    # Materialize the token stream.
    tokens = [self.text[c1:c2] for c1, c2 in spans]

    tags = pos_tag(tokens)

    return [
        Token(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )
        for (c1, c2), token, (_, pos) in zip(spans, tokens, tags)
    ]
def __get_words_boundaries(self):
    """
    Tokenize the words in the document and return their boundaries
    (character spans), using a tokenizer.
    :return: list of (start, end) character offsets
    """
    tokenizer = WordPunctTokenizer()
    words = list(tokenizer.span_tokenize(self.text))
    return words
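For reference, a standalone sketch of the spans that `span_tokenize` produces; the sample text here is arbitrary:

from nltk.tokenize import WordPunctTokenizer

text = "Hello, world! Spans are character offsets."
spans = list(WordPunctTokenizer().span_tokenize(text))

print(spans[:4])                              # [(0, 5), (5, 6), (7, 12), (12, 13)]
print([text[c1:c2] for c1, c2 in spans[:4]])  # ['Hello', ',', 'world', '!']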
from nltk.tokenize import WordPunctTokenizer


def change_db2(text, origin_dict, id):
    print(origin_dict)
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print("MARKUP", markup_error_line)
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
from typing import List

from nltk.tokenize import WordPunctTokenizer


def word_loss(chunk: str, losses: List[float]) -> List[float]:
    tokenizer = WordPunctTokenizer()
    spans = list(tokenizer.span_tokenize(chunk))
    # FIXME - this is a bit optimistic, do it properly:
    # gather outputs, check wordpunct tokenizer regexp against characters
    return [
        sum(losses[start:end])  # first and last might be incomplete
        for start, end in spans[1:-1]
    ]
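An illustrative call of `word_loss` with made-up per-character losses (the text and values are arbitrary), showing how the first and last spans are dropped:

chunk = "the cat sat on"
losses = [1.0] * len(chunk)      # one made-up loss value per character

print(word_loss(chunk, losses))  # [3.0, 3.0]
# WordPunctTokenizer spans are (0, 3), (4, 7), (8, 11), (12, 14);
# spans[1:-1] keeps only "cat" and "sat", whose character losses sum to 3.0 each.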
import os

from lxml import etree
from nltk import pos_tag
from nltk.tokenize import WordPunctTokenizer


def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print('Something went wrong when opening/parsing the xml file, or opening the output file')
        return

    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)

    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)

    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print('entity mention head span not found', span, apf_path)

    for t in ts:
        # print(t.text, t.span)
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
class NltkTokenizer(Tokenizer):
    """
    Tokenizer that uses WordPunctTokenizer from NLTK
    """

    tokenizer: WordPunctTokenizer

    def __init__(self) -> None:
        self.tokenizer = WordPunctTokenizer()

    def tokenize(self, text: str) -> List[Token]:
        spans = list(self.tokenizer.span_tokenize(text))
        words = [text[span_start:span_end] for span_start, span_end in spans]
        return [Token(word=word, span=span) for word, span in zip(words, spans)]
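A self-contained usage sketch of the class above; the `Token` and `Tokenizer` definitions below are minimal stand-ins for the project's own types (assumptions made only so the example runs), and in the project they would precede `NltkTokenizer`:

from typing import List, NamedTuple, Tuple

from nltk.tokenize import WordPunctTokenizer


class Token(NamedTuple):      # stand-in for the project's Token type (assumption)
    word: str
    span: Tuple[int, int]


class Tokenizer:              # stand-in base class (assumption)
    def tokenize(self, text: str) -> List[Token]:
        raise NotImplementedError


tokenizer = NltkTokenizer()   # with NltkTokenizer defined as above
for token in tokenizer.tokenize("Hello, world!"):
    print(token.word, token.span)
# Hello (0, 5) / , (5, 6) / world (7, 12) / ! (12, 13)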
ann_path = os.path.join(brat_data_dirname, filename)
if os.stat(ann_path).st_size > 0:
    doc_path = os.path.join(brat_data_dirname, '%s.txt' % basename)
    with open(doc_path, 'r') as doc_file, open(ann_path, 'r') as ann_file:
        start_dict = {}
        end_dict = {}
        for (id, type_pos, entity) in (tuple(line.strip().split('\t')) for line in ann_file):
            (type, start, end) = type_pos.split(' ')
            start_dict[start] = type
            end_dict[end] = type

        content = doc_file.read()
        ann_content = ""
        active_type = 'O'
        for (start, end) in tokenizer.span_tokenize(content):
            if str(start) in start_dict:
                active_type = start_dict[str(start)]
            # print("%s\t%s" % (content[start:end], active_type))
            col_file.write("%s\t%s\n" % (content[start:end], active_type))
            if str(end) in end_dict:
                active_type = 'O'