def generate_words_tags_from_tsv(tsv_file_path, lower=False, gold=True, max_seq_length=None,
                                 sent_delimiter=None, char_level=False, hard_constraint=False):
    """Yield ``(words, tags)`` pairs read from a ``.tsv`` tagging file.

    Args:
        tsv_file_path: Path to the tab-separated file (one token per row,
            blank rows end a sentence — see ``read_tsv_as_sents``).
        lower: Lowercase every token if ``True``.
        gold: Read gold tags from the second column; otherwise yield ``None`` tags.
        max_seq_length: If set, split sentences into chunks that fit this length.
        sent_delimiter: Delimiter token(s) preferred as split points.
        char_level: Measure length in characters rather than tokens.
        hard_constraint: Force splitting even when no delimiter is found.

    Yields:
        Tuples of ``(words, tags)`` where ``tags`` is ``None`` when ``gold`` is false.

    Raises:
        ValueError: If a row lacks the tag column while ``gold`` is set.
    """
    for sent in read_tsv_as_sents(tsv_file_path):
        words = [cells[0] for cells in sent]
        if max_seq_length:
            offset = 0
            # Try to split the sequence so each chunk fits into max_seq_length.
            for shorter_words in split_long_sentence_into(words, max_seq_length, sent_delimiter,
                                                          char_level, hard_constraint):
                if gold:
                    # Tags are sliced in lockstep with the word chunk.
                    shorter_tags = [cells[1] for cells in sent[offset:offset + len(shorter_words)]]
                    offset += len(shorter_words)
                else:
                    shorter_tags = None
                if lower:
                    shorter_words = [word.lower() for word in shorter_words]
                yield shorter_words, shorter_tags
        else:
            if gold:
                try:
                    tags = [cells[1] for cells in sent]
                except IndexError as e:
                    # Narrowed from a bare ``except`` which also swallowed
                    # KeyboardInterrupt/SystemExit and discarded the cause.
                    raise ValueError(f'Failed to load {tsv_file_path}: {sent}') from e
            else:
                tags = None
            if lower:
                words = [word.lower() for word in words]
            yield words, tags
def generator_words_tags(tsv_file_path, lower=True, gold=True, max_seq_length=None):
    """Yield ``(words, tags)`` pairs from a ``.tsv`` tagging file.

    Args:
        tsv_file_path: Path to the tab-separated file.
        lower: Lowercase every token if ``True``.
        gold: Read gold tags from the second column; otherwise tags are ``None``.
        max_seq_length: If set, sentences longer than this are split into chunks.

    Yields:
        Tuples of ``(words, tags)``.
    """
    for sent in read_tsv(tsv_file_path):
        words = [cells[0] for cells in sent]
        if not (max_seq_length and len(words) > max_seq_length):
            # Short enough: emit the sentence as-is.
            tags = [cells[1] for cells in sent] if gold else None
            if lower:
                words = [w.lower() for w in words]
            yield words, tags
            continue
        # Too long: try to split the sequence to fit max_seq_length.
        start = 0
        for chunk in split_long_sentence_into(words, max_seq_length):
            end = start + len(chunk)
            chunk_tags = [cells[1] for cells in sent[start:end]] if gold else None
            start = end
            if lower:
                chunk = [w.lower() for w in chunk]
            yield chunk, chunk_tags
def test_split_127(self):
    # A 127-character sentence must be split once to fit a budget of 126;
    # the final clause after the last Chinese comma holds exactly 9 tokens,
    # so the expected output is [everything up to that comma, the last 9].
    sent = [
        '“', '旧', '货', '”', '不', '仅', '仅', '是', '指', '新', '货', '被',
        '使', '用', '才', '成', '为', '旧', '货', ';', '还', '包', '括', '商',
        '品', '的', '调', '剂', ',', '即', '卖', '出', '旧', '货', '的', '人',
        '是', '为', '了', '买', '入', '新', '货', ',', '买', '入', '旧', '货',
        '的', '人', '是', '因', '为', '符', '合', '自', '己', '的', '需', '要',
        ',', '不', '管', '新', '旧', ';', '有', '的', '商', '店', '还', '包',
        '括', '一', '些', '高', '档', '的', '工', '艺', '品', '、', '古', '董',
        '、', '字', '画', '、', '家', '具', '等', '商', '品', ';', '有', '的',
        '还', '包', '括', '新', '货', '卖', '不', '出', '去', ',', '企', '业',
        '或', '店', '主', '为', '了', '盘', '活', '资', '金', ',', '削', '价',
        '销', '售', '积', '压', '产', '品', '。'
    ]
    chunks = list(split_long_sentence_into(sent, 126))
    self.assertListEqual([sent[:-9], sent[-9:]], chunks)
def load_file(self, filepath: str):
    """Load a tokenized corpus. The format is one sentence per line, where each
    line consists of tokens separated by a delimiter (usually space).

    .. highlight:: bash
    .. code-block:: bash

        $ head train.txt
        上海 浦东 开发 与 法制 建设 同步
        新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

    Args:
        filepath: The path to the corpus.

    Yields:
        Dicts of the form ``{'token': [...]}``, one per (possibly split) sentence.
    """
    f = TimingFileIterator(filepath)
    for line in f:
        line = line.rstrip('\n')
        tokens = line.split(self.delimiter)
        if not tokens:
            continue
        # Length is measured in characters (sum of token lengths), not tokens.
        if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
            # Try to split the sequence so each chunk fits into max_seq_len.
            for short_sents in split_long_sentence_into(tokens,
                                                        self.max_seq_len,
                                                        self.sent_delimiter,
                                                        char_level=self.char_level,
                                                        hard_constraint=self.hard_constraint):
                yield {'token': short_sents}
        else:
            yield {'token': tokens}
        f.log(line[:20])
    f.erase()
def load_file(self, filepath):
    """Load a ``.tsv`` file. A ``.tsv`` file for tagging is defined as a tab
    separated text file, where non-empty lines have two columns for token and
    tag respectively, empty lines mark the end of sentences.

    Args:
        filepath: Path to a ``.tsv`` tagging file.

    .. highlight:: bash
    .. code-block:: bash

        $ head eng.train.tsv
        -DOCSTART-      O

        EU      S-ORG
        rejects O
        German  S-MISC
        call    O
        to      O
        boycott O
        British S-MISC
        lamb    O

    Yields:
        Dicts of the form ``{'token': [...], 'tag': [...]}``.
    """
    filepath = get_resource(filepath)
    for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
        if self.max_seq_len:
            # Try to split the sequence so each chunk fits into max_seq_len;
            # tags are sliced in lockstep with the word chunks.
            start = 0
            for short_sents in split_long_sentence_into(
                    words,
                    self.max_seq_len,
                    self.sent_delimiter,
                    char_level=self.char_level,
                    hard_constraint=self.hard_constraint):
                end = start + len(short_sents)
                yield {'token': short_sents, 'tag': tags[start:end]}
                start = end
        else:
            yield {'token': words, 'tag': tags}
def test_split_long_sentence_into(self):
    # Splits prefer delimiter positions, so chunks may exceed the soft limit.
    tokens = ['a', 'b', 'c', ',', 'd', 'e', ',', 'f', 'g', ',', 'h']
    expected = [['a', 'b', 'c', ','], ['d', 'e', ','], ['f', 'g', ','], ['h']]
    self.assertListEqual(expected, list(split_long_sentence_into(tokens, 2)))