def read_all(path: Path, add_eos):
    """
    Lazily parse a dialog file and yield one ``Dialog`` per blank-line-delimited group.

    Every non-blank line is tab separated. The last two columns are mandatory:
    an integer character id followed by a whitespace separated sequence of
    integer token ids. When more than two columns are present the first one is
    used as the utterance uid, and when more than three are present the second
    one is parsed as a float weight.

    :param path: file to read; opened with ``IO.reader``
    :param add_eos: when truthy, ``EOS_TOK_IDX`` is appended to every sequence
        that does not already end with it
    :return: generator of non-empty ``Dialog`` objects
    """
    count = 0
    with IO.reader(path) as reader:
        current = Dialog()
        for raw in reader:
            text = raw.strip()
            if not text:
                # A blank line closes the dialog collected so far (if any).
                if len(current) > 0:
                    yield current
                    count += 1
                    current = Dialog()
                continue

            cols = text.split("\t")
            char_col, seq_col = cols[-2:]  # the last two are mandatory
            uid = cols[0] if len(cols) > 2 else None
            weight = float(cols[1]) if len(cols) > 3 else None
            char_id = int(char_col)
            token_ids = [int(tok) for tok in seq_col.strip().split()]
            if add_eos and token_ids[-1] != EOS_TOK_IDX:
                token_ids.append(EOS_TOK_IDX)
            current.append(Utterance(char_id, token_ids, uid=uid, weight=weight))

        # The file may end without a trailing blank line: flush what is left.
        if len(current) > 0:
            count += 1
            yield current
    log.info(f"Read {count} dialogs")
def read_msg_resp(path: str):
    """
    Read tab separated ``message<TAB>response`` pairs.

    Each line is stripped of surrounding whitespace and split on tabs; only
    lines that produce exactly two fields are kept, everything else (blank
    lines, lines with one or more than two fields) is silently skipped.

    :param path: either a filesystem path (``str`` or any ``os.PathLike``),
        which is opened with ``IO.reader``, or an already opened reader / any
        iterable of text lines, which is consumed directly
    :return: a ``(msgs, resps)`` tuple of two equally long lists of strings
    """
    import os  # local import: only needed for the os.PathLike check below

    def _read(rdr):
        msgs, resps = [], []
        for raw in rdr:
            fields = raw.strip().split('\t')
            # An empty (blank) line splits into [''] and is dropped here too.
            if len(fields) == 2:
                msgs.append(fields[0])
                resps.append(fields[1])
        return msgs, resps

    # isinstance (not an exact type() comparison) so that str subclasses and
    # pathlib.Path objects are opened as files instead of being iterated.
    if isinstance(path, (str, os.PathLike)):
        with IO.reader(path) as r:
            return _read(r)
    return _read(path)
def __init__(self, inp: Union[str, Path, TextIO, Iterator[str]], text_field: Field = None,
             char_field: LookupField = None, max_seq_len: int = 100, add_eos=True):
    """
    :param inp: dialog seq file. A ``str`` or ``Path`` is opened via ``IO.reader``;
      any other object (open text stream, iterator of lines) is stored as is.
    :param text_field: provide this field if you want to map text to word ids.
      by default it splits words by white space and return words as sequence
    :param char_field: provide this field if you want to map character name to id.
    :param max_seq_len: stored on the instance as ``max_seq_len`` (not used in the constructor)
    :param add_eos: stored on the instance as ``add_eos`` (not used in the constructor)
    :raises FileNotFoundError: if ``inp`` is a path that does not exist
    """
    if isinstance(inp, str):
        inp = Path(inp)
    if isinstance(inp, Path):
        # Explicit check instead of ``assert``: assertions are stripped under ``-O``.
        if not inp.exists():
            raise FileNotFoundError(f"Dialog file does not exist: {inp}")
        # NOTE: the reader opened here is kept on ``self.reader``; this constructor
        # never closes it.
        inp = IO.reader(inp).open()
    self.reader = inp
    self.text_field = text_field
    self.char_field = char_field
    self.max_seq_len = max_seq_len
    self.add_eos = add_eos
    self.num_cols = 0
def read_raw_lines(dialog_path: Union[str, Path]) -> Iterator[RawRecord]:
    """
    Lazily yield ``(char, dialog)`` string pairs from a tab separated dialog file.

    The pair is taken from the last two tab separated columns of each line and
    both values are stripped of surrounding whitespace. Lines with fewer than
    two columns (for example blank lines) and lines where either value is empty
    after stripping are skipped.

    :param dialog_path: file to read; opened with ``IO.reader``
    :return: generator of ``(char, dialog)`` tuples
    """
    with IO.reader(dialog_path) as lines:
        for line in lines:
            cols = line.split("\t")
            if len(cols) < 2:
                # Without a tab there is nothing to unpack into (char, dialog);
                # unpacking such a row would raise ValueError.
                continue
            char, dialog = cols[-2].strip(), cols[-1].strip()
            if char and dialog:
                yield char, dialog
def _read_char_names():
    """
    Yield the second-to-last tab separated column of every line in ``path``.

    ``path`` is taken from the enclosing scope and opened with ``IO.reader``.
    Lines that have fewer than two columns after stripping are skipped.
    Presumably that column holds the character name - confirm against the
    file format used by the caller.
    """
    with IO.reader(path) as inp:
        for raw in inp:
            columns = raw.strip().split('\t')
            if len(columns) < 2:
                continue
            yield columns[-2]
def read_lines(path: Union[str, Path]):
    """
    Read a whole file into memory and return its lines without surrounding whitespace.

    :param path: file to read; opened with ``IO.reader``
    :return: list with one stripped string per line, in file order
    """
    with IO.reader(path) as f:
        return [raw.strip() for raw in f.readlines()]
def read_tsv(path: str):
    """
    Lazily yield the tab separated fields of every line in a file.

    Lines are split on tabs exactly as the reader returns them: nothing is
    stripped, so the last field of a row keeps whatever line terminator the
    reader yields.

    Because this is a generator, the existence check runs when iteration
    starts, not when the function is called.

    :param path: file to read; opened with ``IO.reader``
    :return: generator of lists of strings, one list per line
    :raises FileNotFoundError: if ``path`` does not exist
    """
    # Explicit check instead of ``assert``: assertions are stripped under ``-O``.
    if not os.path.exists(path):
        raise FileNotFoundError(f"TSV file does not exist: {path}")
    with IO.reader(path) as f:
        for line in f:
            yield line.split('\t')
def read_lines(path):
    """
    Lazily yield whatever ``read_lines_reader`` produces for the given input.

    :param path: either a filesystem path (``str`` or any ``os.PathLike``),
        which is opened with ``IO.reader`` and closed once iteration finishes,
        or an already opened reader, which is passed to ``read_lines_reader``
        unchanged
    :return: generator over the items produced by ``read_lines_reader``
    """
    import os  # local import: only needed for the os.PathLike check below

    if isinstance(path, (str, os.PathLike)):
        with IO.reader(path) as reader:
            yield from read_lines_reader(reader)
    else:
        # This function is a generator, so a plain ``return <iterable>`` here
        # would end the iteration without yielding anything; delegate instead.
        yield from read_lines_reader(path)