import os


def load_data(path, exts, text_processor=text_processor()):
    """Load a parallel corpus from two aligned files sharing a path prefix.

    Note that the default processor is built once, when the function is
    defined, not on each call.
    """
    src_data, trg_data = [], []
    path = os.path.expanduser(path)
    with open(path + exts[0]) as src, open(path + exts[1]) as trg:
        for src_line, trg_line in zip(src, trg):
            src_line, trg_line = src_line.strip(), trg_line.strip()
            if text_processor is not None:
                src_line = text_processor(src_line)
                trg_line = text_processor(trg_line)
            # keep only pairs where both sides survive preprocessing
            if src_line and trg_line:
                src_data.append(src_line)
                trg_data.append(trg_line)
    return src_data, trg_data
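# Hedged usage sketch for load_data (not part of the original module): the
# 'data/europarl' prefix and the ('.en', '.fr') extensions are hypothetical
# placeholders for a parallel corpus stored as two aligned files, and
# text_processor(lower=True) mirrors the keyword usage seen further below.
def _example_load_data():
    src, trg = load_data('data/europarl', ('.en', '.fr'),
                         text_processor=text_processor(lower=True))
    assert len(src) == len(trg)  # zipping keeps both sides aligned
    return src, trg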
def load_lines(path, processor=text_processor()):
    lines = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            line = line.strip()
            if processor is not None:
                line = processor(line)
            if line:
                lines.append(line)
    return lines
def load_lines(path, processor=text_processor()):
    lines = []
    if os.path.isfile(path):
        input_files = [path]
    else:
        input_files = [os.path.join(path, f) for f in os.listdir(path)]
    for path in input_files:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if line:
                    lines.append(line)
    return lines
def load_lines(path, processor=text_processor()):
    """Auxiliary function for sentence-per-line data"""
    if os.path.isdir(path):
        input_files = [os.path.join(path, f) for f in os.listdir(path)]
    elif os.path.isfile(path):
        input_files = [path]
    else:
        return
    for path in input_files:
        with open(os.path.expanduser(path)) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if not line:
                    continue
                yield line
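# Hedged usage sketch for the generator variant of load_lines (not part of the
# original module): 'data/wiki' stands in for a hypothetical directory of
# sentence-per-line files; a plain path is used because the isdir/isfile
# checks above run before user expansion.
def _example_load_lines():
    proc = text_processor(lower=True, num=True)
    for idx, line in enumerate(load_lines('data/wiki', processor=proc)):
        print(line)
        if idx >= 4:          # only peek at the first few processed lines
            break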
parser.add_argument('--level', default='token')
parser.add_argument('--concat', action='store_true')
parser.add_argument('--cache_data', action='store_true')
args = parser.parse_args()

prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
    .format(**vars(args))

print("Loading data...")
# preprocess
if not args.cache_data or not os.path.isfile('data/%s_train.pt' % prefix):
    if args.source == 'twisty':
        src, trg = load_twisty(min_len=args.min_len, level=args.level,
                               concat=args.concat,
                               processor=text_processor(lower=False))
        train, test, valid = load_dataset(
            src, trg, args.batch_size, min_freq=args.min_freq,
            max_size=args.max_size, gpu=args.gpu,
            dev=args.dev, test=args.test)
    elif args.source == 'penn':
        train, test, valid = load_penn(
            "~/corpora/penn", args.batch_size,
            min_freq=args.min_freq, max_size=args.max_size, gpu=args.gpu)
    else:
parser.add_argument('--min_len', default=0, type=int)
parser.add_argument('--min_freq', default=5, type=int)
parser.add_argument('--max_size', default=50000, type=int)
parser.add_argument('--level', default='token')
parser.add_argument('--concat', action='store_true')
parser.add_argument('--cache_data', action='store_true')
args = parser.parse_args()

print("Loading data...")

prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
    .format(**vars(args))

if not args.cache_data or not os.path.isfile('data/%s_train.pt' % prefix):
    if args.source == 'twisty':
        src, trg = load_twisty(
            min_len=args.min_len, level=args.level, concat=args.concat,
            processor=text_processor(lower=False))
        train, test, valid = load_dataset(
            src, trg, args.batch_size, min_freq=args.min_freq,
            max_size=args.max_size, gpu=args.gpu,
            dev=args.dev, test=args.test)
    elif args.source == 'penn':
        train, test, valid = load_penn(
            "~/corpora/penn", args.batch_size,
            min_freq=args.min_freq, max_size=args.max_size, gpu=args.gpu)
    else:
        train, test, valid = load_from_lines(
            args.source_path, args.batch_size, min_freq=args.min_freq,
            max_size=args.max_size, gpu=args.gpu,
            dev=args.dev, test=args.test)
    if args.cache_data:
        train.to_disk('data/%s_train.pt' % prefix)
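# Hedged illustration of the cache naming scheme (not part of the original
# script): with the defaults above and a hypothetical source of 'penn', the
# prefix expands to 'penn.token.0.5.False.50000', so the cached training split
# would land at 'data/penn.token.0.5.False.50000_train.pt'.
def _example_cache_prefix():
    defaults = {'source': 'penn', 'level': 'token', 'min_len': 0,
                'min_freq': 5, 'concat': False, 'max_size': 50000}
    prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
        .format(**defaults)
    return 'data/%s_train.pt' % prefix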
parser.add_argument('--visdom_server', default='localhost')
parser.add_argument('--save', action='store_true')
parser.add_argument('--prefix', default='model', type=str)
args = parser.parse_args()

if args.processed:
    print("Loading preprocessed datasets...")
    assert args.dict_path, "Processed data requires DICT_PATH"
    data, d = load_from_file(args.path), u.load_model(args.dict_path)
    train, test, valid = BlockDataset(
        data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
    ).splits(test=0.1, dev=0.1)
    del data
else:
    print("Processing datasets...")
    proc = text_processor(lower=args.lower, num=args.num, level=args.level)
    train_data = load_lines(args.path + 'train.txt', processor=proc)
    valid_data = load_lines(args.path + 'valid.txt', processor=proc)
    test_data = load_lines(args.path + 'test.txt', processor=proc)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq, eos_token=u.EOS)
    d.fit(train_data, valid_data)
    train = BlockDataset(
        train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
    valid = BlockDataset(
        valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    test = BlockDataset(
        test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    del train_data, valid_data, test_data
parser.add_argument('--visdom_server', default='localhost')
parser.add_argument('--save', action='store_true')
parser.add_argument('--prefix', default='model', type=str)
args = parser.parse_args()

if args.processed:
    print("Loading preprocessed datasets...")
    assert args.dict_path, "Processed data requires DICT_PATH"
    data, d = load_from_file(args.path), u.load_model(args.dict_path)
    train, test, valid = BlockDataset(
        data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
    ).splits(test=0.1, dev=0.1)
    del data
else:
    print("Processing datasets...")
    proc = text_processor(lower=args.lower, num=args.num, level=args.level)
    train_data = load_lines(args.path + 'train.txt', processor=proc)
    valid_data = load_lines(args.path + 'valid.txt', processor=proc)
    test_data = load_lines(args.path + 'test.txt', processor=proc)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, bos_token=u.BOS)
    d.fit(train_data, valid_data)
    train = BlockDataset(
        train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
    valid = BlockDataset(
        valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    test = BlockDataset(
        test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    del train_data, valid_data, test_data
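# Hedged sketch of the same preprocessing pipeline applied to a single file
# (not part of the original script): the path and hyperparameter values are
# hypothetical, while Dict, BlockDataset, text_processor and u.EOS are used
# just as in the branch above; the lines are materialised into a list so they
# can be consumed both by Dict.fit and by BlockDataset.
def _example_block_dataset(path='data/train.txt', batch_size=20, bptt=35):
    proc = text_processor(lower=True)
    lines = list(load_lines(path, processor=proc))
    d = Dict(max_size=50000, min_freq=5, eos_token=u.EOS)
    d.fit(lines)
    return BlockDataset(lines, d, batch_size, bptt, gpu=False)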