def main(folder, domain, augmentation_type, num):
    """Augment one GeoQuery split and write it, plus the samples, to a TSV file.

    Despite the historical wording, nothing is printed to stdout: the
    original examples followed by the augmented ones are written to
    ``./geoQueryData/<folder>/geo880_<folder>_<total>_<aug>_recomb.tsv``.

    Args:
        folder: Split name; one of 'train', 'dev' or 'test'.
        domain: Domain name handed to ``domains.new``.
        augmentation_type: '+'-separated augmentation type names. The
            unsplit string is also embedded in the output file name.
        num: Number of augmented examples requested from
            ``Augmenter.sample``. It is added to the split size to build
            the output file name, so it must be an int.

    Raises:
        ValueError: If ``folder`` is not one of the known splits.
    """
    # Input file and its nominal example count for every supported split.
    splits = {
        'train': ('./geoQueryData/train/geo880_train600.tsv', 600),
        'dev': ('./geoQueryData/dev/geo880_dev100.tsv', 100),
        'test': ('./geoQueryData/test/geo880_test280.tsv', 280),
    }
    # Fail early with a clear message; previously an unknown split fell
    # through the if/elif chain and crashed later on an unbound `fname`.
    if folder not in splits:
        raise ValueError(
            "unknown folder %r; expected one of %s" % (folder, sorted(splits)))
    fname, nums_data = splits[folder]

    aug_types = augmentation_type.split('+')
    domain = domains.new(domain)

    # Each input line is "<utterance>\t<logical form>".
    data = []
    with open(fname) as f:
        for line in f:
            x, y = line.strip().split('\t')
            data.append((x, domain.preprocess_lf(y)))

    augmenter = Augmenter(domain, data, aug_types)
    aug_data = augmenter.sample(num)

    path = f"./geoQueryData/{folder}/geo880_{folder}_{num + nums_data}_{augmentation_type}_recomb.tsv"
    with open(path, 'w') as f:
        # Original examples first, then the augmented ones.
        for x, y in data:
            f.write(x + '\t' + y + '\n')
        for ex in aug_data:
            f.write(ex[0] + '\t' + ex[1] + '\n')
def run():
    """Run the configured pipeline: load data, build, train, save, evaluate.

    Everything is driven by the module-level ``OPTIONS`` object and the
    module-level ``test_domain`` global. Training only happens when
    ``load_raw_all`` returns training data; dev evaluation only happens
    when it returns dev data. Finally an interactive shell or a server is
    started if the corresponding option is set.
    """
    # Only read here; declared global to make the dependency explicit.
    global test_domain

    configure_theano()
    domain = None
    if OPTIONS.domain:
        domain = domains.new(OPTIONS.domain)
    train_raw, dev_raw = load_raw_all(domain=domain)
    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)
    # Disabled: pair_stat = get_pair_stats(train_raw)
    #   [pair co-occurrences for attn2hist]
    spec = init_spec(train_raw)
    model = get_model(spec)

    # Debug output (a tuple under Python 2, space-separated under Python 3).
    print('***', test_domain)
    if OPTIONS.model == 'attn2hist':
        model.spec.set_test_domain(test_domain)

    if train_raw:
        train_data = preprocess_data(model, train_raw)
        # Disabled attn2hist hooks:
        #   if OPTIONS.model == 'attn2hist':
        #       set_pair_stats(model, train_data)
        #   test_ibm_model(model, train_data)
        # Re-seed after building the training data and before the dev
        # data, augmenter and training run are set up.
        random.seed(OPTIONS.model_seed)
        dev_data = None
        if dev_raw:
            dev_data = preprocess_data(model, dev_raw)
        augmenter = get_augmenter(train_raw, domain)
        model.train(train_data,
                    T=OPTIONS.num_epochs,
                    eta=OPTIONS.learning_rate,
                    dev_data=dev_data,
                    l2_reg=OPTIONS.lambda_reg,
                    distract_prob=OPTIONS.distract_prob,
                    distract_num=OPTIONS.distract_num,
                    concat_prob=OPTIONS.concat_prob,
                    concat_num=OPTIONS.concat_num,
                    augmenter=augmenter,
                    aug_frac=OPTIONS.aug_frac)

    if OPTIONS.save_file:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write('Saving parameters...\n')
        spec.save(OPTIONS.save_file)

    # Evaluation on the training set is intentionally disabled here:
    #   if train_raw: evaluate_train(model, train_data, domain=domain)
    if dev_raw:
        evaluate_dev(model, dev_raw, domain=domain)

    write_stats()

    if OPTIONS.shell:
        run_shell(model)
    elif OPTIONS.server:
        run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
def run():
    """Run the pipeline with ontology controllers and a domain convertor.

    Driven entirely by the module-level ``OPTIONS`` object. In addition to
    loading data and building the model, this variant loads databases,
    builds a general and a domain controller from ``CONTROLLERS`` and picks
    a convertor from ``CONVERTORS``; these are threaded through
    preprocessing and evaluation. Training only happens when training data
    was loaded, dev evaluation only when dev data was loaded. Finally a
    shell or server is started if the corresponding option is set.
    """
    configure_theano()
    domain = None
    if OPTIONS.domain:
        domain = domains.new(OPTIONS.domain)
    train_raw, dev_raw = load_raw_all(domain=domain)
    databases = load_databases(OPTIONS.domain_grammar, domain=domain)
    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)
    spec = init_spec(train_raw, databases)
    model = get_model(spec)

    # Resolve both controller constructors before instantiating either.
    general_ctor = CONTROLLERS[OPTIONS.general_ontology]
    domain_ctor = CONTROLLERS[OPTIONS.domain_ontology]
    use_general_ontology = OPTIONS.use_generalontology
    # The domain ontology is enabled if any per-domain flag is set.
    use_domain_ontology = (OPTIONS.use_geoontology
                           or OPTIONS.use_atisontology
                           or OPTIONS.use_overnightontology)
    general_controller = general_ctor(OPTIONS.general_grammar,
                                      use_general_ontology)
    domain_controller = domain_ctor(OPTIONS.domain_grammar,
                                    use_domain_ontology)
    # The CONVERTORS entry is passed along as-is; it is not called here.
    domain_convertor = CONVERTORS[OPTIONS.domain_convertor]

    if train_raw:
        train_data = preprocess_data(domain_convertor, domain_controller,
                                     general_controller, model, train_raw)
        # Re-seed after building the training data and before the dev
        # data, augmenter and training run are set up.
        random.seed(OPTIONS.model_seed)
        dev_data = None
        if dev_raw:
            dev_data = preprocess_data(domain_convertor, domain_controller,
                                       general_controller, model, dev_raw)
        augmenter = get_augmenter(train_raw, domain)
        model.train(train_data,
                    T=OPTIONS.num_epochs,
                    eta=OPTIONS.learning_rate,
                    dev_data=dev_data,
                    l2_reg=OPTIONS.lambda_reg,
                    distract_prob=OPTIONS.distract_prob,
                    distract_num=OPTIONS.distract_num,
                    concat_prob=OPTIONS.concat_prob,
                    concat_num=OPTIONS.concat_num,
                    augmenter=augmenter,
                    aug_frac=OPTIONS.aug_frac)

    if OPTIONS.save_file:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write('Saving parameters...\n')
        spec.save(OPTIONS.save_file)

    # Hard-coded switch: training-set evaluation is currently turned off.
    evaluation_train_flag = False
    if train_raw and evaluation_train_flag:
        evaluate_train(model, domain_convertor, domain_controller,
                       general_controller, train_data, domain=domain)
    if dev_raw:
        evaluate_dev(model, domain_convertor, domain_controller,
                     general_controller, dev_raw, domain=domain)

    write_stats()

    if OPTIONS.shell:
        run_shell(model)
    elif OPTIONS.server:
        run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
def evaluate_multiple_domains(name, model, dataset, domain=None):
    """Evaluate ``dataset`` separately for each sub-domain it contains.

    Examples are grouped by their ``sub_domain`` attribute. For every group
    the sub-domain string is printed, a domain object is created from it
    with ``domains.new`` and ``evaluate`` is called on that group.

    Args:
        name: Passed through to ``evaluate`` unchanged.
        model: Passed through to ``evaluate`` unchanged.
        dataset: Iterable of examples exposing a ``sub_domain`` attribute.
        domain: Unused; kept so existing callers keep working.
    """
    # Group examples by sub-domain string.
    by_sub_domain = {}
    for ex in dataset:
        by_sub_domain.setdefault(ex.sub_domain, []).append(ex)

    for sub_domain_str, examples in by_sub_domain.items():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(sub_domain_str)
        sub_domain = domains.new(sub_domain_str)
        evaluate(name, model, examples, sub_domain)
def run():
    """Run the configured pipeline: load data, build, train, save, evaluate.

    Driven entirely by the module-level ``OPTIONS`` object. Training and
    training-set evaluation only happen when training data was loaded; dev
    evaluation only when dev data was loaded. Both evaluations receive a
    ``sample_file`` name derived from ``OPTIONS.sample_file`` with a
    ``.train`` / ``.dev`` suffix. Finally a shell or server is started if
    the corresponding option is set.
    """
    configure_theano()
    domain = None
    if OPTIONS.domain:
        domain = domains.new(OPTIONS.domain)
    train_raw, dev_raw = load_raw_all(domain=domain)
    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)
    spec = init_spec(train_raw)
    model = get_model(spec)

    if train_raw:
        train_data = preprocess_data(model, train_raw)
        # Re-seed after building the training data and before the dev
        # data, augmenter and training run are set up.
        random.seed(OPTIONS.model_seed)
        dev_data = None
        if dev_raw:
            dev_data = preprocess_data(model, dev_raw)
        augmenter = get_augmenter(train_raw, domain)
        model.train(train_data,
                    T=OPTIONS.num_epochs,
                    eta=OPTIONS.learning_rate,
                    dev_data=dev_data,
                    l2_reg=OPTIONS.lambda_reg,
                    distract_prob=OPTIONS.distract_prob,
                    distract_num=OPTIONS.distract_num,
                    concat_prob=OPTIONS.concat_prob,
                    concat_num=OPTIONS.concat_num,
                    augmenter=augmenter,
                    aug_frac=OPTIONS.aug_frac)

    if OPTIONS.save_file:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write('Saving parameters...\n')
        spec.save(OPTIONS.save_file)

    # train_data is only bound when train_raw is truthy, hence the guard.
    if train_raw:
        evaluate_train(model, train_data, domain=domain,
                       sample_file='%s.train' % OPTIONS.sample_file)
    if dev_raw:
        evaluate_dev(model, dev_raw, domain=domain,
                     sample_file='%s.dev' % OPTIONS.sample_file)

    write_stats()

    if OPTIONS.shell:
        run_shell(model)
    elif OPTIONS.server:
        run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
def main():
    """Print augmented data to stdout.

    Command line: ``[file] [domain] [aug-type] [num]``.

    ``file`` holds one "<utterance>\\t<logical form>" pair per line,
    ``aug-type`` is a '+'-separated list of augmentation type names and
    ``num`` is the number of examples requested from ``Augmenter.sample``.
    Each sampled example is printed as one tab-joined line.

    Exits with status 1 after writing a usage message to stderr when fewer
    than four arguments are given.
    """
    if len(sys.argv) < 5:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write(
            'Usage: %s [file] [domain] [aug-type] [num]\n' % sys.argv[0])
        sys.exit(1)

    fname, domain_name, aug_type_str, num = sys.argv[1:5]
    num = int(num)
    aug_types = aug_type_str.split('+')
    domain = domains.new(domain_name)

    data = []
    with open(fname) as f:
        for line in f:
            x, y = line.strip().split('\t')
            data.append((x, domain.preprocess_lf(y)))

    augmenter = Augmenter(domain, data, aug_types)
    aug_data = augmenter.sample(num)
    for ex in aug_data:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('\t'.join(ex))
def main():
    """Generate the artificial-domain base, test and augmented data sets.

    Seeds ``random`` with 0, takes the first 100 items of ``gen_nested()``
    as the base training set and the last 500 as the test set, and hands
    both to ``write_data``. Then, for a range of augmentation sizes ``n``,
    it writes four training sets: a larger slice of the base data, and the
    100 base examples extended with ``n`` entity-augmented,
    nesting-augmented, or depth-4 nested examples.
    """
    random.seed(0)

    generated = gen_nested()
    train_seed = generated[:100]
    held_out = generated[-500:]
    write_data('train_base100.tsv', train_seed)
    write_data('test_base500.tsv', held_out)

    artificial = domains.new('artificial')
    entity_aug = Augmenter(artificial, train_seed, ['entity'])
    nesting_aug = Augmenter(artificial, train_seed, ['nesting', 'entity'])

    # The creation order below is kept fixed: these calls follow a fixed
    # seed, so reordering them could change what each one produces.
    deeper_pool = sample_nested(depth=4, num=500)
    entity_pool = entity_aug.sample(500)
    nesting_pool = nesting_aug.sample(500)

    # File-name pattern and extra-example pool, in write order.
    extra_sets = (
        ('train_base100_entity%d.tsv', entity_pool),
        ('train_base100_nesting%d.tsv', nesting_pool),
        ('train_base100_deeper%d.tsv', deeper_pool),
    )

    for extra in (25, 50, 75, 100, 150, 200, 250, 300, 400, 500):
        total = 100 + extra
        write_data('train_base%d.tsv' % total, generated[:total])
        for pattern, pool in extra_sets:
            write_data(pattern % extra, train_seed + pool[:extra])