예제 #1
0
def main(folder, domain, augmentation_type, num):
    """Write a GeoQuery split plus sampled augmented examples to a TSV file.

    Reads the base TSV file for ``folder``, runs every logical form through
    ``domain.preprocess_lf``, samples ``num`` examples from an ``Augmenter``
    built on that data, and writes the original pairs followed by the sampled
    ones to
    ``./geoQueryData/<folder>/geo880_<folder>_<num + base size>_<augmentation_type>_recomb.tsv``.

    Args:
        folder: Which split to read: 'train', 'dev' or 'test'.
        domain: Domain name handed to ``domains.new``.
        augmentation_type: Augmentation type names joined with '+'.
        num: Number of examples to sample; it is added to the base split
            size to build the output file name, so it must be a number.

    Raises:
        ValueError: If ``folder`` is not one of the known splits.
    """
    # Base file and its example count for each supported split.
    splits = {
        'train': ('./geoQueryData/train/geo880_train600.tsv', 600),
        'dev': ('./geoQueryData/dev/geo880_dev100.tsv', 100),
        'test': ('./geoQueryData/test/geo880_test280.tsv', 280),
    }
    # Validate before doing any work: previously an unknown folder left
    # ``fname`` unbound and failed later with an unrelated-looking error.
    if folder not in splits:
        raise ValueError(
            f"unknown folder {folder!r}; expected one of: train, dev, test")
    fname, nums_data = splits[folder]

    aug_types = augmentation_type.split('+')
    domain = domains.new(domain)

    data = []
    with open(fname) as f:
        for line in f:
            x, y = line.strip().split('\t')
            data.append((x, domain.preprocess_lf(y)))

    augmenter = Augmenter(domain, data, aug_types)
    aug_data = augmenter.sample(num)

    path = f"./geoQueryData/{folder}/geo880_{folder}_{num + nums_data}_{augmentation_type}_recomb.tsv"
    with open(path, 'w') as f:
        # Original examples first (distinct loop names so ``data`` is not
        # rebound while iterating), then the sampled augmentations.
        for x, y in data:
            f.write(x + '\t' + y + '\n')
        for ex in aug_data:
            f.write(ex[0] + '\t' + ex[1] + '\n')
예제 #2
0
def run():
    """Train, optionally save, evaluate and serve a model driven by OPTIONS.

    Steps, in order: configure Theano, load the raw train/dev data, seed
    ``random`` and ``numpy.random`` with ``OPTIONS.model_seed``, build the
    spec and model, train when training data is present, save the spec when
    ``OPTIONS.save_file`` is set, evaluate on dev data when present, write
    stats, and finally start the shell or the server if requested.
    """
    global test_domain

    configure_theano()
    domain = domains.new(OPTIONS.domain) if OPTIONS.domain else None
    train_raw, dev_raw = load_raw_all(domain=domain)

    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)

    spec = init_spec(train_raw)
    model = get_model(spec)

    # Debug output of the module-level test domain.
    print('***', test_domain)
    if OPTIONS.model == 'attn2hist':
        model.spec.set_test_domain(test_domain)

    if train_raw:
        train_data = preprocess_data(model, train_raw)
        # Re-seed after preprocessing the training data, before anything else.
        random.seed(OPTIONS.model_seed)
        dev_data = preprocess_data(model, dev_raw) if dev_raw else None
        augmenter = get_augmenter(train_raw, domain)
        train_kwargs = dict(
            T=OPTIONS.num_epochs,
            eta=OPTIONS.learning_rate,
            dev_data=dev_data,
            l2_reg=OPTIONS.lambda_reg,
            distract_prob=OPTIONS.distract_prob,
            distract_num=OPTIONS.distract_num,
            concat_prob=OPTIONS.concat_prob,
            concat_num=OPTIONS.concat_num,
            augmenter=augmenter,
            aug_frac=OPTIONS.aug_frac,
        )
        model.train(train_data, **train_kwargs)

    if OPTIONS.save_file:
        print >> sys.stderr, 'Saving parameters...'
        spec.save(OPTIONS.save_file)

    # Only the dev set is evaluated here; train evaluation is not run.
    if dev_raw:
        evaluate_dev(model, dev_raw, domain=domain)

    write_stats()

    # The interactive shell takes precedence over the server.
    if OPTIONS.shell:
        run_shell(model)
    elif OPTIONS.server:
        run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
예제 #3
0
def run():
  """Train, optionally save, evaluate and serve a model driven by OPTIONS.

  In addition to the raw train/dev data this variant loads databases from
  ``OPTIONS.domain_grammar`` and builds a general controller, a domain
  controller and a domain convertor, which are passed to preprocessing and
  evaluation.
  """
  configure_theano()
  domain = domains.new(OPTIONS.domain) if OPTIONS.domain else None
  train_raw, dev_raw = load_raw_all(domain=domain)
  databases = load_databases(OPTIONS.domain_grammar, domain=domain)

  random.seed(OPTIONS.model_seed)
  numpy.random.seed(OPTIONS.model_seed)

  spec = init_spec(train_raw, databases)
  model = get_model(spec)

  # Look up both controller constructors before instantiating either one.
  make_general_controller = CONTROLLERS[OPTIONS.general_ontology]
  make_domain_controller = CONTROLLERS[OPTIONS.domain_ontology]
  use_general_ontology = OPTIONS.use_generalontology
  use_domain_ontology = (OPTIONS.use_geoontology
                         or OPTIONS.use_atisontology
                         or OPTIONS.use_overnightontology)
  general_controller = make_general_controller(OPTIONS.general_grammar,
                                               use_general_ontology)
  domain_controller = make_domain_controller(OPTIONS.domain_grammar,
                                             use_domain_ontology)
  # The CONVERTORS entry is used as-is; it is not called here.
  domain_convertor = CONVERTORS[OPTIONS.domain_convertor]

  def preprocess(raw):
    # Shared argument list for train and dev preprocessing.
    return preprocess_data(domain_convertor, domain_controller,
                           general_controller, model, raw)

  if train_raw:
    train_data = preprocess(train_raw)
    random.seed(OPTIONS.model_seed)
    dev_data = preprocess(dev_raw) if dev_raw else None
    augmenter = get_augmenter(train_raw, domain)
    model.train(
        train_data,
        T=OPTIONS.num_epochs,
        eta=OPTIONS.learning_rate,
        dev_data=dev_data,
        l2_reg=OPTIONS.lambda_reg,
        distract_prob=OPTIONS.distract_prob,
        distract_num=OPTIONS.distract_num,
        concat_prob=OPTIONS.concat_prob,
        concat_num=OPTIONS.concat_num,
        augmenter=augmenter,
        aug_frac=OPTIONS.aug_frac)

  if OPTIONS.save_file:
    print >> sys.stderr, 'Saving parameters...'
    spec.save(OPTIONS.save_file)

  # Train-set evaluation is switched off by this hard-coded flag.
  evaluation_train_flag = False
  if train_raw and evaluation_train_flag:
    evaluate_train(model, domain_convertor, domain_controller,
                   general_controller, train_data, domain=domain)
  if dev_raw:
    evaluate_dev(model, domain_convertor, domain_controller,
                 general_controller, dev_raw, domain=domain)

  write_stats()

  # The interactive shell takes precedence over the server.
  if OPTIONS.shell:
    run_shell(model)
  elif OPTIONS.server:
    run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
예제 #4
0
def evaluate_multiple_domains(name, model, dataset, domain=None):
    """Evaluate ``model`` separately on each sub-domain found in ``dataset``.

    Examples are grouped by their ``sub_domain`` attribute. For every group
    the sub-domain string is printed, a domain object is created from it with
    ``domains.new``, and ``evaluate`` is called on that group.

    Args:
        name: Passed through to ``evaluate``.
        model: Passed through to ``evaluate``.
        dataset: Iterable of examples exposing a ``sub_domain`` attribute.
        domain: Not used by this function.
    """
    examples_by_sub_domain = {}
    for example in dataset:
        examples_by_sub_domain.setdefault(example.sub_domain, []).append(example)

    for sub_domain_str, examples in examples_by_sub_domain.items():
        print(sub_domain_str)
        sub_domain = domains.new(sub_domain_str)
        evaluate(name, model, examples, sub_domain)
예제 #5
0
def run():
    """Train, optionally save, evaluate and serve a model driven by OPTIONS.

    Steps, in order: configure Theano, load the raw train/dev data, seed
    ``random`` and ``numpy.random`` with ``OPTIONS.model_seed``, build the
    spec and model, train when training data is present, save the spec when
    ``OPTIONS.save_file`` is set, evaluate on the train and dev sets (each
    with a sample file name derived from ``OPTIONS.sample_file``), write
    stats, and finally start the shell or the server if requested.
    """
    configure_theano()
    domain = domains.new(OPTIONS.domain) if OPTIONS.domain else None
    train_raw, dev_raw = load_raw_all(domain=domain)

    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)

    spec = init_spec(train_raw)
    model = get_model(spec)

    if train_raw:
        train_data = preprocess_data(model, train_raw)
        # Re-seed after preprocessing the training data, before anything else.
        random.seed(OPTIONS.model_seed)
        dev_data = preprocess_data(model, dev_raw) if dev_raw else None
        augmenter = get_augmenter(train_raw, domain)
        train_kwargs = dict(
            T=OPTIONS.num_epochs,
            eta=OPTIONS.learning_rate,
            dev_data=dev_data,
            l2_reg=OPTIONS.lambda_reg,
            distract_prob=OPTIONS.distract_prob,
            distract_num=OPTIONS.distract_num,
            concat_prob=OPTIONS.concat_prob,
            concat_num=OPTIONS.concat_num,
            augmenter=augmenter,
            aug_frac=OPTIONS.aug_frac,
        )
        model.train(train_data, **train_kwargs)

    if OPTIONS.save_file:
        print >> sys.stderr, 'Saving parameters...'
        spec.save(OPTIONS.save_file)

    if train_raw:
        train_samples = '%s.train' % OPTIONS.sample_file
        evaluate_train(model, train_data, domain=domain,
                       sample_file=train_samples)
    if dev_raw:
        dev_samples = '%s.dev' % OPTIONS.sample_file
        evaluate_dev(model, dev_raw, domain=domain, sample_file=dev_samples)

    write_stats()

    # The interactive shell takes precedence over the server.
    if OPTIONS.shell:
        run_shell(model)
    elif OPTIONS.server:
        run_server(model, hostname=OPTIONS.hostname, port=OPTIONS.port)
예제 #6
0
def main():
    """Print augmented data to stdout.

    Command line: ``[file] [domain] [aug-type] [num]``. The file is read as
    tab-separated (utterance, logical form) pairs, each logical form is
    passed through ``domain.preprocess_lf``, and ``num`` examples sampled
    from an ``Augmenter`` are printed one per line with tab-joined fields.
    With fewer than four arguments a usage line goes to stderr and the
    process exits with status 1.
    """
    cli_args = sys.argv[1:5]
    if len(cli_args) < 4:
        print >> sys.stderr, 'Usage: %s [file] [domain] [aug-type] [num]' % sys.argv[
            0]
        sys.exit(1)

    fname, domain_name, aug_type_str, num_str = cli_args
    num = int(num_str)
    aug_types = aug_type_str.split('+')
    domain = domains.new(domain_name)

    with open(fname) as f:
        pairs = (line.strip().split('\t') for line in f)
        data = [(x, domain.preprocess_lf(y)) for x, y in pairs]

    augmenter = Augmenter(domain, data, aug_types)
    for ex in augmenter.sample(num):
        print('\t'.join(ex))
예제 #7
0
def main():
    """Generate the artificial-domain base, test and augmented training files.

    Seeds ``random`` with 0, takes the first 100 generated examples as the
    base training set and the last 500 as the test set, then for several
    augmentation sizes ``n`` writes: a larger slice of the base data, and the
    100 base examples extended with ``n`` entity-augmented,
    nesting-augmented, or deeper-nested examples.
    """
    random.seed(0)
    base_data = gen_nested()
    base_train = base_data[:100]
    base_test = base_data[-500:]
    # NOTE(review): the slices below assume gen_nested() returns enough
    # examples that train prefixes and the test suffix do not overlap -
    # confirm against gen_nested.
    write_data('train_base100.tsv', base_train)
    write_data('test_base500.tsv', base_test)

    # Creation and sampling order is kept fixed so the random stream is
    # consumed in the same sequence.
    domain = domains.new('artificial')
    augmenter_entity = Augmenter(domain, base_train, ['entity'])
    augmenter_nesting = Augmenter(domain, base_train, ['nesting', 'entity'])
    deeper = sample_nested(depth=4, num=500)
    entity_data = augmenter_entity.sample(500)
    nesting_data = augmenter_nesting.sample(500)

    # File name template paired with the pool of extra examples to draw from.
    augmented_outputs = (
        ('train_base100_entity%d.tsv', entity_data),
        ('train_base100_nesting%d.tsv', nesting_data),
        ('train_base100_deeper%d.tsv', deeper),
    )
    for n in (25, 50, 75, 100, 150, 200, 250, 300, 400, 500):
        total = 100 + n
        write_data('train_base%d.tsv' % total, base_data[:total])
        for template, pool in augmented_outputs:
            write_data(template % n, base_train + pool[:n])