예제 #1
0
def generate_name_dataset(config):
    """Build name-based train/dev/test datasets from entity files.

    Reads the entity files named by config keys 'Etrain'/'Edev'/'Etest',
    maps target types via 'typefile', and writes train.txt / dev.txt /
    test.txt into the directory 'dsdir'.  'name_num' must hold three
    whitespace-separated integers capping the number of names kept per
    entity for the train, dev and test splits respectively.

    :param config: dict-like configuration mapping (see keys above).
    :returns: None; output is written to disk by ``gen_new_ds``.
    """
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # Per-split cap on names per entity: [train, dev, test].
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s',
                max_names)
    logger.info('generating new datasets based on entity names')
    gen_new_ds(etrain2names,
               etrain2types,
               max_names[0],
               outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names,
               edev2types,
               max_names[1],
               outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names,
               etest2types,
               max_names[2],
               outfile=dsdir + '/test.txt')
예제 #2
0
def generate_name_dataset(config):
    """Generate name-based train/dev/test dataset files.

    Loads entity->types and entity->names mappings from the files named
    by config keys 'Etrain'/'Edev'/'Etest' (types indexed through
    'typefile') and writes one file per split into 'dsdir'.  'name_num'
    supplies three whitespace-separated integers limiting the names kept
    per entity, in train/dev/test order.

    :param config: dict-like configuration mapping (see keys above).
    :returns: None; files are written by ``gen_new_ds``.
    """
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    max_names = [int(n) for n in config['name_num'].split()]  # [trn, dev, tst]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
예제 #3
0
    gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt')

# Script entry point: regenerate the name-based train/dev/test datasets
# using the config file given as the first command-line argument.
# NOTE: Python 2 print statement — this file targets Python 2.
if __name__ == '__main__':
    print 'loading config file', sys.argv[1]
    config = cmn.loadConfig(sys.argv[1])
    # Entity source files for each split.
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile=config['typefile']
    # Per-split cap on names per entity: [train, dev, test].
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    
    upto = -1
    (t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
    numtargets = len(t2idx)
    # Load entity->types / entity->names mappings for each split and log sizes.
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d",len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d",len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    # Output directory is suffixed with the name caps, e.g. ".../maxname3,1,1";
    # note there is no separator between dsdir and 'maxname' — presumably
    # dsdir ends with '/' in the config. TODO confirm.
    dsdir = dsdir + 'maxname'  + ','.join([str(n) for n in max_names])
    if not os.path.exists(dsdir): os.makedirs(dsdir)
    # Write one dataset file per split into the (possibly new) directory.
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile = dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile = dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt')
    
예제 #4
0
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    featuresToUse = [fea for fea in config['features'].split(' ')]
    npdir = config['npdir']
    if not os.path.exists(npdir): os.makedirs(npdir)

    (t2idx, idx2t) = loadtypes(targetTypesFile)
    numtype = len(t2idx)
    (etrain2types, etrain2names, _) = load_entname_ds(trainfile,
                                                      t2idx,
                                                      use_ix=True)
    print "number of train examples:" + str(len(etrain2names))
    (etest2types, etest2names, _) = load_entname_ds(testfile,
                                                    t2idx,
                                                    use_ix=True)
    print "number of test examples:" + str(len(etest2names))
    (edev2types, edev2names, _) = load_entname_ds(devfile, t2idx, use_ix=True)
    print "number of dev examples:" + str(len(edev2names))

    if args.loaddata:
        (in_trn_matrix, target_trn_matrix,
         trnents) = load_input_matrix('train')
        (in_tst_matrix, target_tst_matrix, tstents) = load_input_matrix('test')
        (in_dev_matrix, target_dev_matrix, devents) = load_input_matrix('dev')
    else:
예제 #5
0
 testfile=config['Etest']
 batch_size=int(config['batchsize'])
 targetTypesFile=config['typefile']
 learning_rate = float(config['lrate'])
 networkfile = config['net']
 num_of_hidden_units = int(config['hidden_units'])
 n_epochs = int(config['nepochs'])
 maxngram = int(config['maxngram'])
 MLP=str_to_bool(config['mlp'])
 featuresToUse= [fea for fea in config['features'].split(' ')]
 npdir = config['npdir']
 if not os.path.exists(npdir): os.makedirs(npdir)
 
 (t2idx, idx2t) = loadtypes(targetTypesFile)
 numtype = len(t2idx)
 (etrain2types, etrain2names,_) = load_entname_ds(trainfile, t2idx, use_ix=True)
 print "number of train examples:" + str(len(etrain2names))
 (etest2types, etest2names,_) = load_entname_ds(testfile, t2idx, use_ix=True)
 print "number of test examples:" + str(len(etest2names))
 (edev2types, edev2names,_) = load_entname_ds(devfile, t2idx, use_ix=True)
 print "number of dev examples:" + str(len(edev2names))
 
 if args.loaddata:
     (in_trn_matrix, target_trn_matrix, trnents) = load_input_matrix('train')
     (in_tst_matrix, target_tst_matrix, tstents) = load_input_matrix('test')
     (in_dev_matrix, target_dev_matrix, devents) = load_input_matrix('dev')
 else:
     word2cluster = load_brown_clusters_mapping(brownMappingFile)
     etrain2rawFea = build_features(etrain2names, etrain2types, word2cluster, featuresToUse, maxnamenum=3, upto=UPTO)
     edev2rawFea = build_features(edev2names, edev2types, word2cluster, featuresToUse, maxnamenum=1, upto=UPTO)
     etest2rawFea = build_features(etest2names, etest2types, word2cluster, featuresToUse, maxnamenum=1, upto=UPTO)