def generate_name_dataset(config):
    """Write name-based train/dev/test datasets derived from entity files.

    Reads the entity train/dev/test files and the target-type file named in
    *config*, then emits one new dataset per split (train.txt, dev.txt,
    test.txt) under config['dsdir'] via gen_new_ds, keeping at most
    config['name_num'][i] names per entity for split i (train, dev, test
    order).

    Changes vs. original: removed the no-op statement `_ = len(t2idx)` and
    the commented-out dead code (maxname dir suffix / makedirs) — the live
    behavior is unchanged.
    """
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # One per-entity name cap per split, in train/dev/test order.
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))
    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
def generate_name_dataset(config):
    """Write name-based train/dev/test datasets derived from entity files.

    NOTE(review): this is a near-verbatim duplicate of the
    generate_name_dataset defined earlier in the file; being the later
    definition, it is the one that wins at import time. Consider deleting
    one of the two copies.

    Reads the entity train/dev/test files and the target-type file named in
    *config*, then emits one dataset per split (train.txt, dev.txt,
    test.txt) under config['dsdir'] via gen_new_ds, keeping at most
    config['name_num'][i] names per entity for split i.

    Changes vs. original: removed the no-op statement `_ = len(t2idx)` and
    the commented-out dead code — live behavior is unchanged.
    """
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # One per-entity name cap per split, in train/dev/test order.
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))
    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt') if __name__ == '__main__': print 'loading config file', sys.argv[1] config = cmn.loadConfig(sys.argv[1]) trainfile = config['Etrain'] devfile = config['Edev'] testfile = config['Etest'] targetTypesFile=config['typefile'] max_names = [int(n) for n in config['name_num'].split()] dsdir = config['dsdir'] upto = -1 (t2idx, idx2t) = cmn.loadtypes(targetTypesFile) numtargets = len(t2idx) (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx) logger.info("number of train examples: %d",len(etrain2names)) (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx) logger.info("number of test examples: %d",len(etest2names)) (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx) logger.info("number of dev examples: %d", len(edev2names)) logger.info('number of names for each entity in trn,dev,test: %s', max_names) logger.info('generating new datasets based on entity names') dsdir = dsdir + 'maxname' + ','.join([str(n) for n in max_names]) if not os.path.exists(dsdir): os.makedirs(dsdir) gen_new_ds(etrain2names, etrain2types, max_names[0], outfile = dsdir + '/train.txt') gen_new_ds(edev2names, edev2types, max_names[1], outfile = dsdir + '/dev.txt') gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt')
# Training-script fragment: read network hyper-parameters from the config,
# load the typed-entity name datasets, then either reload cached input
# matrices (args.loaddata) or rebuild features — the else-branch body lies
# beyond this chunk.
# NOTE(review): `config`, `args`, `trainfile`, `devfile` and the helper
# functions (str_to_bool, loadtypes, load_entname_ds, load_input_matrix)
# are defined elsewhere in the file — TODO confirm.
batch_size = int(config['batchsize'])
targetTypesFile = config['typefile']
learning_rate = float(config['lrate'])
networkfile = config['net']
num_of_hidden_units = int(config['hidden_units'])
n_epochs = int(config['nepochs'])
maxngram = int(config['maxngram'])
MLP = str_to_bool(config['mlp'])  # presumably a "use MLP head" flag — verify
featuresToUse = [fea for fea in config['features'].split(' ')]
npdir = config['npdir']  # directory for cached numpy matrices, created next
if not os.path.exists(npdir): os.makedirs(npdir)
(t2idx, idx2t) = loadtypes(targetTypesFile)  # type->index and index->type maps
numtype = len(t2idx)
(etrain2types, etrain2names, _) = load_entname_ds(trainfile, t2idx, use_ix=True)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names, _) = load_entname_ds(testfile, t2idx, use_ix=True)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names, _) = load_entname_ds(devfile, t2idx, use_ix=True)
print "number of dev examples:" + str(len(edev2names))
if args.loaddata:
    # Reuse previously saved input/target matrices for each split.
    (in_trn_matrix, target_trn_matrix, trnents) = load_input_matrix('train')
    (in_tst_matrix, target_tst_matrix, tstents) = load_input_matrix('test')
    (in_dev_matrix, target_dev_matrix, devents) = load_input_matrix('dev')
else:  # branch body continues beyond this chunk
# Second, near-duplicate copy of the training-script fragment above.
# NOTE(review): duplicates the preceding chunk almost verbatim, but here the
# else-branch is present: it rebuilds raw features from entity names using a
# Brown-cluster word mapping instead of loading cached matrices. The two
# copies should probably be reconciled into one.
testfile=config['Etest']
batch_size=int(config['batchsize'])
targetTypesFile=config['typefile']
learning_rate = float(config['lrate'])
networkfile = config['net']
num_of_hidden_units = int(config['hidden_units'])
n_epochs = int(config['nepochs'])
maxngram = int(config['maxngram'])
MLP=str_to_bool(config['mlp'])  # presumably a "use MLP head" flag — verify
featuresToUse= [fea for fea in config['features'].split(' ')]
npdir = config['npdir']  # directory for cached numpy matrices, created next
if not os.path.exists(npdir): os.makedirs(npdir)
(t2idx, idx2t) = loadtypes(targetTypesFile)  # type->index and index->type maps
numtype = len(t2idx)
(etrain2types, etrain2names,_) = load_entname_ds(trainfile, t2idx, use_ix=True)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names,_) = load_entname_ds(testfile, t2idx, use_ix=True)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names,_) = load_entname_ds(devfile, t2idx, use_ix=True)
print "number of dev examples:" + str(len(edev2names))
if args.loaddata:
    # Reuse previously saved input/target matrices for each split.
    (in_trn_matrix, target_trn_matrix, trnents) = load_input_matrix('train')
    (in_tst_matrix, target_tst_matrix, tstents) = load_input_matrix('test')
    (in_dev_matrix, target_dev_matrix, devents) = load_input_matrix('dev')
else:
    # Rebuild raw features from names via the Brown-cluster mapping.
    # NOTE(review): train keeps up to 3 names per entity while dev/test keep
    # only 1 — presumably deliberate asymmetry; confirm against eval code.
    # `brownMappingFile` and `UPTO` are defined elsewhere — TODO confirm.
    word2cluster = load_brown_clusters_mapping(brownMappingFile)
    etrain2rawFea = build_features(etrain2names, etrain2types, word2cluster, featuresToUse, maxnamenum=3, upto=UPTO)
    edev2rawFea = build_features(edev2names, edev2types, word2cluster, featuresToUse, maxnamenum=1, upto=UPTO)
    etest2rawFea = build_features(etest2names, etest2types, word2cluster, featuresToUse, maxnamenum=1, upto=UPTO)