def build_targets_ds(config, all_contexts, nsamples_train, nsamples_dev, nsamples_test, nsamples_dev_big):
    logger.info("building targets dataset")
    entity_types = list(load_types(config['typefile']))
    (t2idx, _) = cmn.loadtypes(config['typefile'])
    totals = len(all_contexts)
    # one multi-hot row per context: 1 at the index of each of its types
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        types_idx = [t2idx[t] for t in ctx.all_types if t in t2idx]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    dsdir = config['dsdir']
    fp = h5py.File(dsdir + '_targets.hdf', mode='w')
    targets = fp.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    # contiguous row ranges: train | dev | test | devbig
    split_dict = {
        'train': {'targets': (0, nsamples_train)},
        'dev': {'targets': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'targets': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)},
        'devbig': {'targets': (nsamples_train + nsamples_dev + nsamples_test, totals)}
    }
    fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    fp.flush()
    fp.close()
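# A minimal sketch of the multi-hot encoding cmn.convertTargetsToBinVec is
# assumed to perform above; the real helper lives in the cmn module, and the
# name and body below are illustrative only.
def _convert_targets_to_binvec_sketch(types_idx, numtypes):
    # e.g. types_idx=[0, 3], numtypes=5 -> array([1, 0, 0, 1, 0], dtype=int32)
    vec = numpy.zeros(numtypes, dtype='int32')
    vec[types_idx] = 1
    return vec

# The 'split' attribute written above is what lets Fuel later slice the file
# by set, e.g. H5PYDataset(dsdir + '_targets.hdf', which_sets=('train',)).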
def generate_name_dataset(config):
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))
    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    # dsdir = dsdir + 'maxname' + ','.join([str(n) for n in max_names])
    # if not os.path.exists(dsdir): os.makedirs(dsdir)
    # keep at most max_names[i] names per entity for train/dev/test
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
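# A rough sketch of what gen_new_ds is assumed to produce: one line per
# (entity, name) pair, keeping at most max_names names per entity. The real
# implementation lives elsewhere in this repo, and the tab-separated
# "entity name types" layout below is an assumption, not the project's format.
def _gen_new_ds_sketch(ent2names, ent2types, max_names, outfile):
    with open(outfile, 'w') as f:
        for ent, names in ent2names.items():
            for name in names[:max_names]:
                f.write('%s\t%s\t%s\n' % (ent, name, ' '.join(ent2types[ent])))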
def main(args):
    print 'loading config file', args[1]
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    # first generate the name datasets based on the number of names per split
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)
    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    # the_features = config['features'].split(' ')  # i.e. letters entvec words tc
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    print "uselower: ", use_lowercase
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d",
                len(trnMentions), len(devMentions), len(tstMentions))
    # build the feature datasets only once; afterwards only refresh the
    # description-based features
    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                         letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                            vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, dsdir,
                             hs_ngram_path, hs_ngram_versions, vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                           ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        # build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                          subword_vectorFile, use_lowercase=use_lowercase, upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                       vectorFile_words, use_lowercase=use_lowercase, upto=-1)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path,
                               t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
    else:
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path,
                               t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
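# str_to_bool is assumed to behave roughly like this sketch (mapping config
# strings such as "True"/"true"/"1" to True); the project's own helper may
# accept different spellings.
def _str_to_bool_sketch(s):
    return s.strip().lower() in ('true', '1', 'yes')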
if __name__ == '__main__':
    print 'loading config file', sys.argv[1]
    config = cmn.loadConfig(sys.argv[1])
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    upto = -1
    (t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
    numtargets = len(t2idx)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))
    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    dsdir = dsdir + 'maxname' + ','.join([str(n) for n in max_names])
    if not os.path.exists(dsdir):
        os.makedirs(dsdir)
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
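# Typical invocation (script and config names are illustrative):
#   python this_script.py myconfig.cfg
# The config file is expected to define at least Etrain/Edev/Etest, typefile,
# name_num (e.g. "3 1 1"), and dsdir, per the keys read above.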