Example #1
def __init__(self, name, config):
    self.config = config
    self.name = name
    self.max_len = int(config[self.name + '_max_len'])
    self.dim_emb = int(config[self.name + '_emb_dim'])
    self.fuelfile = config['dsdir'] + '_' + name + '.h5py'
    self.nn_model = config[self.name + '_nn']
    self.emb_file = self.config['dsdir'] + '_' + self.name + '_embeddings.h5py'
    self.tune_tune = str_to_bool(self.config[self.name + '_tune_embedding'])
    self.load_emb = str_to_bool(self.config[self.name + '_load_embedding'])
    # embeddings that are not loaded from a file must be tunable
    if not self.load_emb:
        assert self.tune_tune
    # mark recurrent encoders (lstm/rnn/gru); they consume sequences
    self.rec = False
    if 'lstm' in self.nn_model or 'rnn' in self.nn_model or 'gru' in self.nn_model:
        self.rec = True
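
For context, a minimal sketch of how this constructor might be driven. `str_to_bool` is referenced but not shown in the example, so the helper below is an assumed implementation; the class name and the config values are illustrative only:

def str_to_bool(s):
    # assumed helper: map common truthy strings to booleans
    return s.strip().lower() in ('true', '1', 'yes')

config = {
    'dsdir': '/tmp/ds',                # hypothetical dataset directory
    'words_max_len': '10',
    'words_emb_dim': '300',
    'words_nn': 'lstm',                # any of lstm/rnn/gru sets self.rec = True
    'words_tune_embedding': 'True',
    'words_load_embedding': 'False',   # not loading forces tuning (see assert)
}
feature = WordFeature('words', config)  # hypothetical class wrapping the __init__ above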
Example #2
def __init__(self, name, config):
    self.config = config
    self.name = name
    self.max_len = int(config[self.name + '_max_len'])
    self.dim_emb = int(config[self.name + '_emb_dim'])
    self.fuelfile = config['dsdir'] + '_' + name + '.h5py'
    self.nn_model = config[self.name + '_nn']
    self.emb_file = self.config[
        'dsdir'] + '_' + self.name + '_embeddings.h5py'
    self.tune_tune = str_to_bool(self.config[self.name +
                                             '_tune_embedding'])
    self.load_emb = str_to_bool(self.config[self.name + '_load_embedding'])
    if not self.load_emb:
        assert self.tune_tune
    self.rec = False
    if 'lstm' in self.nn_model or 'rnn' in self.nn_model or 'gru' in self.nn_model:
        self.rec = True
Example #3
def main(args):
    print 'loading config file', args[1]
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    #first generating name datasets based on the number of names for each set
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)
    
    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
#     the_features = config['features'].split(' ') #i.e. letters entvec words tc 
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams'+str(ngram)+'_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    print "uselower: ", use_lowercase
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions), len(devMentions), len(tstMentions))
    
    if not os.path.exists(os.path.join(dsdir,'_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, dsdir, hs_ngram_path, hs_ngram_versions, vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
#         build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, subword_vectorFile, use_lowercase=use_lowercase, upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile_words, use_lowercase=use_lowercase, upto=-1)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
    else:
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
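
For orientation, a sketch of the kind of config this main() consumes. The flat `key value` line format is an assumption about cmn.loadConfig, and every path below is a placeholder:

dsdir /data/ds
typefile /data/target_types.txt
ent_vectors /data/vecs/entities.txt
word_vectors /data/vecs/words.txt
fasttext_vecfile /data/vecs/subwords.txt
letters_vecfile /data/vecs/letters.txt
ngrams_n 3 4
ngrams3_vecfile /data/vecs/ngram3.txt
ngrams4_vecfile /data/vecs/ngram4.txt
use_lowercase True

Since main(args) indexes args[1], the script would be invoked along the lines of `python <script>.py myconfig` with args bound to sys.argv.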
Example #4
def build_model_new(fea2obj,
                    num_targets,
                    config,
                    kl_weight,
                    entropy_weight,
                    deterministic=False,
                    test=False):
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(
        config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(
        config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0
    use_noise = str_to_bool(
        config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    hu_decoder = int(
        config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info(
        'use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s',
        use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z,
        hu_decoder, hidden_size)
    init_with_type = str_to_bool(
        config['init_with_type']) if 'init_with_type' in config else False
    y = T.matrix('targets', dtype='int32')

    drop_prob = float(config['dropout']) if 'dropout' in config else 0

    #build the feature vector with one model, e.g., with cnn or mean or lstm
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)

    #drop out
    if drop_prob > 0:
        mask = T.cast(
            srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape),
            'float32')
        if test:
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask

    #Highway network
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()],
                    dims=[feature_vec_len, feature_vec_len],
                    name='g_mlp')
        t_mlp = MLP(activations=[Logistic()],
                    dims=[feature_vec_len, feature_vec_len],
                    name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z

    #MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s',
                feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        # two-layer MLP on the feature vector
        mlp = MLP(
            activations=[Rectifier(), Rectifier()],
            dims=[feature_vec_len,
                  int(hidden_size[0]),
                  int(hidden_size[1])],
            name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()],
                  dims=[feature_vec_len, hidden_size],
                  name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size

    #compute y_hat initial guess
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=last_hidden_size,
                              output_dim=num_targets)

    typemfile = None
    if init_with_type:
        typemfile = config['dsdir'] + '/_typematrix.npy'
        #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'

    initialize_lasthid(hidden_to_output, typemfile)
    #         initialize([hidden_to_output])

    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    logpy_xz = logpy_xz_init
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0

    if use_gaus:
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(
                    kl_weight,
                    entropy_weight,
                    y_hat_init,
                    feature_vec,
                    feature_vec_len,
                    config,
                    y,
                    test=test,
                    deterministic=deterministic,
                    num_targets=num_targets,
                    n_latent_z=n_latent_z,
                    hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(
                    kl_weight,
                    feature_vec,
                    feature_vec_len,
                    config,
                    y,
                    test=test,
                    deterministic=deterministic,
                    num_targets=num_targets,
                    n_latent_z=n_latent_z,
                    hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in,
                    n_input=feature_vec_len + num_targets,
                    hu_encoder=hidden_size,
                    n_latent=n_latent_z)
                z_prior = sampler(mu_prior,
                                  log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl,
                                             n_latent=n_latent_z +
                                             feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets,
                                             y=y)
                y_hat = (y_hat + y_hat_init) / 2.
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in,
                    n_input=feature_vec_len,
                    hu_encoder=hidden_size,
                    n_latent=n_latent_z)
                z_prior = sampler(mu_prior,
                                  log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl,
                                             n_latent=n_latent_z +
                                             feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets,
                                             y=y)

            y_hat_recog = y_hat

    y_hat = debug_print(y_hat, 'y_hat', False)

    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'

    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
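
`prior_network`, `generation`, and `sampler` are defined elsewhere in this codebase. For orientation, a minimal sketch of what a reparameterization-trick sampler like the one called above typically looks like in Theano; this is an assumed implementation, not the project's actual code (in particular, whether log_sigma stores log sigma or log sigma^2 varies between codebases):

import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)

def sampler(mu, log_sigma, deterministic=False, use_noise=True):
    # z = mu + sigma * eps with eps ~ N(0, I); deterministic mode returns the mean
    if deterministic or not use_noise:
        return mu
    eps = srng.normal(mu.shape, dtype='float32')
    return mu + T.exp(log_sigma) * eps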
Example #5
    (in_dev_matrix, _, _) = make_input_matrix(edev2rawFea,
                                              edev2types,
                                              fea2id,
                                              'dev',
                                              npdir=npdir,
                                              t2idx=t2idx)
    return in_trn_matrix, in_tst_matrix, in_dev_matrix


print 'loading config file', sys.argv[1]
config = cmn.loadConfig(sys.argv[1])
hdf5_file = config['fuelfile']

trainfile = config['Etrain']
devfile = config['Edev']
testfile = config['Etest']
targetTypesFile = config['typefile']
max_len_name = int(config['max_name_length'])
vectorFile = config['ent_vectors']
if 'typecosine' in config:
    usetypecosine = cmn.str_to_bool(config['typecosine'])
brownMappingFile = config['brownclusters']
maxngram = int(config['maxngram'])
featuresToUse = [fea for fea in config['features'].split(' ')]
npdir = config['npdir']
if not os.path.exists(npdir): os.makedirs(npdir)

upto = -1
(t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
numtargets = len(t2idx)
# load entity to type datasets
(etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
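
`cmn.loadtypes` is not shown in these examples. Based on how its return values are used (t2idx maps type names to indices, idx2t the reverse), a plausible sketch, assuming one target type per line in the type file:

def loadtypes(typefile):
    # assumed helper: build type<->index maps from a one-type-per-line file
    t2idx, idx2t = {}, {}
    with open(typefile) as f:
        for line in f:
            t = line.strip()
            if not t:
                continue
            idx = len(t2idx)
            t2idx[t] = idx
            idx2t[idx] = t
    return (t2idx, idx2t)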
Example #6
    parser = get_argument_parser()
    args = parser.parse_args()

    config = loadConfig(args.config)
    brownMappingFile = config['brownclusters']
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    featuresToUse = [fea for fea in config['features'].split(' ')]
    npdir = config['npdir']
    if not os.path.exists(npdir): os.makedirs(npdir)

    (t2idx, idx2t) = loadtypes(targetTypesFile)
    numtype = len(t2idx)
    (etrain2types, etrain2names, _) = load_entname_ds(trainfile,
                                                      t2idx,
                                                      use_ix=True)
    print "number of train examples:" + str(len(etrain2names))
    (etest2types, etest2names, _) = load_entname_ds(testfile,
                                                    t2idx,
                                                    use_ix=True)
    print "number of test examples:" + str(len(etest2names))
    (edev2types, edev2names, _) = load_entname_ds(devfile, t2idx, use_ix=True)
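
`get_argument_parser` is defined elsewhere. A hedged sketch of the shape it must have, given that `args.config` is read here and `args.loaddata` in the next example; the exact flag spellings and defaults are assumptions:

import argparse

def get_argument_parser():
    # assumed parser: only config and loaddata are implied by the examples
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True, help='path to the config file')
    parser.add_argument('--loaddata', action='store_true',
                        help='load previously built input matrices')
    return parser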
Example #7
    parser = get_argument_parser()
    args = parser.parse_args()

    config = loadConfig(args.config)
    brownMappingFile = config['brownclusters']
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    featuresToUse = [fea for fea in config['features'].split(' ')]
    npdir = config['npdir']
    if not os.path.exists(npdir): os.makedirs(npdir)
    
    (t2idx, idx2t) = loadtypes(targetTypesFile)
    numtype = len(t2idx)
    (etrain2types, etrain2names, _) = load_entname_ds(trainfile, t2idx, use_ix=True)
    print "number of train examples:" + str(len(etrain2names))
    (etest2types, etest2names, _) = load_entname_ds(testfile, t2idx, use_ix=True)
    print "number of test examples:" + str(len(etest2names))
    (edev2types, edev2names, _) = load_entname_ds(devfile, t2idx, use_ix=True)
    print "number of dev examples:" + str(len(edev2names))
    
    if args.loaddata:
        (in_trn_matrix, target_trn_matrix, trnents) = load_input_matrix('train')
Example #8
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight, deterministic=False, test=False ):
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(config['use_gaus']) if 'use_gaus' in config else False 
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0
    use_noise = str_to_bool(config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    hu_decoder = int(config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info('use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s', use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z, hu_decoder, hidden_size)
    init_with_type = str_to_bool(config['init_with_type']) if 'init_with_type' in config else False
    y = T.matrix('targets', dtype='int32')
    
    drop_prob = float(config['dropout']) if 'dropout' in config else 0
    
    #build the feature vector with one model, e.g., with cnn or mean or lstm
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)
    
    #drop out
    if drop_prob > 0:
        mask = T.cast(srng.binomial(n=1, p=1-drop_prob, size=feature_vec.shape), 'float32')
        if test:
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask
            

    #Highway network
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, feature_vec_len], name='g_mlp')
        t_mlp = MLP(activations=[Logistic()], dims=[feature_vec_len, feature_vec_len], name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z
        
    #MLP(s)         
    logger.info('feature vec length = %s and hidden layer units = %s', feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        # two-layer MLP on the feature vector
        mlp = MLP(activations=[Rectifier(), Rectifier()], dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, hidden_size], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size

        
    #compute y_hat initial guess
    hidden_to_output = Linear(name='hidden_to_output', input_dim=last_hidden_size, output_dim=num_targets)
    
    typemfile = None
    if init_with_type:
        typemfile = config['dsdir'] + '/_typematrix.npy'
        #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'
        
    initialize_lasthid(hidden_to_output, typemfile)
#         initialize([hidden_to_output])
    
    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    logpy_xz = logpy_xz_init  
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0
    
    if use_gaus:     
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(kl_weight, entropy_weight, y_hat_init, feature_vec, feature_vec_len, config, y,
                    test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(kl_weight, feature_vec, feature_vec_len, config, y, 
                    test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len+num_targets, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
                y_hat = (y_hat + y_hat_init) / 2. 
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
            
            y_hat_recog = y_hat
                

    y_hat = debug_print(y_hat, 'y_hat', False)

    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross 
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'

    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
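
The highway block in both copies of build_model_new computes z = t * H(x) + (1 - t) * x, where H is a Rectifier MLP and t a Logistic gate over the same features. A standalone numpy sketch of that transform (the weights here are hypothetical stand-ins for the Blocks-initialized parameters):

import numpy as np

def highway(x, W_h, b_h, W_t, b_t):
    # H(x): ReLU candidate transform; t: sigmoid transform gate
    h = np.maximum(0.0, x.dot(W_h) + b_h)
    t = 1.0 / (1.0 + np.exp(-(x.dot(W_t) + b_t)))
    # the gate interpolates between the transformed and the original features
    return t * h + (1.0 - t) * x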
Example #9
    (in_tst_matrix, _, _) = make_input_matrix(etest2rawFea, etest2types, fea2id, 'test', npdir=npdir, t2idx=t2idx)
    (in_dev_matrix, _, _) = make_input_matrix(edev2rawFea, edev2types, fea2id, 'dev', npdir=npdir, t2idx=t2idx)
    return in_trn_matrix, in_tst_matrix, in_dev_matrix

print 'loading config file', sys.argv[1]
config = cmn.loadConfig(sys.argv[1])
hdf5_file = config['fuelfile'] 

trainfile = config['Etrain']
devfile = config['Edev']
testfile = config['Etest']
targetTypesFile = config['typefile']
max_len_name = int(config['max_name_length'])
vectorFile = config['ent_vectors']
if 'typecosine' in config:
    usetypecosine = cmn.str_to_bool(config['typecosine'])
brownMappingFile = config['brownclusters']
maxngram = int(config['maxngram'])
featuresToUse = [fea for fea in config['features'].split(' ')]
npdir = config['npdir']
if not os.path.exists(npdir): os.makedirs(npdir)

upto = -1
(t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
numtargets = len(t2idx)
# load entity to type datasets
(etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
Example #10
def main(args):
    print 'loading config file', args[1]
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    #first generating name datasets based on the number of names for each set
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)

    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = config[
        'word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config[
        'fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config[
        'ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    #     the_features = config['features'].split(' ') #i.e. letters entvec words tc
    ngrams = [int(n) for n in config['ngrams_n'].split()
              ] if 'ngrams_n' in config else []
    ngrams_vecfiles = {
        ngram: config['ngrams' + str(ngram) + '_vecfile']
        for ngram in ngrams
    }
    letter_vecfile = config[
        'letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split(
    ) if hs_ngram_path else None
    use_lowercase = str_to_bool(
        config['use_lowercase']) if 'use_lowercase' in config else False
    print "uselower: ", use_lowercase
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions),
                len(devMentions), len(tstMentions))

    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        build_entvec_ds(trnMentions,
                        devMentions,
                        tstMentions,
                        t2idx,
                        dsdir,
                        vectorFile,
                        upto=-1)
        build_letters_ds(trnMentions,
                         devMentions,
                         tstMentions,
                         t2idx,
                         dsdir,
                         letter_vecfile,
                         max_len_name=40)
        build_typecosine_ds(trnMentions,
                            devMentions,
                            tstMentions,
                            t2idx,
                            dsdir,
                            vectorFile,
                            upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config,
                             trnMentions,
                             devMentions,
                             tstMentions,
                             t2idx,
                             dsdir,
                             hs_ngram_path,
                             hs_ngram_versions,
                             vectorsize=300,
                             upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions,
                           devMentions,
                           tstMentions,
                           t2idx,
                           dsdir,
                           ngrams_vecfiles[ng],
                           ng,
                           upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        #         build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_subwords_ds(trnMentions,
                          devMentions,
                          tstMentions,
                          t2idx,
                          dsdir,
                          subword_vectorFile,
                          use_lowercase=use_lowercase,
                          upto=-1)
        build_words_ds(trnMentions,
                       devMentions,
                       tstMentions,
                       t2idx,
                       dsdir,
                       vectorFile_words,
                       use_lowercase=use_lowercase,
                       upto=-1)
        build_entvec_ds(trnMentions,
                        devMentions,
                        tstMentions,
                        t2idx,
                        dsdir,
                        vectorFile,
                        upto=-1)
        build_desc_features_ds(trnMentions,
                               devMentions,
                               tstMentions,
                               ent2tfidf_features_path,
                               t2idx,
                               dsdir,
                               vectorFile_words,
                               use_lowercase=True,
                               upto=-1)
    else:
        build_desc_features_ds(trnMentions,
                               devMentions,
                               tstMentions,
                               ent2tfidf_features_path,
                               t2idx,
                               dsdir,
                               vectorFile_words,
                               use_lowercase=True,
                               upto=-1)