def train(session,
		  features,
		  labels,
		  training_steps,
		  batch_size,
		  dropout_hidden,
		  dropout_embedding,
		  batch_gen=util.Batch_Gen,
		  report_at=0):
	'''
	Standard training function using dropout (on embedding and hidden layers).

	Parameters
	----------
	session : tf.Session
		Open session whose graph contains the named tensors/ops used below
		('training_objective', 'input_layer:0', 'output_layer:0', ...).
	features, labels :
		Training data; labels must expose .columns and .index (pandas-style)
		for the monitoring report.
	training_steps : int
		Number of optimization steps to run.
	batch_size : int
		Mini-batch size handed to the batch generator.
	dropout_hidden, dropout_embedding : float
		Dropout rates fed during training steps (forced to 0. for the
		monitoring forward pass).
	batch_gen : callable
		Factory (features, labels, batch_size) -> iterator of
		(batch_features, batch_labels) pairs.
	report_at : int
		If > 0, print step, loss and mean performance every `report_at` steps.
	'''
	sess=session
	# use the most recently registered loss in the graph's LOSSES collection
	loss=tf.get_collection(tf.GraphKeys.LOSSES)[-1]
	training_objective=tf.get_default_graph().get_operation_by_name('training_objective')
	batch_gen=batch_gen(features, labels, batch_size)
	for i_step in range(training_steps):
		# builtin next() works for generators and custom iterators alike;
		# the former .next() method call is Python-2-only and crashes on Py3
		curr_features, curr_labels=next(batch_gen)
		# monitor training: extra forward pass with dropout disabled
		if report_at>0 and i_step%report_at==0:
			curr_loss,preds=sess.run([loss,'output_layer:0'],
							 feed_dict={'input_layer:0':curr_features,
									    'dropout_embedding:0':.0,
									    'dropout_hidden:0':.0,
									    'actual_values:0':curr_labels})
			preds=pd.DataFrame(data=preds, columns=curr_labels.columns, index=curr_labels.index)
			perf=util.eval(true=curr_labels, prediction=preds)
			print(i_step, curr_loss, np.mean(perf))
		# actual training step with the requested dropout rates
		sess.run(training_objective,
				 feed_dict={'input_layer:0':curr_features,
						    'dropout_embedding:0':dropout_embedding,
						    'dropout_hidden:0':dropout_hidden,
						    'actual_values:0':curr_labels})
def test(session, features, labels):
	'''
	Evaluate the trained graph on held-out data.

	Runs a forward pass through 'output_layer:0' with both dropout rates
	set to 0., wraps the raw predictions in a DataFrame aligned with the
	gold labels, and returns the score computed by util.eval.
	'''
	feed={'input_layer:0':features,
		  'dropout_embedding:0':.0,
		  'dropout_hidden:0':.0}
	raw_preds=session.run('output_layer:0', feed_dict=feed)
	frame=pd.DataFrame(data=raw_preds,
					   columns=labels.columns,
					   index=labels.index)
	return util.eval(labels, frame)
예제 #3
0
 def eval(self, gold_lex):
     '''Predict ratings for the gold lexicon's words and score them with util.eval.'''
     predictions = self.predict(gold_lex.index)
     return util.eval(gold_lex, predictions)
예제 #4
0
def main(results_path='results', metric='r'):
    '''
    Run the full cross-validation benchmark over every entry in SETTINGS.

    For each setting: load labels and embeddings, build the model zoo,
    evaluate every model with 10-fold cross-validation, average the
    per-fold scores, and write one TSV per model to
    `<results_path>/<setting.name>/`. Settings whose output directory
    already exists are skipped.

    Parameters
    ----------
    results_path : str
        Root directory for result files (created if missing).
    metric : str
        Metric identifier handed to util.eval (default 'r' — presumably
        Pearson correlation; confirm against util.eval).
    '''
    RESULTS = results_path + '/'

    if not os.path.exists(RESULTS):
        os.makedirs(RESULTS)

    ### settings
    for setting in SETTINGS:
        print('Now processing {}'.format(setting.name))

        ### check if this setting has already been processed
        # presence of the per-setting output dir doubles as a "done" marker
        if os.path.isdir(RESULTS + setting.name):
            print('\t{} has already been processed!'.format(setting.name))
        else:

            labels = setting.load_data()
            embs = setting.load_embeddings()

            # model zoo: name -> model instance (some are pre-constructed
            # module-level objects, others are instantiated here)
            models = {
                'turney':
                turney.Bootstrapper(embs),
                'densifier':
                densifier.Densifier(embs),
                'my_model_relu':
                my_model_relu,
                'my_model_sigmoid':
                my_model_sigmoid,
                'aicyber':
                aicyber.mlp_ensemble(),
                'li_regressor':
                li_regressor(),
                'linear_model':
                li_regressor(init_fun=sklearn.linear_model.LinearRegression)
            }

            # one empty results frame per model; rows will be indexed by fold
            results_setting={key:pd.DataFrame(columns=labels.columns)\
             for key in list(models)}

            ### Crossvalidation
            k = 0
            for  train_index, test_index in KFold(n_splits=10, shuffle=True).\
               split(labels):
                k += 1
                train = labels.iloc[train_index]
                test = labels.iloc[test_index]
                print(k)

                # word embeddings looked up for the train/test vocabularies
                train_features = util.feature_extraction(train.index, embs)
                test_features = util.feature_extraction(test.index, embs)

                ### methods
                for model_name in list(models):
                    model = models[model_name]
                    print(model_name)

                    ### case distinction because models do not share the same
                    ###	interface
                    # wipe any tf graph left over from a previous model/fold
                    tf.reset_default_graph()
                    preds = None
                    if model_name in [
                            'aicyber', 'li_regressor', 'linear_model'
                    ]:
                        # sklearn-style interface: fit on features, predict features
                        model.fit(train_features.copy(), train.copy())
                        preds = model.predict(test_features.copy())
                    elif model_name in ['my_model_relu', 'my_model_sigmoid']:
                        # tf models: fit returns a session that predict consumes
                        # print(train)
                        # sess=tf.Session()
                        session = model.fit(train_features.copy(),
                                            train.copy())
                        preds = model.predict(test_features.copy(),
                                              session,
                                              var_names=train.columns)
                        del session
                    else:
                        # lexicon-based models: operate on words, not features
                        model.fit(train.copy())
                        preds = model.predict(test.index.copy())
                        ###
                        print(test)
                        print(preds)
                        ###
                    perf = util.eval(test, preds, metric)
                    print(perf)
                    # one row of per-column performance per fold
                    results_setting[model_name].loc[k] = perf
                    print(results_setting[model_name])

            os.makedirs(RESULTS + setting.name)
            ### after cv, for each individual results data frame, average results and save data
            for model_name in list(models):
                curr_results = results_setting[model_name]
                curr_results = util.average_results_df(curr_results)
                fname = '{}{}/{}.tsv'.format(RESULTS, setting.name, model_name)
                util.save_tsv(curr_results, fname)
            print('\tFinished processing {}'.format(setting.name))

            ### delete respective setting to free up memory
        del setting
예제 #5
0
   split(labels):
    k += 1
    train = labels.iloc[train_index]
    test = labels.iloc[test_index]
    print(k)

    for config in configs:
        print(config)

        threshold = config[0]
        alpha = config[1]

        ds.fit(seed_lexicon=train,
               binarization_threshold=threshold,
               alpha=alpha)
        prediction = ds.predict(words=test.index)
        performance = util.eval(test, prediction)
        print(performance)
        results_config[str(config)].loc[k] = performance

# Aggregate per-configuration cross-validation results: average each
# config's fold-wise scores, write one TSV per config, and record which
# (threshold, alpha) pair produced each file in a meta table.
meta_df = pd.DataFrame(columns=['threshold', 'alpha'])

for config in configs:
    results_df = results_config[str(config)]
    # collapse the per-fold rows into averaged results
    results_df = util.average_results_df(results_df)
    fname = 'results/{}.tsv'.format(str(config))
    util.save_tsv(results_df, fname)
    # meta row is keyed by the output filename
    meta_df.loc[fname] = config

# index: result filename; columns: threshold, alpha
util.save_tsv(meta_df, 'results/meta.tsv')
예제 #6
0
 def eval(self, gold_lex):
     '''
     Score induced ratings against a gold lexicon.

     Raises ValueError if fit() has not been run yet (i.e. no induced
     lexicon is available); otherwise predicts ratings for the gold
     lexicon's words and returns util.eval's score.
     '''
     # guard clause: predictions require a fitted model
     if self.induced_lexicon is None:
         raise ValueError(
             'Embeddings need to be transformed first! Run "fit"!')
     return util.eval(gold_lex, self.predict(gold_lex.index))