Example #1
	def test(self):
		#self.merge()
		#self.compress()
		#return
		embedding_size = 100
		for CLUSTER_MIN_SIZE in range(4,19,2):
			for dsname in ['webkb','er']:
				mln = MLN(dsname)
				db = DBManager(dsname,mln)
				print('merge db dom sizes:')
				dom_obj_map = db.get_dom_objs_map(mln,db.merge_db_file)
				cf = common_f()
				#cf.delete_files(mln.pickle_location)
				#cf.remove_irrelevant_atoms()
				embedding_size += 100
				embedding_size = embedding_size%1000

				db.set_atoms()
				bmf = bmf_cluster(dsname)
				bmf.cluster(db,1,mln.pdm,dom_obj_map)

				print('original db dom sizes(after compression):')
				orig_dom_objs_map = db.get_dom_objs_map(mln,mln.orig_db_file)
				CLUSTER_MIN_SIZE = 10
				w2v = word2vec(dsname,db,CLUSTER_MIN_SIZE,embedding_size)
				print('w2v cluster dom sizes:')
				w2v_dom_objs_map = db.get_dom_objs_map(mln,w2v.w2v__cluster_db_file)
				cr = cf.calculate_cr(orig_dom_objs_map,w2v_dom_objs_map)


				print('cr : ' + str(cr))
				rc = random_cluster(dsname)
				rc.generate_random_db(db,w2v.pred_atoms_reduced_numbers,mln,w2v_dom_objs_map)
				print('random cluster dom sizes')
				db.get_dom_objs_map(mln,mln.random__cluster_db_file)




				kmc = kmeans_cluster(dsname)
				kmc.cluster(db,str(cr),mln.pdm,w2v_dom_objs_map,mln.dom_pred_map)
				print('kmeans cluster dom sizes:')
				kmeans_dom_objs_map = db.get_dom_objs_map(mln,kmc.kmeans__cluster_db_file)
				mln.create_magician_mln()
				#magician(dsname,mln)
				tuffy(dsname)
				orig_meta_map = {}

				orig_meta_map['bmf'] = bmf.bmf_orig_meta_map
				orig_meta_map['w2v'] = w2v.w2v_orig_meta_map
				orig_meta_map['random'] = rc.rand_orig_meta_map
				orig_meta_map['kmeans'] = kmc.kmeans_orig_meta_map
				print('Dataset : ' + dsname +  '; CR : ' + str(cr))
				p = performance(dsname,embedding_size)
				p.compare_marginal(mln,orig_meta_map,cr)
				p.compare_map(mln,orig_meta_map,cr)
			break
Example #2
def exp_hidden_state(hidden_state_candidates):
	results = "batch_size = 512,nb_epoch = 70,timestep=50 \n"
	results += "hidden state size\tRMSE\tTPA\tTPPA\n"	
	for hidden_state in hidden_state_candidates:
		y_test, predictions = predict(512, 70, 50, hidden_state)  # batch_size=512, nb_epoch=70, timestep=50
		rmse, tp_acc, tpp_acc,cm1,cm2 = performance(y_test, predictions)
		results += str(hidden_state)+"\t"+str(rmse)+"\t"+str(tp_acc)+"\t"+str(tpp_acc )+"\n"

	with open("./exp/hiddenstate.txt","w") as file:
		file.write(results)
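A minimal driver sketch for the sweep above; the candidate values are illustrative placeholders, and exp_hidden_state (with its predict/performance dependencies) is assumed to be importable from the same module:

import os

if __name__ == "__main__":
	if not os.path.isdir("./exp"):
		os.makedirs("./exp")  # exp_hidden_state writes its table to ./exp/hiddenstate.txt
	hidden_state_candidates = [32, 50, 64, 128]  # illustrative values, not taken from the original experiments
	exp_hidden_state(hidden_state_candidates)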
Example #3
def exp_timesteps(hidden_state_candidates, timestep_candidates):
	results = "batch_size = 512,nb_epoch = 70 \n"
	for hidden_state in hidden_state_candidates:
		results += "hidden state size: " + str(hidden_state) +"------------------\n"
		results += "timestep size\tRMSE\tTPA\tTPPA\n"
		for timestep in timestep_candidates:
			y_test, predictions = predict(512,70,timestep,hidden_state)
			rmse,tp_acc,tpp_acc,cm1,cm2 = performance(y_test, predictions)
			results += str(timestep)+"\t"+str(rmse)+"\t"+str(tp_acc)+"\t"+str(tpp_acc )+"\n"

	with open("./exp/timestep.txt","w") as file:
		file.write(results)
Example #4
def exp_epoch(layers_candidates, timesteps_candidates, epoch_candidates):
	results = "batch_size = 512 \n"
	for layers in layers_candidates:
		for timesteps in timesteps_candidates:
			results += "layers: " + str(layers) + "; timesteps: " + str(timesteps) + "-------------------\n"
			results += "number of epoch\tRMSE\tTPA\tTPPA\n"
			for epoch in epoch_candidates:
				y_test, predictions = predict(batch_size = 512, nb_epoch = epoch, timestep = timesteps, hidden_state = 50, layers = layers)
				rmse,tp_acc,tpp_acc,cm1,cm2 = performance(y_test, predictions)
				results += str(epoch)+"\t"+str(rmse)+"\t"+str(tp_acc)+"\t"+str(tpp_acc )+"\n"
	with open("./exp/epoch.txt","w") as file:
		file.write(results)
Example #5
def exp_batchsize(layers_candidates, batchsize_candidates):
	results = "nb_epoch = 70, timestep=30 \n"
	for layers in layers_candidates:
		results += "layers: " + str(layers) + "---------------------\n"
		results += "batch_size\tRMSE\tTPA\tTPPA\n"
		for batch_size in batchsize_candidates:
			y_test, predictions = predict(batch_size = batch_size, nb_epoch = 70, timestep=30, hidden_state=50, layers = layers)
			rmse,tp_acc,tpp_acc,cm1,cm2 = performance(y_test, predictions)
			results += str(batch_size)+"\t"+str(rmse)+"\t"+str(tp_acc)+"\t"+str(tpp_acc )+"\n"

	with open("./exp/batchsize.txt","w") as file:
		file.write(results)
Example #6
def evaluate(date=yesterday,company={}):
	companies = Company.objects.filter(**company)
	c_list = []
	for _company in companies:
		c_dict = {"ticker":_company.ticker,}
		pred = predict(date=date,company=c_dict)
		perf = performance(date=date,company=c_dict)
		try:
			accu = 100*abs((perf['avg_change'] - pred['avg_change'])/perf['avg_change'])
		except Exception:
			accu = None
		try:
			p_accu = 100*abs((perf['avg_perc'] - pred['avg_perc'])/perf['avg_perc'])
		except Exception:
			p_accu = None
Example #7
def main():
    model = TextCNN(vocab_size, embedding_size, num_class).to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    train(model, criterion, optimizer)

    input_batch, target_batch = make_data(sen_length, sen_list, word2idx, label)
    model_name = "mdl/10_model.mdl"
    pre = predict(model, model_name, input_batch)
    pre_list = pre[:, 0].cpu().numpy()
    pre_list = pre_list.tolist()
    label_list = []
    for i in label:
        label_list.append(int(i))
    micro_F1, macro_F1, ave_acc = performance(label_list, pre_list)
    print("micro_F1:", micro_F1)
    print("macro_F1:", macro_F1)
    print(f"ave_acc:{ave_acc}")
Example #8
def exp_layer_depth(layers):
	prediction_len = 20
	predict(batch_size = 512, nb_epoch = 70,timestep = 50,hidden_state = 50, layers = layers, save = True,
		predict_multiple = True, prediction_len = prediction_len, predict_full = True)

	# Plot the predictions 
	orig = np.load("./data/y_test.npy")
	predictions = np.load("./data/predictions.npy")

	rmse,tp_acc,tpp_acc,cm1,cm2 = performance(orig, predictions)
	print "Tendency Prediction Confusion Matrix (0:down; 1:up or equal): "
	print cm1
	print "Turning Point Prediction Confusion Matrix (0:not a turning point; 1:summit; 2:vale): "
	print cm2
	print "rmse: " + str(rmse) + "TPA: " + str(tp_acc) + "TPPA: " + str(tpp_acc)

	predictions_multi = np.load("./data/predictions_multi.npy")
	predictions_full = np.load("./data/predictions_full.npy")
	fig = plt.figure(facecolor='white')
	plot_results(predictions, orig, fig, sublocation = 121, show = False)
	# plot_results(predictions_full, orig, fig, sublocation = 222, show = False)
	plot_results_multiple(predictions_multi, orig, prediction_len = prediction_len, 
		fig = fig, sublocation = 122, show = True)
Example #9
def MLPtrain_workflow(train, val, save_path, nb_members, nb_hid, nb_epochs):
    
    # generate folder for saved MLP networks and results 
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    os.makedirs(save_path)
    save_path_results = '{0}/results'.format(save_path)
    os.makedirs(save_path_results)
    save_path_MLPs = '{0}/saved_MLPs'.format(save_path)
    os.makedirs(save_path_MLPs)
    
    # -----------------------------------------------------------------------------
    # 1. TRAINING OF MLPS
    
    # determine variables used for MLP optimisation 
    variables = ['SD', 'SWE', 'Day of year', 'days without snow', 'number frost-defrost',
                 'accum pos degrees', 'average age SC', 'number layer', 'accum solid precip',
                 'accum solid precip in last 10 days', 'total precip last 10 days', 'average temp last 6 days']
    MLP_train = train[variables]
    # delete all rows with nan values 
    MLP_train = MLP_train.dropna()
    # select explanatory and target variables 
    y_train = MLP_train['SWE'].values.astype('float32') 
    x_train = MLP_train.drop('SWE', axis=1).values.astype('float32')
        
    # determination of MLP setup 
    activ_fc = 'tanh'
    init_w = 2
    init_b = 2
    optAlg = 'Adadelta'
    batch_size = 100
    shuf_data = 1
    
    # optimise MLP ensemble; trained MLP networks and the scaler for standardisation are saved to save_path
    MLP(x_train, y_train, nb_epochs, nb_members, nb_hid, save_path_MLPs,
        activ_fc, init_w, init_b, optAlg, batch_size, shuf_data)
    del train
    
    # -----------------------------------------------------------------------------
    # 2. EVALUATION ON VALIDATION DATA SET
    
    # Perturb SD on the validation data set and estimate SWE; evaluate the estimated SWE
    # against the observed SWE
    # load scaler for standardisation         
    [scalerIn, scalerOut] = pickle.load(open(save_path_MLPs + '/scaler', "rb"))
    
    # determine variables used for MLP optimisation 
    variables = ['SD', 'SWE', 'Day of year', 'days without snow', 'number frost-defrost',
                 'accum pos degrees', 'average age SC', 'number layer', 'accum solid precip',
                 'accum solid precip in last 10 days', 'total precip last 10 days', 'average temp last 6 days']
    MLP_val = val[variables]
    # delete all rows with nan values 
    MLP_val = MLP_val.dropna()
    # select explanatory and target variables 
    y_val = MLP_val['SWE'].values.astype('float32')    
    x_val = MLP_val.drop('SWE', axis=1).values.astype('float32')
    # find index for SD for perturbation 
    idx_SD = MLP_val.drop('SWE', axis=1).columns.get_loc('SD')
    
    # initialise matrix for ensemble with 400 members
    ensemble400 = np.empty((len(y_val), 20*20))
    
    # assign model setup 
    nb_members = 20 
    
    for mb in range(nb_members):
        # create network graph
        tf.reset_default_graph()
        imported_graph = tf.train.import_meta_graph(save_path_MLPs +  "/mb_{0}.ckpt.meta".format(mb))
        with tf.Session() as sess:
            # restore parameter
            imported_graph.restore(sess, save_path_MLPs +  "/mb_{0}.ckpt".format(mb))
            
            # get prediction with noisy inputs as an ensemble 
            for k in range(x_val.shape[0]):
                line_input = x_val[k, :]
                input_net = np.tile(line_input, (20, 1))
                SD = line_input[idx_SD]
                if SD < 20: 
                    SD_low = SD - 1
                    SD_high = SD + 1
                else: 
                    SD_low = SD * 0.95
                    SD_high = SD * 1.05
                SD_noise_1rec = np.random.uniform(low=SD_low, high=SD_high, size=20)
                input_net[:, idx_SD] = SD_noise_1rec
                input_net_std = scalerIn.transform(input_net)
                predict_std = sess.run("op_to_restore:0", feed_dict={"input:0": input_net_std}).flatten()
                ensemble400[k, mb*20:(mb+1)*20] = scalerOut.inverse_transform(predict_std).flatten()   
    
    # determine 20 quantiles, to get 20 members 
    ensemble20 = np.quantile(ensemble400, np.arange(0.025, 1, 1/20), axis=1).transpose()
    
    # save ensemble          
    pickle.dump(ensemble20, open(save_path_results + '/ensembleVal_SD_pt', "wb"))   

    # evaluate on validation data set            
    # apply performance function, saves graphics and results (as csv) in folder assigned above
    performance(ensemble20, y_val, save_path_results)
        
    print(datetime.now())
    print('Evaluation on validation data set done')
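The 400-member to 20-member reduction above is a single column-wise quantile call; a standalone sketch with synthetic data, using the same np.quantile levels as the workflow:

import numpy as np

# Each row of ensemble400 holds 400 perturbed predictions for one sample;
# 20 evenly spaced quantiles (0.025, 0.075, ..., 0.975) per row give the
# 20-member ensemble used for evaluation.
ensemble400 = np.random.normal(loc=100.0, scale=10.0, size=(5, 400))  # synthetic predictions
ensemble20 = np.quantile(ensemble400, np.arange(0.025, 1, 1/20), axis=1).transpose()
print(ensemble20.shape)  # (5, 20)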
Example #10
 def handle_nav(self):
     # Create the PdfPages object used to draw the PDF figures
     self.pdfs = PdfPages(self.full_dir[:-1] + '/allfigs.pdf')
     self.performance = performance(self.nav)
     self.performance.get_performance(foldername=self.folder_name[:-1])
     self.performance.plot_performance(foldername=self.folder_name[:-1], pdfs=self.pdfs)
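For context, a minimal sketch of the PdfPages pattern handle_nav relies on (standard matplotlib API; the figure content is a placeholder):

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Collect several figures into a single multi-page PDF, the same mechanism
# used above for allfigs.pdf; each savefig call appends one page.
with PdfPages("allfigs_demo.pdf") as pdfs:
    for i in range(3):
        fig = plt.figure()
        plt.plot(range(10), [x * (i + 1) for x in range(10)])
        pdfs.savefig(fig)
        plt.close(fig)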
Example #11
    def __init__(self,aircraft):
        self.aerodynamics     =aerodynamics.aerodynamics(aircraft)
        self.thrust           =thrust.thrustAnalysis(aircraft)
#        self.mass             =mass.generalAviationMass(aircraft)
        self.performance      =performance.performance(aircraft)
Example #12
from target import target  #importing Target Generation module
from sol import sol  #importing Solution module
from verify import verify  #importing verify module
from performance import performance  #importing performance module

if __name__ == '__main__':  #script entry point
    import sys  #importing sys module
    import os  #importing os module
    #First argument i.e sys.argv[0] is always aes.py
    #Second argument i.e sys.argv[1] is the called function
    #Comparing the second command line argument with respective functions to call
    if sys.argv[1] == 'target':  #Comparing with target
        target(sys.argv[2], sys.argv[3])  #Calling target function
    elif sys.argv[1] == 'sol':  #Comparing with sol
        sol(sys.argv[2], sys.argv[3], sys.argv[4])  #Calling sol function
    elif sys.argv[1] == 'verify':  #Comparing with verify
        verify(sys.argv[2], sys.argv[3], sys.argv[4])  #Calling verify function
    elif sys.argv[1] == 'performance':  #Comparing with performance
        performance(sys.argv[2])  #Calling performance function
Example #13
        datas = pd.read_excel('datas_final.xlsx',
                              index_col=0,
                              parse_dates=True).dropna()
        return datas


if __name__ == "__main__":
    datas = main().data_get()

    print("------------------Equal weighted------------------")
    weights_EW, result_EW = benchmark.benchmark(datas,
                                                period=para.period,
                                                rollingtime=para.rollingtime,
                                                method='EW')
    pd.DataFrame(
        performance.performance(result_EW)).to_csv(para.performance_output +
                                                   'result_EW_performance.csv')
    pd.DataFrame(performance.performance_anl(result_EW)).to_csv(
        para.performance_output + 'result_EW_performance_anl.csv')
    weights_EW.to_excel(para.weights_output + 'weights_EW.xlsx')
    result_EW.to_excel(para.results_output + 'result_EW.xlsx')

    print("------------------Variance equal weighted------------------")
    weights_EV, result_EV = benchmark.benchmark(datas,
                                                period=para.period,
                                                rollingtime=para.rollingtime,
                                                method='EV')
    pd.DataFrame(
        performance.performance(result_EV)).to_csv(para.performance_output +
                                                   'result_EV_performance.csv')
    pd.DataFrame(performance.performance_anl(result_EV)).to_csv(
        para.performance_output + 'result_EV_performance_anl.csv')
Example #14
 def initialize_performance(self):
     holding_days = pd.Series(self.bkt_position.holding_matrix.index, index=self.bkt_position.holding_matrix.index)
     holding_days = holding_days[self.bkt_start:self.bkt_end]
     self.bkt_performance = performance(self.account_value, benchmark = self.benchmark_value,
         info_series=self.info_series, risk_free_rate = self.bkt_data.const_data['risk_free_rate'],
         holding_days=holding_days, cash_ratio=self.real_pct_position.cash)
Example #15
def train_test(
        data, instance_testing_size, 
        forecast_horizon, feature_or_covariate_set, 
        history_length, model='knn', base_models=None,
        model_type='regression', model_parameters=None, 
        feature_scaler='logarithmic', target_scaler='logarithmic', 
        labels=None, performance_measures=['MAPE'], 
        performance_mode='normal', performance_report=True, 
        save_predictions=True, verbose=0):
    
    
    """
    Parameters:
        data:    Pandas DataFrame
            a preprocessed DataFrame to be used for training the model and making predictions on the test part
        
        instance_testing_size:    int or float
            the size of testing instances
        
        forecast_horizon:    int
            the forecast horizon, i.e. the gap taken into account in the data splitting process; by the gap we mean the
            number of temporal units excluded from the data to simulate a real prediction setting, in which we have no
            access to the information of the forecast_horizon - 1 units immediately before the time point of the target
            variable.

        feature_or_covariate_set:    list<string>
            a list of covariates or features on which the feature selection process is based; if historical data is
            provided, the input is treated as a feature list, otherwise as a covariate list

        history_length:    int
            history length of the input "data"; it is only used for the reports produced in "train_test"

        model:    string or callable or dict
            string: one of the pre-defined model names 
            function: a user-defined function
            dict: pre-defined model names and corresponding hyper parameters
            pre-defined model names: 'knn', 'nn' , 'gbm', 'glm'

        model_type:    string

        model_parameters:    list<int> or None

        feature_scaler:    string

        target_scaler:    string

        labels:    list<int> or None

        performance_measures:    list<string>
            a list of performance measures used to calculate the errors of the predictions on the test dataset
        
        performance_mode:    string

        performance_report:    bool
            if True, tables reporting the models and their corresponding errors (based on performance_measures)
            will be saved in the same directory
        
        save_predictions:    bool
            if True, the prediction values of the trained models for the training and validation data produced by the
            train_and_evaluate process will be saved as '.csv' files in the directory your program is running in
        
        verbose:    int
            the level of produced detailed logging information
            available options:
            0: no logging
            1: only important information logging 
            2: all details logging


    Returns:
        trained_model
            the model object fitted on the training data and returned by inner_train_evaluate
    """

    warnings.filterwarnings("once")

    ################################ checking for TypeError and other possible mistakes in the inputs
    if not(isinstance(data, pd.DataFrame)):
        raise TypeError("Expected a pandas DataFrame for data.")

    if not(isinstance(instance_testing_size, int) or isinstance(instance_testing_size, float)):
        raise TypeError("Expected an integer or a float number for instance_testing_size.")
    
    if not(isinstance(forecast_horizon, int)):
        raise TypeError("Expected an integer for forecast_horizon.")
    
    if not(isinstance(feature_or_covariate_set, list)):
        raise TypeError("Expected a list of strings for feature_or_covariate_set.")
    
    if not(isinstance(history_length, int)):
        raise TypeError("Expected an integer for history_length.")
    
    if not(isinstance(model, str) or callable(model) or isinstance(model, dict)):
        raise TypeError("Expected a string or function or a dictionary of model parameters for model.")
    
    if not(isinstance(model_type, str)):
        raise TypeError("Expected a string for model_type.")
    
    if not(isinstance(model_parameters, dict) or model_parameters == None):
        raise TypeError("Expected a dictionary or None value for model_parameters.")
    
    if not(isinstance(feature_scaler, str) or feature_scaler == None):
        raise TypeError("Expected a string or None value for feature_scaler.")
    
    if not(isinstance(target_scaler, str) or target_scaler == None):
        raise TypeError("Expected a string or None value for target_scaler.")

    if not(isinstance(labels, list) or labels == None):
        raise TypeError("Expected a list or None value for labels.")
    
    if not(isinstance(performance_measures, list)):
        raise TypeError("Expected a list for performance_measures.")
    
    if not(isinstance(performance_mode, str)):
        raise TypeError("Expected a string for performance_mode.")
    
    if not(isinstance(performance_report, bool)):
        raise TypeError("Expected a bool variable for performance_report.")
    
    if not(isinstance(save_predictions, bool)):
        raise TypeError("Expected a bool variable for save_predictions.")
    
    if not(isinstance(verbose, int)):
        raise TypeError("Expected an integer (0 or 1 or 2) for verbose.")
    ################################

    # classification checking
    if model_type == 'classification':
        if not set(performance_measures) <= set(configurations.CLASSIFICATION_PERFORMANCE_MEASURES):
            raise Exception("Error: The input 'performance_measures' is not valid according to 'model_type=classification'.")
        if performance_mode != 'normal':
            performance_mode = 'normal'
            print("Warning: The input 'performance_mode' is set to 'normal' according to model_type=classification'.")
        if target_scaler is not None:
            target_scaler = None
            print("Warning: The input 'target_scaler' is set to None according to model_type=classification'.")

    # get some information of the data
    target_mode, target_granularity, granularity, data = get_target_quantities(data=data.copy())
    
    # get the target temporal id from the temporal id
    # if 'target temporal id' is already in the data, the call comes from inside the predict function;
    # otherwise the backup file must be removed
    if 'target temporal id' in data.columns:
        data = data.rename(columns={'target temporal id':'temporal id'})
    else:
        data, _ = get_target_temporal_ids(temporal_data = data.copy(), forecast_horizon = forecast_horizon,
                                               granularity = granularity)
        if os.path.isfile('test_process_backup.csv'):
            os.remove('test_process_backup.csv')
    
    # check that rows related to future prediction have been removed, and remove them if not
    temp_data = data.sort_values(by = ['temporal id','spatial id']).copy()
    number_of_spatial_units = len(temp_data['spatial id'].unique())
    if all(temp_data.tail(granularity*forecast_horizon*number_of_spatial_units)['Target'].isna()):
        data = temp_data.iloc[:-(granularity*forecast_horizon*number_of_spatial_units)]
    
    # check if model is a string or function
    model_name = ''
    if isinstance(model, str) == False:
        model_name = model.__name__
        if model_name in ['nn', 'knn', 'glm', 'gbm']:
            raise TypeError("Name of the user defined model matches the name of one of our predefined models.")
    else:
        model_name = model

    # find labels for classification problem
    if labels == None:
        if model_type == 'regression':    # just an empty list
            labels = []
        elif model_type == 'classification':    # unique values in 'Target' column of data
            labels = data.Target.unique()
            labels.sort()

    # select features
    processed_data = select_features(
        data=data.copy(), 
        ordered_covariates_or_features=feature_or_covariate_set
    )

    # splitting the data in the way configured for train_test
    training_data, _, testing_data, gap_data = split_data(
        data=processed_data.copy(), 
        splitting_type='instance', 
        instance_testing_size=instance_testing_size, 
        instance_validation_size=None, 
        instance_random_partitioning=False, 
        fold_total_number=0, 
        fold_number=0, 
        forecast_horizon=forecast_horizon,         
        granularity=granularity, 
        verbose=verbose
    )

    # separate some data which are needed later
    base_data = training_data['Target'].values.tolist()
    training_target = training_data[['spatial id', 'temporal id', 'Target', 'Normal target']]
    test_target = testing_data[['spatial id', 'temporal id', 'Target', 'Normal target']]

    # scaling data
    training_data, testing_data = data_scaling(
        train_data=training_data.copy(), 
        test_data=testing_data.copy(), 
        feature_scaler=feature_scaler, 
        target_scaler=target_scaler
    )

    # training model with processed data    
    training_predictions, testing_predictions, trained_model, number_of_parameters = inner_train_evaluate(
        training_data=training_data.copy(), 
        validation_data=testing_data.copy(), 
        model=model, 
        model_type=model_type, 
        model_parameters=model_parameters, 
        labels=labels, 
        base_models = base_models,
        verbose=verbose
    )

    # target descale
    training_predictions = target_descale(
        scaled_data=list(training_predictions), 
        base_data=base_data, 
        scaler=target_scaler
    )

    testing_predictions = target_descale(
        scaled_data=list(testing_predictions), 
        base_data=base_data, 
        scaler=target_scaler
    )

    # checking for some files to exist which will be used in the next phases
    test_process_backup_file_name = 'test_process_backup.csv'
    if pathlib.Path(test_process_backup_file_name).is_file() == False:
        if model_type == 'regression':
            df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target', 'prediction'])
        elif model_type == 'classification':
            df = pd.DataFrame(columns=['spatial id', 'temporal id', 'Target', 'Normal target']+\
                              ['prediction class '+str(class_num) for class_num in range(np.array(testing_predictions).shape[1])])
        df.to_csv(test_process_backup_file_name, index=False)


    # getting back previous points (useful for one-by-one method, also works for one-as-whole method)
    previous_test_points = pd.read_csv(test_process_backup_file_name)

    # append current point to previous points
    test_target = test_target.append(previous_test_points[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True)
    if model_type == 'regression':
        previous_testing_predictions = previous_test_points['prediction'].tolist()
        testing_predictions = list(testing_predictions) + previous_testing_predictions
    elif model_type == 'classification':
        previous_testing_predictions = previous_test_points.filter(regex='^prediction class ',axis=1)
        testing_predictions = np.concatenate((np.array(testing_predictions),np.array(previous_testing_predictions)))
        testing_predictions_df = pd.DataFrame(testing_predictions)
        testing_predictions_df.columns = ['prediction class '+str(class_num) for class_num in testing_predictions_df.columns]

    # saving test_target+testing_predictions into a backup file to be used in the next point
    df_for_backup = test_target.copy()
    if model_type == 'regression':
        df_for_backup.insert(loc=len(df_for_backup.columns), column='prediction', value=testing_predictions)
    elif model_type == 'classification':
        df_for_backup = pd.concat([df_for_backup,testing_predictions_df],axis = 1)
    df_for_backup.to_csv(test_process_backup_file_name, index=False)

    # get normal data
    training_target, test_target, training_prediction, test_prediction = get_normal_target(
        training_target=training_target.append(gap_data[['spatial id', 'temporal id', 'Target', 'Normal target']], ignore_index=True), 
        test_target=test_target.copy(), 
        training_prediction=list(training_predictions) + gap_data['Target'].tolist(), 
        test_prediction=testing_predictions, 
        target_mode=target_mode, 
        target_granularity=target_granularity
    )

    # make copies of some data to be stored later
    test_target_normal, test_prediction_normal = test_target.copy(), test_prediction.copy()

    # including performance_mode
    training_target, test_target, training_prediction, test_prediction = apply_performance_mode(
        training_target=training_target.copy(), 
        test_target=test_target.copy(), 
        training_prediction=list(training_prediction), 
        test_prediction=test_prediction, 
        performance_mode=performance_mode
    )

    # computing trivial values for the test set (only needed when MASE is to be calculated)
    if 'MASE' in performance_measures:
        _, _, _, testing_true_values, testing_predicted_values, testing_trivial_values = get_trivial_values(
            train_true_values_df=training_target.copy(), 
            validation_true_values_df=test_target.copy(), 
            train_prediction=list(training_prediction), 
            validation_prediction=test_prediction, 
            forecast_horizon=forecast_horizon, 
            granularity=granularity
        )

        # computing performance on the test dataset
        test_prediction_errors = performance(
            true_values=testing_true_values, 
            predicted_values=testing_predicted_values, 
            performance_measures=performance_measures, 
            trivial_values=testing_trivial_values, 
            model_type=model_type, 
            num_params=number_of_parameters, 
            labels=labels)

    else:
        # computing performance on the test dataset
        test_prediction_errors = performance(
            true_values=test_target['Normal target'], 
            predicted_values=test_prediction, 
            performance_measures=performance_measures, 
            trivial_values=[], 
            model_type=model_type, 
            num_params=number_of_parameters, 
            labels=labels)
    
    # checking for the existence of some directories for logging purposes
    if pathlib.Path('prediction/test process').is_dir() == False:
        pathlib.Path('prediction/test process').mkdir(parents=True, exist_ok=True)
    if pathlib.Path('performance/test process').is_dir() == False:
        pathlib.Path('performance/test process').mkdir(parents=True, exist_ok=True)

    # saving predictions based on model_type
    pred_file_name = 'prediction/test process/test prediction forecast horizon = %s.csv' % (forecast_horizon)
    testing_predictions = np.array(testing_predictions)

    if save_predictions == True:
        if model_type == 'regression':
            df = pd.DataFrame()
            df['real'] = test_target_normal['Normal target'].values.tolist()
            df['prediction'] = list(test_prediction_normal)
            df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True)
            df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True)
            df.insert(0, 'model name', model_name, True)
            df.to_csv(pred_file_name, index=False)
        elif model_type == 'classification':
            df = pd.DataFrame()
            df['real'] = test_target_normal['Normal target'].values.tolist()
            for i in range(len(labels)):
                col_name = 'class ' + str(labels[i])
                df[col_name] = testing_predictions[:, i]
            df.insert(0, 'temporal id', test_target_normal['temporal id'].values.tolist(), True)
            df.insert(0, 'spatial id', test_target_normal['spatial id'].values.tolist(), True)
            df.insert(0, 'model name', model_name, True)
            df.to_csv(pred_file_name, index=False)
    
    # saving performance (same approach for both regression and classification)
    performance_file_name = 'performance/test process/test performance report forecast horizon = %s.csv' % (forecast_horizon)

    # selecting temporal and futuristic features or covariates from the feature_or_covariate_set list
    check_list = [item for item in feature_or_covariate_set if item.count(' ') != 0]

    # type_flag for detecting feature type (False) or covariate type (True)
    # check if all elements in check_list meet the condition for being covariate type
    type_flag = all(re.search(' t$', element) or re.search(' t[+]$', element) for element in check_list)

    processed_feature_or_covariate_set = []    # a list to be saved in performance report file

    if type_flag == 1:
        for item in feature_or_covariate_set:
            if item.count(' ') != 0:
                processed_feature_or_covariate_set.append(item[:-2])
            else:
                processed_feature_or_covariate_set.append(item)
    else:
        processed_feature_or_covariate_set = feature_or_covariate_set.copy()
        
    if performance_report == True:
        df_data = {
            'model name': list([model_name]), 
            'history length': list([history_length]), 
            'feature or covariate set': ', '.join(processed_feature_or_covariate_set)
        }
        df = pd.DataFrame(df_data, columns=list(df_data.keys()))
        for i in range(len(performance_measures)):
            df[performance_measures[i]] = list([float(test_prediction_errors[i])])
        df.to_csv(performance_file_name, index=False)
    
    return trained_model
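A hedged call sketch for train_test; the file name, feature names, and parameter values below are hypothetical placeholders, not taken from the original pipeline:

import pandas as pd

# Hypothetical invocation; the frame is assumed to be the preprocessed output
# of the package's earlier steps (with 'spatial id', 'temporal id', 'Target', ...).
data = pd.read_csv("preprocessed.csv")
trained_model = train_test(
    data=data,
    instance_testing_size=0.2,
    forecast_horizon=2,
    feature_or_covariate_set=["temperature t", "humidity t-1"],  # illustrative feature names
    history_length=2,
    model="knn",
    model_type="regression",
    performance_measures=["MAPE"],
    performance_report=True,
    save_predictions=True,
    verbose=0,
)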
Example #16
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/pretrain/network_model_pretrain.best"
    print >> sys.stderr, "Read model from ./model/model.pkl"
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)

    "Building torch model"
    network_model = network.Network(pair_feature_dimention,
                                    mention_feature_dimention,
                                    word_embedding_dimention, span_dimention,
                                    1000, embedding_size, embedding_dimention,
                                    embedding_matrix).cuda()
    print >> sys.stderr, "save model ..."
    #torch.save(network_model,network_file)

    net_copy(network_model, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    test_docs = DataReader.DataGnerater("test" + reduced)

    l2_lambda = 1e-6
    lr = 0.0002
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    model_save_dir = "./model/pretrain/"

    last_cost = 0.0
    all_best_results = {
        'thresh': 0.0,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0
    }

    for echo in range(100):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        #if echo == 100:
        #    lr = lr/2.0
        #if echo == 150:
        #    lr = lr/2.0

        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        optimizer = optim.RMSprop(network_model.parameters(),
                                  lr=lr,
                                  eps=1e-5,
                                  weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0

        pair_nums = 0
        ana_nums = 0

        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        for data in train_docs.train_generater(shuffle=shuffle, top=True):

            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,top_x = data
            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            reindex = autograd.Variable(
                torch.from_numpy(top_x["score_index"]).type(
                    torch.cuda.LongTensor))

            start_index = autograd.Variable(
                torch.from_numpy(top_x["starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(
                torch.from_numpy(top_x["ends"]).type(torch.cuda.LongTensor))

            top_gold = autograd.Variable(
                torch.from_numpy(top_x["top_gold"]).type(
                    torch.cuda.FloatTensor))

            anaphoricity_gold = anaphoricity_target.tolist()
            ana_lable = autograd.Variable(
                torch.cuda.FloatTensor([anaphoricity_gold]))

            optimizer.zero_grad()

            output, output_reindex = network_model.forward_top_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                reindex, start_index, end_index, dropout_rate)
            loss = F.binary_cross_entropy(
                output, top_gold,
                size_average=False) / train_docs.scale_factor_top

            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, dropout_rate)
            ana_loss = F.binary_cross_entropy(
                ana_output, ana_lable,
                size_average=False) / train_docs.anaphoricity_scale_factor_top

            loss_all = loss + ana_loss

            loss_all.backward()
            pair_cost_this_turn += loss.data[0]
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain", echo, "Pair total cost:", pair_cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print >> sys.stderr, "Learning Rate", lr

        print >> sys.stderr, "save model ..."
        torch.save(network_model,
                   model_save_dir + "network_model_pretrain.%d.top" % echo)

        #if cost_this_turn > last_cost:
        #    lr = lr*0.7
        gold = []
        predict = []

        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False, top=True):

            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target, top_x = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            reindex = autograd.Variable(
                torch.from_numpy(top_x["score_index"]).type(
                    torch.cuda.LongTensor))
            start_index = autograd.Variable(
                torch.from_numpy(top_x["starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(
                torch.from_numpy(top_x["ends"]).type(torch.cuda.LongTensor))

            gold += top_x["top_gold"].tolist()
            ana_gold += anaphoricity_target.tolist()

            output, output_reindex = network_model.forward_top_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                reindex, start_index, end_index, 0.0)

            predict += output.data.cpu().numpy().tolist()

            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()

        gold = numpy.array(gold, dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }

        thresh_list = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Pair accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush()

        if best_results["f1"] > all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model,
                       model_save_dir + "network_model_pretrain.top.best")

        ana_gold = numpy.array(ana_gold, dtype=numpy.int32)
        ana_predict = numpy.array(ana_predict)
        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        for thresh in thresh_list:
            evaluation_results = get_metrics(ana_gold, ana_predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results
        print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush()

        if (echo + 1) % 10 == 0:
            best_network_model = torch.load(model_save_dir +
                                            "network_model_pretrain.top.best")
            print "DEV:"
            performance.performance(dev_docs, best_network_model)
            print "TEST:"
            performance.performance(test_docs, best_network_model)
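The dev-set evaluation above runs the same threshold sweep twice; a hedged helper sketch, assuming get_metrics(gold, predict, thresh) returns a dict with an 'f1' key as in the code:

def select_best_threshold(gold, predict, thresh_list=(0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6)):
    # pick the threshold whose metrics dict has the highest F1
    best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    for thresh in thresh_list:
        evaluation_results = get_metrics(gold, predict, thresh)
        if evaluation_results["f1"] >= best_results["f1"]:
            best_results = evaluation_results
    return best_results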
Example #17
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    #network_file = "./model/model.pkl"
    #network_file = "./model/pretrain/network_model_pretrain.20"
    network_file = "./model/pretrain/network_model_pretrain.top.best"
    if os.path.isfile(network_file):
        print >> sys.stderr, "Read model from ./model/model.pkl"
        network_model = torch.load(network_file)
    else:
        embedding_matrix = numpy.load(embedding_file)
        #print len(embedding_matrix)

        "Building torch model"
        network_model = network.Network(pair_feature_dimention,
                                        mention_feature_dimention,
                                        word_embedding_dimention,
                                        span_dimention, 1000, embedding_size,
                                        embedding_dimention,
                                        embedding_matrix).cuda()
        print >> sys.stderr, "save model ..."
        torch.save(network_model, network_file)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    #train_docs = DataReader.DataGnerater("dev"+reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    #test_docs = DataReader.DataGnerater("test"+reduced)

    l2_lambda = 1e-6
    lr = 0.00002
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    reinforce = True

    model_save_dir = "./model/pretrain/"

    metrics = performance.performance(dev_docs, network_model)

    p, r, f = metrics["b3"]

    f_b = [f]

    #for echo in range(30,200):
    for echo in range(20):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        #if echo == 100:
        #    lr = lr/2.0
        #if echo == 150:
        #    lr = lr/2.0

        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        cost = 0.0
        optimizer = optim.RMSprop(network_model.parameters(),
                                  lr=lr,
                                  eps=1e-5,
                                  weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0

        pair_nums = 0
        ana_nums = 0

        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        score_softmax = nn.Softmax()

        cluster_info = []
        new_cluster_num = 0
        cluster_info.append(-1)
        action_list = []
        new_cluster_info = []
        tmp_data = []

        #for data in train_docs.rl_case_generater():
        for data in train_docs.rl_case_generater(shuffle=True):
            inside_time += 1

            this_doc = train_docs
            tmp_data.append(data)

            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return = data

            gold_chain = this_doc.gold_chain[rl["did"]]
            gold_dict = {}
            for chain in gold_chain:
                for item in chain:
                    gold_dict[item] = chain

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                dropout_rate)
            ana_output, ana_score = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, dropout_rate)

            reindex = autograd.Variable(
                torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))

            scores_reindex = torch.transpose(
                torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]
            #scores_reindex = torch.transpose(torch.cat((pair_score,-1-0.3*ana_score),1),0,1)[reindex]

            for s, e in zip(rl["starts"], rl["ends"]):
                #action_prob: scores_reindex[s:e][1]
                score = score_softmax(
                    torch.transpose(scores_reindex[s:e], 0,
                                    1)).data.cpu().numpy()[0]
                this_action = utils.sample_action(score)
                #this_action = ac_list.index(max(score.tolist()))
                action_list.append(this_action)

                if this_action == len(score) - 1:
                    should_cluster = new_cluster_num
                    new_cluster_num += 1
                    new_cluster_info.append(1)
                else:
                    should_cluster = cluster_info[this_action]
                    new_cluster_info.append(0)

                cluster_info.append(should_cluster)

            if rl["end"] == True:
                ev_document = utils.get_evaluation_document(
                    cluster_info, this_doc.gold_chain[rl["did"]],
                    candi_ids_return, new_cluster_num)
                p, r, f = evaluation.evaluate_documents([ev_document],
                                                        evaluation.b_cubed)
                trick_reward = utils.get_reward_trick(cluster_info, gold_dict,
                                                      new_cluster_info,
                                                      action_list,
                                                      candi_ids_return)

                #reward = f + trick_reward
                average_f = float(sum(f_b)) / len(f_b)

                reward = (f - average_f) * 10

                f_b.append(f)
                if len(f_b) > 128:
                    f_b = f_b[1:]

                index = 0
                for data in tmp_data:
                    mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
                    target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return = data

                    mention_index = autograd.Variable(
                        torch.from_numpy(mention_word_index).type(
                            torch.cuda.LongTensor))
                    mention_span = autograd.Variable(
                        torch.from_numpy(mention_span).type(
                            torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(
                        torch.from_numpy(candi_word_index).type(
                            torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(
                        torch.from_numpy(candi_span).type(
                            torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(
                        torch.from_numpy(feature_pair).type(
                            torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(
                        torch.from_numpy(pair_anaphors).type(
                            torch.cuda.LongTensor))
                    antecedents = autograd.Variable(
                        torch.from_numpy(pair_antecedents).type(
                            torch.cuda.LongTensor))

                    anaphoricity_index = autograd.Variable(
                        torch.from_numpy(anaphoricity_word_indexs).type(
                            torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(
                        torch.from_numpy(anaphoricity_spans).type(
                            torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(
                        torch.from_numpy(anaphoricity_features).type(
                            torch.cuda.FloatTensor))

                    rl_costs = autograd.Variable(
                        torch.from_numpy(rl["costs"]).type(
                            torch.cuda.FloatTensor))
                    rl_costs = torch.transpose(rl_costs, 0, 1)

                    output, pair_score = network_model.forward_all_pair(
                        word_embedding_dimention, mention_index, mention_span,
                        candi_index, candi_spans, pair_feature, anaphors,
                        antecedents, dropout_rate)
                    ana_output, ana_score = network_model.forward_anaphoricity(
                        word_embedding_dimention, anaphoricity_index,
                        anaphoricity_span, anaphoricity_feature, dropout_rate)

                    reindex = autograd.Variable(
                        torch.from_numpy(rl["reindex"]).type(
                            torch.cuda.LongTensor))

                    optimizer.zero_grad()
                    loss = None
                    scores_reindex = torch.transpose(
                        torch.cat((pair_score, ana_score), 1), 0, 1)[reindex]
                    #scores_reindex = torch.transpose(torch.cat((pair_score,-1-0.3*ana_score),1),0,1)[reindex]

                    for s, e in zip(rl["starts"], rl["ends"]):
                        #action_prob: scores_reindex[s:e][1]
                        this_action = action_list[index]
                        #current_reward = reward + trick_reward[index]
                        current_reward = reward

                        #this_loss = -reward*(torch.transpose(F.log_softmax(torch.transpose(scores_reindex[s:e],0,1)),0,1)[this_action])
                        this_loss = -current_reward * (torch.transpose(
                            F.log_softmax(
                                torch.transpose(scores_reindex[s:e], 0, 1)), 0,
                            1)[this_action])

                        if loss is None:
                            loss = this_loss
                        else:
                            loss += this_loss
                        index += 1
                    #loss /= len(rl["starts"])
                    loss /= len(rl["starts"])
                    #loss = loss/train_docs.scale_factor
                    ## policy gradient
                    cost += loss.data[0]
                    loss.backward()
                    optimizer.step()

                new_cluster_num = 0
                cluster_info = []
                cluster_info.append(-1)
                tmp_data = []
                action_list = []
                new_cluster_info = []
            #if inside_time%50 == 0:
            #    performance.performance(dev_docs,network_model)
            #    print
            #    sys.stdout.flush()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print >> sys.stderr, "cost:", cost
        #print >> sys.stderr,"save model ..."
        #torch.save(network_model, model_save_dir+"network_model_pretrain.%d"%echo)

        performance.performance(dev_docs, network_model)

        sys.stdout.flush()
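The loop above accumulates a REINFORCE-style term, -reward * log_softmax(scores)[action], for each decision in the batch, averages it over the decisions, and back-propagates it. A minimal standalone sketch of that single term, assuming current PyTorch and illustrative tensors rather than the repository's real score layout:

import torch
import torch.nn.functional as F

# Toy candidate scores for one anaphor; requires_grad stands in for model outputs.
scores = torch.tensor([0.2, 1.3, -0.5], requires_grad=True)
action = 1        # antecedent index chosen on the sampled trajectory
reward = 0.67     # e.g. the document-level coreference F1 used as the reward above

loss = -reward * F.log_softmax(scores, dim=0)[action]
loss.backward()   # gradient pushes probability mass toward rewarded actions
print(scores.grad)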
Example #18
0
    def open_performance(self):
        t2 = Toplevel(self.master)
        self.master.withdraw()
        performance(t2, self.master, self.previous)
        t2.wm_protocol("WM_DELETE_WINDOW", self.previous.destroy)
Example #19
0
    def initialize_performance(self):
        holding_days = pd.Series(self.bkt_position.holding_matrix.index, index=self.bkt_position.holding_matrix.index)
        holding_days = holding_days[self.bkt_start:self.bkt_end]
        self.bkt_performance = performance(self.account_value, benchmark=self.benchmark_value,
            info_series=self.info_series, risk_free_rate=self.risk_free_rate, holding_days=holding_days)
Example #20
0
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best"
    print >> sys.stderr,"Read model from",best_network_file
    best_network_model = torch.load(best_network_file)
        
    embedding_matrix = numpy.load(embedding_file)

    "Building torch model"
    network_model = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda()
    print >> sys.stderr,"save model ..."

    net_copy(network_model,best_network_model)

    reduced=""
    if args.reduced == 1:
        reduced="_reduced"

    print >> sys.stderr,"prepare data for train ..."
    train_docs = DataReader.DataGnerater("train"+reduced)
    print >> sys.stderr,"prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev"+reduced)
    test_docs = DataReader.DataGnerater("test"+reduced)


    l2_lambda = 1e-6
    lr = nnargs["lr"]
    dropout_rate = nnargs["dropout_rate"]
    epoch = nnargs["epoch"]

    model_save_dir = "./model/bp/"
   
    last_cost = 0.0
    all_best_results = {
        'thresh': 0.0,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0
        }
  
    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-5)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=75, gamma=0.5)

    for echo in range(epoch):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:",echo
        
        scheduler.step()

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0

        pair_nums = 0
        ana_nums = 0

        for data in train_docs.train_generater(shuffle=True):

            mention_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(data["candi_word_index"]).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(data["candi_span"]).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(data["pair_features"]).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(data["pair_anaphors"]).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(data["pair_antecedents"]).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(data["anaphoricity_feature"]).type(torch.cuda.FloatTensor))
            
            reindex = autograd.Variable(torch.from_numpy(data["top_score_index"]).type(torch.cuda.LongTensor))
            start_index = autograd.Variable(torch.from_numpy(data["top_starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(torch.from_numpy(data["top_ends"]).type(torch.cuda.LongTensor))
            top_gold = autograd.Variable(torch.from_numpy(data["top_gold"]).type(torch.cuda.FloatTensor))

            anaphoricity_target = data["anaphoricity_target"]
            anaphoricity_gold = anaphoricity_target.tolist()
            ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold]))

            optimizer.zero_grad()

            output,output_reindex = network_model.forward_top_pair(nnargs["word_embedding_dimention"],mention_index,mention_span,candi_index,candi_spans,pair_feature,anaphors,antecedents,reindex,start_index,end_index,dropout_rate)
            loss = F.binary_cross_entropy(output,top_gold,size_average=False)/train_docs.scale_factor_top

            ana_output,_,_ = network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
            ana_loss = F.binary_cross_entropy(ana_output,ana_lable,size_average=False)/train_docs.anaphoricity_scale_factor_top

            loss_all = loss + ana_loss    
            
            loss_all.backward()
            pair_cost_this_turn += loss.data[0]
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain",echo,"Pair total cost:",pair_cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time)
        print >> sys.stderr, "Learning Rate",lr

        gold = []
        predict = []

        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False):

            mention_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(data["candi_word_index"]).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(data["candi_span"]).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(data["pair_features"]).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(data["pair_anaphors"]).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(data["pair_antecedents"]).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(data["anaphoricity_feature"]).type(torch.cuda.FloatTensor))

            
            reindex = autograd.Variable(torch.from_numpy(data["top_score_index"]).type(torch.cuda.LongTensor))
            start_index = autograd.Variable(torch.from_numpy(data["top_starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(torch.from_numpy(data["top_ends"]).type(torch.cuda.LongTensor))
            top_gold = autograd.Variable(torch.from_numpy(data["top_gold"]).type(torch.cuda.FloatTensor))

            anaphoricity_target = data["anaphoricity_target"]
            anaphoricity_gold = anaphoricity_target.tolist()
            ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold]))
            
            gold += data["top_gold"].tolist()
            ana_gold += anaphoricity_target.tolist()
        
            output,output_reindex = network_model.forward_top_pair(nnargs["word_embedding_dimention"],mention_index,mention_span,candi_index,candi_spans,pair_feature,anaphors,antecedents,reindex,start_index,end_index,0.0)

            predict += output.data.cpu().numpy().tolist()

            ana_output,_,_ = network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()
        
        gold = numpy.array(gold,dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }

        thresh_list = [0.3,0.35,0.4,0.45,0.5,0.55,0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results
 
        print "Pair accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush() 

        if best_results["f1"] >= all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model, model_save_dir+"network_model_pretrain.best.top")

        ana_gold = numpy.array(ana_gold,dtype=numpy.int32)
        ana_predict = numpy.array(ana_predict)
        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        for thresh in thresh_list:
            evaluation_results = get_metrics(ana_gold, ana_predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results
        print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush() 

        if (echo+1)%10 == 0:
            best_network_model = torch.load(model_save_dir+"network_model_pretrain.best.top") 
            print "DEV:"
            performance.performance(dev_docs,best_network_model)
            print "TEST:"
            performance.performance(test_docs,best_network_model)
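The dev-set evaluation above sweeps a small list of thresholds and keeps whichever yields the best F1; get_metrics is the project's own helper. A standalone sketch of the same sweep with a stand-in get_metrics (assumed shape, not the repository's implementation):

import numpy as np

def get_metrics(gold, scores, thresh):
    # Stand-in: binarize scores at thresh and report standard classification metrics.
    pred = (scores >= thresh).astype(int)
    tp = int(((pred == 1) & (gold == 1)).sum())
    fp = int(((pred == 1) & (gold == 0)).sum())
    fn = int(((pred == 0) & (gold == 1)).sum())
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"thresh": thresh, "accuracy": float((pred == gold).mean()),
            "precision": precision, "recall": recall, "f1": f1}

gold = np.array([1, 0, 1, 1, 0])
scores = np.array([0.8, 0.4, 0.55, 0.3, 0.2])
best = max((get_metrics(gold, scores, t) for t in [0.3, 0.4, 0.5, 0.6]),
           key=lambda r: r["f1"])
print(best)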
Example #21
0
    def performance(self):
        self.performance = performance()
Example #22
0
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    worker = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(worker, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    manager = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(manager, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    #train_docs_iter = DataReader.DataGnerater("train"+reduced)
    train_docs_iter = DataReader.DataGnerater("dev" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "***"
    print
    sys.stdout.flush()

    lr = nnargs["lr"]
    top_k = nnargs["top_k"]

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6)
    optimizer_worker = optim.RMSprop(worker.parameters(), lr=lr, eps=1e-6)

    MAX_AVE = 2048

    for echo in range(nnargs["epoch"]):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        reward_log = Logger(Tensorboard + args.tb +
                            "/acl2018/%d/reward/" % echo,
                            flush_secs=3)
        entropy_log_manager = Logger(Tensorboard + args.tb +
                                     "/acl2018/%d/entropy/worker" % echo,
                                     flush_secs=3)
        entropy_log_worker = Logger(Tensorboard + args.tb +
                                    "/acl2018/%d/entropy/manager" % echo,
                                    flush_secs=3)

        #train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        ave_reward = []
        ave_manager_entropy = []
        ave_worker_entropy = []

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        cluster_info = {0: [0]}
        cluster_list = [0]
        current_new_cluster = 1
        predict_action_embedding = []
        choose_action = []
        mid = 1

        step = 0

        statistic = {
            "worker_hits": 0,
            "manager_hits": 0,
            "total": 0,
            "manager_predict_last": 0,
            "worker_predict_last": 0
        }

        for data in train_docs_iter.rl_case_generater(shuffle=True):

            rl = data["rl"]

            scores_manager, representations_manager = get_score_representations(
                manager, data)

            for s, e in zip(rl["starts"], rl["ends"]):
                action_embeddings = representations_manager[s:e]

                probs = F.softmax(torch.transpose(scores_manager[s:e], 0, 1))

                m = Categorical(probs)
                this_action = m.sample()
                index = this_action.data.cpu().numpy()[0]

                if index == (e - s - 1):
                    should_cluster = current_new_cluster
                    cluster_info[should_cluster] = []
                    current_new_cluster += 1
                else:
                    should_cluster = cluster_list[index]

                choose_action.append(index)
                cluster_info[should_cluster].append(mid)
                cluster_list.append(should_cluster)
                mid += 1

                cluster_indexs = torch.cuda.LongTensor(
                    cluster_info[should_cluster])
                action_embedding_predict = torch.mean(
                    action_embeddings[cluster_indexs], 0, keepdim=True)
                predict_action_embedding.append(action_embedding_predict)

            tmp_data.append(data)

            if rl["end"] == True:

                inside_index = 0
                manager_path = []
                worker_path = []

                doc = docs_by_id[rl["did"]]

                for data in tmp_data:

                    rl = data["rl"]
                    pair_target = data["pair_target"]
                    anaphoricity_target = 1 - data["anaphoricity_target"]
                    target = numpy.concatenate(
                        (pair_target, anaphoricity_target))[rl["reindex"]]

                    scores_worker, representations_worker = get_score_representations(
                        worker, data)

                    for s, e in zip(rl["starts"], rl["ends"]):
                        action_embeddings = representations_worker[s:e]
                        score = score_softmax(
                            torch.transpose(scores_worker[s:e], 0,
                                            1)).data.cpu().numpy()[0]

                        action_embedding_choose = predict_action_embedding[
                            inside_index]
                        similarities = torch.sum(
                            torch.abs(action_embeddings -
                                      action_embedding_choose), 1)
                        similarities = similarities.data.cpu().numpy()

                        action_probabilities = []
                        action_list = []
                        action_candidates = heapq.nlargest(
                            top_k, -similarities)
                        for action in action_candidates:
                            action_index = numpy.argwhere(
                                similarities == -action)[0][0]
                            action_probabilities.append(score[action_index])
                            action_list.append(action_index)

                        manager_action = choose_action[inside_index]
                        if not manager_action in action_list:
                            action_list.append(manager_action)
                            action_probabilities.append(score[manager_action])

                        this_target = target[s:e]
                        manager_action = choose_action[inside_index]

                        sample_action = utils.sample_action(
                            numpy.array(action_probabilities))
                        worker_action = action_list[sample_action]

                        if this_target[worker_action] == 1:
                            statistic["worker_hits"] += 1
                        if this_target[manager_action] == 1:
                            statistic["manager_hits"] += 1
                        if worker_action == (e - s - 1):
                            statistic["worker_predict_last"] += 1
                        if manager_action == (e - s - 1):
                            statistic["manager_predict_last"] += 1
                        statistic["total"] += 1

                        inside_index += 1

                        #link = manager_action
                        link = worker_action
                        m1, m2 = rl['ids'][s + link]
                        doc.link(m1, m2)

                        manager_path.append(manager_action)
                        worker_path.append(worker_action)

                reward = doc.get_f1()
                for data in tmp_data:
                    rl = data["rl"]
                    for s, e in zip(rl["starts"], rl["ends"]):
                        ids = rl['ids'][s:e]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][s:e]
                        for ant_ind in range(e - s):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)
                        #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))

                inside_index = 0
                worker_entropy = 0.0

                for data in tmp_data:
                    new_step = step
                    # worker
                    scores_worker, representations_worker = get_score_representations(
                        worker, data, dropout=nnargs["dropout_rate"])
                    optimizer_worker.zero_grad()
                    worker_loss = None
                    rl = data["rl"]
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        action = worker_path[inside_index]
                        score = F.softmax(
                            torch.transpose(scores_worker[s:e], 0, 1))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        score = torch.squeeze(score)

                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(
                            score[action]) * -1.0 * (reward - baseline)

                        if worker_loss is None:
                            worker_loss = this_cost
                        else:
                            worker_loss += this_cost
                        worker_entropy += torch.sum(
                            score * torch.log(score + 1e-7)
                        ).data.cpu().numpy()[
                            0]  #+ 0.001*torch.sum(score*torch.log(score+1e-7))
                        inside_index += 1

                    worker_loss.backward()
                    torch.nn.utils.clip_grad_norm(worker.parameters(),
                                                  nnargs["clip"])
                    optimizer_worker.step()

                    ave_worker_entropy.append(worker_entropy)
                    if len(ave_worker_entropy) >= MAX_AVE:
                        ave_worker_entropy = ave_worker_entropy[1:]
                    entropy_log_worker.log_value(
                        'entropy',
                        float(sum(ave_worker_entropy)) /
                        float(len(ave_worker_entropy)), new_step)
                    new_step += 1

                inside_index = 0
                manager_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    rl = data["rl"]

                    ave_reward.append(reward)
                    if len(ave_reward) >= MAX_AVE:
                        ave_reward = ave_reward[1:]
                    reward_log.log_value(
                        'reward',
                        float(sum(ave_reward)) / float(len(ave_reward)),
                        new_step)

                    scores_manager, representations_manager = get_score_representations(
                        manager, data, dropout=nnargs["dropout_rate"])

                    optimizer_manager.zero_grad()
                    manager_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        score = F.softmax(
                            torch.transpose(scores_manager[s:e], 0, 1))
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        if not score.size()[1] == costs.size()[0]:
                            continue

                        action = manager_path[inside_index]
                        score = torch.squeeze(score)

                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(score[action]) * -1.0 * (
                            reward - baseline
                        )  # + 0.001*torch.sum(score*torch.log(score+1e-7))

                        #this_cost = torch.sum(score*costs) + 0.001*torch.sum(score*torch.log(score+1e-7))

                        if manager_loss is None:
                            manager_loss = this_cost
                        else:
                            manager_loss += this_cost

                        manager_entropy += torch.sum(
                            score *
                            torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        inside_index += 1

                    manager_loss.backward()
                    torch.nn.utils.clip_grad_norm(manager.parameters(),
                                                  nnargs["clip"])
                    optimizer_manager.step()

                    ave_manager_entropy.append(manager_entropy)
                    if len(ave_manager_entropy) >= MAX_AVE:
                        ave_manager_entropy = ave_manager_entropy[1:]
                    entropy_log_manager.log_value(
                        'entropy',
                        float(sum(ave_manager_entropy)) /
                        float(len(ave_manager_entropy)), new_step)
                    new_step += 1

                step = new_step
                tmp_data = []
                cluster_info = {0: [0]}
                cluster_list = [0]
                current_new_cluster = 1
                mid = 1
                predict_action_embedding = []
                choose_action = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)
        print >> sys.stderr, "save model ..."
        #print "Top k",top_k
        print "Worker Hits", statistic[
            "worker_hits"], "Manager Hits", statistic[
                "manager_hits"], "Total", statistic["total"]
        print "Worker predict last", statistic[
            "worker_predict_last"], "Manager predict last", statistic[
                "manager_predict_last"]
        #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)

        print "DEV"
        metric = performance.performance(dev_docs_iter, worker, manager)
        print "Average:", metric["average"]
        print "DEV manager"
        metric = performance_manager.performance(dev_docs_iter, worker,
                                                 manager)
        print "Average:", metric["average"]
        print "TEST"
        metric = performance.performance(test_docs_iter, worker, manager)
        print "Average:", metric["average"]
        print
        sys.stdout.flush()
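Both the manager and the worker updates above scale the negative log-probability of the chosen action by (reward - baseline), where the baseline is the expected cost of the candidate links under the current policy. A minimal sketch of that single term with made-up tensors (not the repository's scores or costs):

import torch
import torch.nn.functional as F

scores = torch.tensor([0.1, 0.9, -0.2], requires_grad=True)  # candidate link scores
costs = torch.tensor([0.3, 0.0, 0.5])    # per-candidate costs from hypothetical links
reward = 0.62                            # document-level F1 of the sampled trajectory
action = 1                               # action taken on the sampled path

probs = F.softmax(scores, dim=0)
baseline = torch.sum(costs * probs)      # expected cost under the current policy
loss = -torch.log(probs[action]) * (reward - baseline)
loss.backward()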
Example #23
0
attention=AttentionLayer()
test_model=load_model(out_name+'_best_model',custom_objects={'AttentionLayer':attention})
loss,acc=test_model.evaluate(test_seq,test_label)
print('loss:',loss)
print('acc:',acc)
out=test_model.predict(test_seq)
pred_proba=out[:,1]
pred=np.array([1 if x>0.5 else 0 for x in pred_proba])

acc=metrics.accuracy_score(test_label_,pred)
print(acc)
roc_auc=metrics.roc_auc_score(test_label_,pred_proba)
MCC=metrics.matthews_corrcoef(test_label_, pred)
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(test_label_, pred)
performance_result={
    'acc':[acc],
    'roc_auc':[roc_auc],
    'precision':[precision],
    'recall':[recall],
    'SN':[SN],
    'SP':[SP],
    'GM':[GM],
    'MCC':[MCC],
    'TP':[TP],
    'FP':[FP],
    'TN':[TN],
    'FN':[FN]
}
result={
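The performance() call in this snippet returns precision, recall, SN, SP, GM together with the raw confusion-matrix counts. A plausible minimal implementation with that return shape, assuming binary 0/1 labels and reading SN/SP as sensitivity/specificity and GM as their geometric mean (the project's real function may differ):

import numpy as np

def performance(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    TP = int(((y_pred == 1) & (y_true == 1)).sum())
    TN = int(((y_pred == 0) & (y_true == 0)).sum())
    FP = int(((y_pred == 1) & (y_true == 0)).sum())
    FN = int(((y_pred == 0) & (y_true == 1)).sum())
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    SN = recall = TP / (TP + FN) if (TP + FN) else 0.0   # sensitivity
    SP = TN / (TN + FP) if (TN + FP) else 0.0            # specificity
    GM = (SN * SP) ** 0.5                                # geometric mean of SN and SP
    return precision, recall, SN, SP, GM, TP, TN, FP, FN

print(performance(np.array([1, 0, 1, 1]), np.array([1, 0, 0, 1])))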
Example #24
0
    def test(self):
        #self.merge()
        #self.compress()
        #return
        embedding_size = 100
        for CLUSTER_MIN_SIZE in range(4, 19, 2):
            for dsname in ['webkb']:
                mln = MLN(dsname)
                db = DBManager(dsname, mln)
                print('merge db dom sizes:')
                db.set_doms_atoms(mln, db.merge_db_file)

                cf = common_f()
                #cf.delete_files(mln.pickle_location)
                if dsname == 'er':
                    cf.remove_irrelevant_atoms()

                embedding_size = 300
                print('generating sentences')
                start = time.time()
                cnn_atoms, ntn_atoms = db.pred_atoms, db.pred_atoms
                while True:
                    #cnn_atoms = self.embed(cnn_atoms,mln.pdm,mln.dom_sizes_map,True)
                    ntn_atoms = self.embed(ntn_atoms, mln.pdm,
                                           mln.dom_sizes_map, False)

                sg = None
                if dsname == 'review':

                    return
                    #end = time.time()
                    #print('Time : ',end-start)
                else:
                    sg = sentence_generator(mln.pdm, db.pred_atoms,
                                            db.TEST_SIZE, db)
                    #print('calling w2v')
                    #wv = word2vec_cnn()
                    #print('making images')
                    #wv.make_images(sg.sentences,mln.pdm,db.pred_atoms,mln.dom_sizes_map,dsname,sg.train_atoms,sg.test_atoms,db.TEST_SIZE)

                cor = corrupt(dsname, db.pred_atoms, mln.pdm, db.dom_objs_map,
                              sg.sentences)
                return
                bmf = bmf_cluster(dsname)
                bmf.cluster(db, 1, mln.pdm, dom_obj_map)

                print('original db dom sizes(after compression):')
                orig_dom_objs_map = db.get_dom_objs_map(mln, mln.orig_db_file)
                CLUSTER_MIN_SIZE = 10
                w2v = word2vec(dsname, db, CLUSTER_MIN_SIZE, embedding_size)
                print('w2v cluster dom sizes:')
                w2v_dom_objs_map = db.get_dom_objs_map(
                    mln, w2v.w2v__cluster_db_file)
                cr = cf.calculate_cr(orig_dom_objs_map, w2v_dom_objs_map)

                print('cr : ' + str(cr))
                rc = random_cluster(dsname)
                rc.generate_random_db(db, w2v.pred_atoms_reduced_numbers, mln,
                                      w2v_dom_objs_map)
                print('random cluster dom sizes')
                db.get_dom_objs_map(mln, mln.random__cluster_db_file)

                kmc = kmeans_cluster(dsname)
                kmc.cluster(db, str(cr), mln.pdm, w2v_dom_objs_map,
                            mln.dom_pred_map)
                print('kmeans cluster dom sizes:')
                kmeans_dom_objs_map = db.get_dom_objs_map(
                    mln, kmc.kmeans__cluster_db_file)
                mln.create_magician_mln()
                magician(dsname, mln)
                #tuffy(dsname)
                orig_meta_map = {}

                orig_meta_map['bmf'] = bmf.bmf_orig_meta_map
                orig_meta_map['w2v'] = w2v.w2v_orig_meta_map
                orig_meta_map['random'] = rc.rand_orig_meta_map
                orig_meta_map['kmeans'] = kmc.kmeans_orig_meta_map
                print('Dataset : ' + dsname + '; CR : ' + str(cr))
                p = performance(dsname, embedding_size)
                p.compare_marginal(mln, orig_meta_map, cr)
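The kmeans_cluster step above groups domain objects by their learned embeddings before the compressed database is written out. A standalone illustration of that idea with scikit-learn on random vectors, assuming scikit-learn is available (this is not the project's kmeans_cluster class):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
embeddings = rng.rand(40, 100)            # 40 domain objects, 100-d embeddings
labels = KMeans(n_clusters=8, n_init=10, random_state=0).fit_predict(embeddings)
# Objects sharing a label would be merged into a single cluster-object in the compressed DB.
print(labels)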
Example #25
0
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top.pair"
    print >> sys.stderr,"Read model from ",best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    network_model = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda()
    net_copy(network_model,best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top.ana"
    print >> sys.stderr,"Read model from ",best_network_file
    best_network_model = torch.load(best_network_file)

    ana_network = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda()
    net_copy(ana_network,best_network_model)

    reduced=""
    if args.reduced == 1:
        reduced="_reduced"

    print >> sys.stderr,"prepare data for train ..."
    train_docs_iter = DataReader.DataGnerater("train"+reduced)
    print >> sys.stderr,"prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev"+reduced)
    test_docs_iter = DataReader.DataGnerater("test"+reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter,network_model,ana_network) 
    print "Average:",metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter,network_model,ana_network) 
    print "Average:",metric["average"]
    print "***"
    print
    sys.stdout.flush()

    l2_lambda = 1e-6
    #lr = 0.00001
    #lr = 0.000005
    lr = 0.000002
    #lr = 0.0000009
    dropout_rate = 0.5
    shuffle = True
    times = 0

    reinforce = True

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()
    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps = 1e-6)
    ana_optimizer = optim.RMSprop(ana_network.parameters(), lr=lr, eps = 1e-6)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)
    ana_scheduler = lr_scheduler.StepLR(ana_optimizer, step_size=15, gamma=0.5)
   
    for echo in range(30):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:",echo

        scheduler.step()
        ana_scheduler.step()

        train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')

        docs_by_id = {doc.did: doc for doc in train_docs}
       
        print >> sys.stderr,"Link docs ..."
        tmp_data = []
        path = []
        for data in train_docs_iter.rl_case_generater(shuffle=True):
            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,0.0)
            ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0)
            ana_pair_output, ana_pair_score = ana_network.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents, 0.0)

            reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))

            scores_reindex = torch.transpose(torch.cat((pair_score,ana_score),1),0,1)[reindex]
            ana_scores_reindex = torch.transpose(torch.cat((ana_pair_score,ana_score),1),0,1)[reindex]

            doc = docs_by_id[rl['did']]

            for s,e in zip(rl["starts"],rl["ends"]):
                score = score_softmax(torch.transpose(ana_scores_reindex[s:e],0,1)).data.cpu().numpy()[0]
                pair_score = score_softmax(torch.transpose(scores_reindex[s:e-1],0,1)).data.cpu().numpy()[0]

                ana_action = utils.sample_action(score)
                if ana_action == (e-s-1):
                    action = ana_action
                else:
                    pair_action = utils.sample_action(pair_score*score[:-1])
                    action = pair_action
                path.append(action)
                link = action
                m1, m2 = rl['ids'][s + link]
                doc.link(m1, m2)

            tmp_data.append((mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return))
                
            if rl["end"] == True:
                doc = docs_by_id[rl['did']]
                reward = doc.get_f1()
                inside_index = 0
                for mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return in tmp_data:

                    for (start, end) in zip(rl['starts'], rl['ends']):
                        ids = rl['ids'][start:end]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][start:end]
                        for ant_ind in range(end - start):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0], ana, hypothetical=True, beta=1)
                        doc.link(old_ant, ana) 

                    cost = 0.0
                    mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
        
                    ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
                    ana_pair_output, ana_pair_score = ana_network.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,dropout_rate)
        
                    reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
        
                    ana_scores_reindex = torch.transpose(torch.cat((ana_pair_score,ana_score),1),0,1)[reindex]
        
                    ana_optimizer.zero_grad()
                    ana_loss = None
                    i = inside_index
                    for s,e in zip(rl["starts"],rl["ends"]):
                        costs = rl["costs"][s:e]
                        costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                        score = torch.squeeze(score_softmax(torch.transpose(ana_scores_reindex[s:e],0,1)))
                        baseline = torch.sum(score*costs) 

                        action = path[i]
                        this_cost = torch.log(score[action])*-1.0*(reward-baseline)
                        
                        if ana_loss is None:
                            ana_loss = this_cost
                        else:
                            ana_loss += this_cost
                        i += 1
                    ana_loss.backward()
                    torch.nn.utils.clip_grad_norm(ana_network.parameters(), 5.0)
                    ana_optimizer.step()
        
                    mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
        
                    anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
        
                    output, pair_score = network_model.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,dropout_rate)
        
                    ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
        
                    reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
        
                    scores_reindex = torch.transpose(torch.cat((pair_score,ana_score),1),0,1)[reindex]
        
                    pair_loss = None
                    optimizer.zero_grad()
                    i = inside_index
                    index = 0
                    for s,e in zip(rl["starts"],rl["ends"]):
                        action = path[i]
                        if (not (action == (e-s-1))) and (anaphoricity_target[index] == 1):
                            costs = rl["costs"][s:e-1]
                            costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                            score = torch.squeeze(score_softmax(torch.transpose(scores_reindex[s:e-1],0,1)))
                            baseline = torch.sum(score*costs)
                            this_cost = torch.log(score[action])*-1.0*(reward-baseline)
                            if pair_loss is None:
                                pair_loss = this_cost
                            else:
                                pair_loss += this_cost
                        i += 1
                        index += 1
                    if pair_loss is not None:
                        pair_loss.backward()
                        torch.nn.utils.clip_grad_norm(network_model.parameters(), 5.0)
                        optimizer.step()
                    inside_index = i

                tmp_data = []
                path = []
                        
        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)
        print >> sys.stderr, "cost:",cost
        print >> sys.stderr,"save model ..."
        torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)
        
        print "DEV"
        metric = performance.performance(dev_docs_iter,network_model,ana_network) 
        print "Average:",metric["average"]
        print "DEV Ana: ",metric["ana"]
        print "TEST"
        metric = performance.performance(test_docs_iter,network_model,ana_network) 
        print "Average:",metric["average"]
        print "TEST Ana: ",metric["ana"]
        print

        sys.stdout.flush()
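utils.sample_action above draws an action index in proportion to the softmaxed scores, so higher-scoring antecedents are linked more often while exploration is preserved. An illustrative stand-in for that helper (the project's implementation may differ):

import numpy as np

def sample_action(probs):
    # Normalize a possibly unnormalized probability vector and sample an index from it.
    probs = np.asarray(probs, dtype=np.float64)
    probs = probs / probs.sum()
    return int(np.random.choice(len(probs), p=probs))

print(sample_action([0.1, 0.7, 0.2]))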