예제 #1
0
def objective_function_veracity_branchLSTM_fullPHEME(params):
    """Train on seven PHEME events and score on the held-out 'charliehebdo'.

    Per-event arrays are loaded from ``saved_data_fullPHEME``; every training
    array is padded/truncated to ``max_branch_len`` with ``pad_sequences``
    before the events are concatenated. The model returned by
    ``heteroscedastic_model`` is fitted and its predictions on the test event
    are aggregated with ``branch2treelabels`` and scored with macro F1.

    Args:
        params: Hyper-parameter mapping. This function reads
            ``params['mb_size']`` (batch size) and ``params['num_epochs']``;
            the whole mapping is also forwarded to ``heteroscedastic_model``.

    Returns:
        dict with keys ``'loss'`` (``1 - macro F1``), ``'Params'`` (the
        ``params`` argument) and ``'status'`` (``STATUS_OK``).
    """
    path = 'saved_data_fullPHEME'
    train = [
        'ebola-essien', 'ferguson', 'gurlitt', 'ottawashooting',
        'prince-toronto', 'putinmissing', 'sydneysiege'
    ]
    test = 'charliehebdo'
    max_branch_len = 25
    # Single source of truth for the label width: used for both the one-hot
    # targets and the model output, so they cannot disagree when the highest
    # class happens to be absent from the training labels.
    num_classes = 3

    x_train = []
    y_train = []
    for event in train:
        event_x = np.load(os.path.join(path, event, 'train_array.npy'))
        event_y = np.load(os.path.join(path, event, 'labels.npy'))

        # Bring every event to the same sequence length so they can be
        # stacked into one array below.
        event_x = pad_sequences(event_x,
                                maxlen=max_branch_len,
                                dtype='float32',
                                padding='post',
                                truncating='post',
                                value=0.)

        x_train.extend(event_x)
        y_train.extend(event_y)

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # NOTE(review): the test array is used as stored, without pad_sequences;
    # presumably it already has a compatible sequence length - confirm.
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    y_train = to_categorical(y_train, num_classes=num_classes)

    model = heteroscedastic_model(x_train, y_train, params,
                                  output_classes=num_classes)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    # The model is fitted against two targets, both the one-hot labels.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test,
                                     batch_size=mb_size,
                                     verbose=False)
    # The second model output is the one treated as class probabilities.
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    # Only the aggregated predictions and labels are needed for scoring.
    _, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)

    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    return {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
예제 #2
0
def objective_function_branchLSTM_Twitter15(params):
    """Train on fold 0 of the saved Twitter15 data and score on its test split.

    Arrays are loaded from ``preprocessing/saved_data_15/0/{train,test}``.
    The training array is padded/truncated to ``max_branch_len`` with
    ``pad_sequences``; the model returned by ``heteroscedastic_model`` is
    fitted, and its test predictions are aggregated with
    ``branch2treelabels`` and scored with macro F1.

    Args:
        params: Hyper-parameter mapping. This function reads
            ``params['mb_size']`` (batch size) and ``params['num_epochs']``;
            the whole mapping is also forwarded to ``heteroscedastic_model``.

    Returns:
        dict with keys ``'loss'`` (``1 - macro F1``), ``'Params'`` (the
        ``params`` argument) and ``'status'`` (``STATUS_OK``).
    """
    path = 'preprocessing/saved_data_15'

    # Fold 0 is the development set.
    train = '0/train'
    test = '0/test'

    max_branch_len = 25
    # Single source of truth for the label width: used for both the one-hot
    # targets and the model output, so they cannot disagree when the highest
    # class happens to be absent from the training labels.
    num_classes = 4

    raw_x_train = np.load(os.path.join(path, train, 'train_array.npy'))
    y_train = np.load(os.path.join(path, train, 'labels.npy'))

    # Pad (or truncate) every sequence to the fixed length max_branch_len.
    x_train = pad_sequences(raw_x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)

    # NOTE(review): the test array is used as stored, without pad_sequences;
    # presumably it already has a compatible sequence length - confirm.
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    y_train = to_categorical(y_train, num_classes=num_classes)

    model = heteroscedastic_model(x_train, y_train, params,
                                  output_classes=num_classes)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    # The model is fitted against two targets, both the one-hot labels.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test,
                                     batch_size=mb_size,
                                     verbose=False)
    # The second model output is the one treated as class probabilities.
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    # Only the aggregated predictions and labels are needed for scoring.
    _, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)

    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    return {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
예제 #3
0
def eval_veracity_LSTM_CV(params, dataset='15'):
    """Train and evaluate one model per pre-split fold and save the results.

    For each fold ``'0'``..``'4'`` under ``preprocessing/saved_data_<dataset>``
    a model is built with ``heteroscedastic_model``, fitted on the fold's
    train split and run through ``models.predict_on_data``. The following
    files are written per fold (``<f>`` is the fold name):

    * ``output/model<f>.h5`` - full saved model
    * ``output/my_model_architecture<f>.h5`` - ``model.to_json()`` dumped via
      ``json.dump`` (JSON text despite the ``.h5`` extension)
    * ``output/my_model_weights<f>.h5`` - model weights
    * ``output/predictions<f>.pkl`` - pickled predictions dict
    * ``output/eval_info<f>.pkl`` - pickled ``{'test': eval_branches(...)}``

    Args:
        params: Hyper-parameter mapping. This function reads
            ``params['num_epochs']`` and ``params['mb_size']``; the whole
            mapping is also forwarded to ``heteroscedastic_model`` and
            ``models.predict_on_data``.
        dataset: Suffix appended to ``'preprocessing/saved_data_'`` to locate
            the input directory.

    Returns:
        None. Results are only written to disk.
    """
    path = 'preprocessing/saved_data_' + dataset

    folds = ['0', '1', '2', '3', '4']

    num_epochs = params['num_epochs']
    mb_size = params['mb_size']

    for f in folds:

        print(f)
        test = f + '/test'
        train = f + '/train'

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        y_test = np.load(os.path.join(path, test, 'labels.npy'))
        ids_test = np.load(os.path.join(path, test, 'ids.npy'))

        predictions_train = []

        x_train = np.load(os.path.join(path, train, 'train_array.npy'))
        y_train = np.load(os.path.join(path, train, 'labels.npy'))
        y_train = to_categorical(y_train, num_classes=4)
        ids_train = np.load(os.path.join(path, train, 'ids.npy'))

        model = heteroscedastic_model(x_train, y_train, params,
                                      output_classes=4)
        # The model is fitted against two targets, both the one-hot labels.
        model.fit(x_train, [y_train, y_train], batch_size=mb_size,
                  epochs=num_epochs, shuffle=False, class_weight=None)

        # NOTE(review): num_classes=3 is passed here although the model and
        # the one-hot labels above use 4 classes - confirm against
        # models.predict_on_data before changing.
        predictions = models.predict_on_data(model, params, x_train, y_train,
                                             x_test, y_test, num_classes=3,
                                             verbose=True)

        tree_results_train = branch2tree(ids_train, predictions['train'])
        predictions['train']['tree_results'] = tree_results_train
        predictions_train.append(predictions['train'])

        # Persist the model three ways: whole model, architecture, weights.
        filename = 'output/model' + f + '.h5'
        model.save(filename)
        json_string = model.to_json()
        with open('output/my_model_architecture' + f + '.h5', 'w') as fout:
            json.dump(json_string, fout)

        model.save_weights('output/my_model_weights' + f + '.h5')

        tree_results_test = branch2tree(ids_test, predictions['test'])

        predictions['test']['tree_results'] = tree_results_test
        # NOTE(review): this replaces the tree results stored above with the
        # predictions_train list, which itself contains predictions['train'];
        # kept as-is because the pickled layout may be relied on downstream.
        predictions['train']['tree_results'] = predictions_train

        # Context managers guarantee the files are closed even if pickling
        # raises part-way through.
        filename = 'output/predictions' + f + '.pkl'
        with open(filename, "wb") as fout:
            pickle.dump(predictions, fout)

        eval_info_test = eval_branches(tree_results_test)

        eval_info = {}
        eval_info['test'] = eval_info_test

        filename = 'output/eval_info' + f + '.pkl'
        with open(filename, "wb") as fout:
            pickle.dump(eval_info, fout)
예제 #4
0
def eval_veracity_LSTM_CV(params, branch=True):
    #%%
    if branch:
        path = 'saved_data_fullPHEME'
    else:
        path = 'saved_data_timelinefullPHEME'

    folds = [
        'ebola-essien', 'ferguson', 'gurlitt', 'ottawashooting',
        'prince-toronto', 'putinmissing', 'sydneysiege', 'charliehebdo',
        'germanwings-crash'
    ]

    num_epochs = params['num_epochs']
    mb_size = params['mb_size']

    for number in range(len(folds)):

        x_temp = np.load(os.path.join(path, 'ebola-essien', 'train_array.npy'))
        y_temp = np.load(os.path.join(path, 'ebola-essien', 'labels.npy'))
        model = heteroscedastic_model(x_temp, y_temp, params, output_classes=3)
        #        del x_temp

        print(number)
        test = folds[number]
        train = deepcopy(folds)
        del train[number]

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        y_test = np.load(os.path.join(path, test, 'labels.npy'))
        ids_test = np.load(os.path.join(path, test, 'ids.npy'))

        predictions_train = []

        for t in train:
            x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            y_train = np.load(os.path.join(path, t, 'labels.npy'))
            y_train = to_categorical(y_train, num_classes=3)
            ids_train = np.load(os.path.join(path, t, 'ids.npy'))

            model.fit(x_train, [y_train, y_train],
                      batch_size=mb_size,
                      epochs=num_epochs,
                      shuffle=False,
                      class_weight=None)

            predictions = models.predict_on_data(model,
                                                 params,
                                                 x_train,
                                                 y_train,
                                                 x_test,
                                                 y_test,
                                                 num_classes=3,
                                                 verbose=True)

            tree_results_train = branch2tree(ids_train, predictions['train'])

            predictions['train']['tree_results'] = tree_results_train

            predictions_train.append(predictions['train'])

        filename = 'output/model' + str(test) + '.h5'
        model.save(filename)
        json_string = model.to_json()
        with open('output/my_model_architecture' + str(test) + '.h5',
                  'w') as f:
            json.dump(json_string, f)

        model.save_weights('output/my_model_weights' + str(test) + '.h5')
        # I need to improve this

        tree_results_test = branch2tree(ids_test, predictions['test'])

        predictions['test']['tree_results'] = tree_results_test
        predictions['train']['tree_results'] = predictions_train

        filename = 'output/predictions' + str(test) + '.pkl'
        f = open(filename, "wb")
        pickle.dump(predictions, f)
        f.close()

        eval_info_test = eval_branches(tree_results_test)
        eval_info = {}
        eval_info['test'] = eval_info_test
        filename = 'output/eval_info' + str(test) + '.pkl'
        f = open(filename, "wb")
        pickle.dump(eval_info, f)
        f.close()