Example #1
def main(argv):
    # Configure the CPU/GPU in TF; assume only one GPU is in use.
    # For multi-GPU setups, see
    #   https://www.tensorflow.org/guide/gpu#using_multiple_gpus

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if not gpus or FLAGS.gpu_id is None:
        device_id = '/device:CPU:0'
    else:
        tf.config.experimental.set_visible_devices(gpus[FLAGS.gpu_id], 'GPU')
        device_id = '/device:GPU:0'

    A_mat, X_mat, z_vec, train_idx, val_idx, test_idx = load_data(FLAGS.dataset)
    An_mat = preprocess_graph(A_mat)

    N = A_mat.shape[0]
    K = z_vec.max() + 1

    with tf.device(device_id):
        gcn = GCN(An_mat, X_mat, [FLAGS.hidden1, K])
        gcn.train(train_idx, z_vec[train_idx], val_idx, z_vec[val_idx])
        test_res = gcn.evaluate(test_idx, z_vec[test_idx], training=False)
        # gcn = GCN(An_mat_diag, X_mat_stack, [FLAGS.hidden1, K])
        # gcn.train(train_idx_recal, z_vec[train_idx], val_idx_recal, z_vec[val_idx])
        # test_res = gcn.evaluate(test_idx_recal, z_vec[test_idx], training=False)
        print("Dataset {}".format(FLAGS.dataset),
              "Test loss {:.4f}".format(test_res[0]),
              "test acc {:.4f}".format(test_res[1]))
Example #2
from sklearn.decomposition import PCA
from models.utils import load_data, plot_prediction, print_stats
from models.positive_models import PositiveLinearRegression

if __name__ == '__main__':
    n_components = 8

    data_len, weeks, ts, xs = load_data()

    means, std_devs = [], []

    # Reduce the features to n_components dimensions with PCA
    pca = PCA(n_components=n_components)
    xs_pca = pca.fit_transform(xs)

    # Fit a linear model constrained to positive coefficients on the PCA features
    linear_model = PositiveLinearRegression()
    linear_model.fit(xs_pca, ts)

    # Report and plot the fit
    mean, std_dev = print_stats(xs_pca, ts, linear_model)
    plot_prediction(weeks, xs_pca, ts, linear_model)

    means.append(mean)
    std_devs.append(std_dev)
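# PositiveLinearRegression comes from the project's own models.positive_models module,
# whose implementation is not shown here. A minimal sketch of one way a positivity-
# constrained linear regression could be written (using scipy.optimize.nnls; this is
# purely illustrative, not the project's actual class):
import numpy as np
from scipy.optimize import nnls


class PositiveLinearRegressionSketch:
    """Least-squares regression with non-negative coefficients (illustration only)."""

    def fit(self, X, y):
        # Append a bias column so an intercept is fitted as well (kept non-negative here).
        A = np.hstack([np.asarray(X), np.ones((len(X), 1))])
        coef, _ = nnls(A, np.asarray(y))
        self.coef_, self.intercept_ = coef[:-1], coef[-1]
        return self

    def predict(self, X):
        return np.asarray(X) @ self.coef_ + self.intercept_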
Example #3
"""Train an MLP regressor.

Usage:
  ./train.py -i <filename> -a <r> -n <n> -l <n>
  ./train.py --help

Options:
  -i <filename>  Instance filename (data)
  -a <r>         Alpha (regularization penalty)
  -n <n>         Neurons per hidden layer
  -l <n>         Number of hidden layers
  --help         Show this screen.
"""
from docopt import docopt
from models.utils import load_data, stats
from sklearn.neural_network import MLPRegressor


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Read the command-line parameters
    alpha = float(opts['-a'])
    hidden = int(opts['-l'])
    neurons = int(opts['-n'])
    filename = opts['-i']

    data_len, weeks, ts, xs = load_data(filename=filename)

    MLPR_model = MLPRegressor(hidden_layer_sizes=(neurons,) * hidden,
                              solver='lbfgs', activation='logistic', alpha=alpha)
    MLPR_model.fit(xs, ts)

    mean, std_dev = stats(xs, ts, MLPR_model)
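# Example invocation of the script above (the file name and hyperparameter values are
# illustrative, not taken from the original project):
#   ./train.py -i data/instances.csv -a 0.001 -n 20 -l 2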
Example #4
def explained_variance():
    # explained variance of embeddings
    d = './results/pretrained_embeddings/'

    metric = r'Sensitivity'  # Change to Specificity, YI, YI_max, etc.
    k = 0  # Must correspond to the metric. See load_results in analyse_results.py.
    mt = 'simple'

    vs = []
    evs = []

    for j, s in enumerate(['none', 'chemical', 'species', 'both']):
        tmp1, tmp2 = [], []
        for model1, model2 in product(models, models):

            X, y = load_data('./data/%s_data_test.csv' % s)
            y = np.asarray(y)

            f = d + model1 + '_chemical_entity_embeddings.npy'
            X1 = np.load(f)
            f = d + model1 + '_chemical_ids.npy'
            ids1 = dict(np.load(f))

            f = d + model2 + '_taxonomy_entity_embeddings.npy'
            X2 = np.load(f)
            f = d + model2 + '_taxonomy_ids.npy'
            ids2 = dict(np.load(f))

            X = np.asarray([
                np.concatenate([X1[int(ids1[c])], X2[int(ids2[s])], [conc]],
                               axis=0) for c, s, conc in X
                if c in ids1 and s in ids2
            ])
            X = normalize(X, norm='l2', axis=0)  # normalize over each feature

            pca = PCA(n_components=10)
            pca.fit(X)
            ev = sum(pca.explained_variance_ratio_)

            f = 'results/%s_%s_pretrained_%s_%s.csv' % (s, mt, model1, model2)
            p = load_predictions(
                f.replace('/', '/predictions_').replace('csv', 'npy'))

            v = p['value'][k]

            tmp1.append(ev)
            tmp2.append(v)

        evs.append(tmp1)
        vs.append(tmp2)

    colours = ['red', 'blue', 'green', 'black']
    labels = [r'$\it{(i)}$', r'$\it{(ii)}$', r'$\it{(iii)}$', r'$\it{(iv)}$']
    plt.figure(figsize=(10, 10))
    for i in range(4):
        x = evs[i]
        y = vs[i]
        coeffs, diagnostics = poly.polyfit(x, y, 1, full=True)
        ss_res = diagnostics[0][0]  # sum of squared residuals of the fit (not an R^2)
        plt.scatter(x, y, color=colours[i])
        plt.plot(np.unique(x),
                 np.poly1d(coeffs[::-1])(np.unique(x)),
                 color=colours[i],
                 linewidth=4,
                 label=labels[i])

    plt.xlabel('Explained variance', fontsize=18)
    plt.ylabel(metric, fontsize=18)
    plt.legend(fontsize=18)

    plt.savefig('./plots/%s_ev_vs_%s.png' % (mt, metric))
Example #5
# -*- coding: utf-8 -*-
# @Time    : 2018/8/6 23:49
# @Author  : quincyqiang
# @File    : 01_memorization_baseline.py
# @Software: PyCharm
from models.utils import load_data

# 1 Load the data
ner_dataset_dir = '../data/ner_dataset.csv'
data = load_data(ner_dataset_dir)


# 2 Construct the sentence data
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        try:
            s = self.data[self.data['Sentence #'] == "Sentence: {}".format(
                self.n_sent)]
            self.n_sent += 1
            return s['Word'].tolist(), s['POS'].tolist(), s['Tag'].tolist()
        except Exception:
            # No more sentences can be retrieved
            self.empty = True
            return None, None, None


# getter=SentenceGetter(data)
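# A brief usage sketch for SentenceGetter, expanding on the commented-out line above
# (illustrative only):
# getter = SentenceGetter(data)
# sentences = []
# words, pos_tags, ner_tags = getter.get_next()
# while words:                      # an empty list signals that no sentences remain
#     sentences.append(list(zip(words, pos_tags, ner_tags)))
#     words, pos_tags, ner_tags = getter.get_next()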
Example #6
def main(args, params):
    
    # Per-group (valid_size, test_size) values chosen to approximate a 0.15/0.15/0.70
    # valid/test/train split of the total data when splitting on chemicals/species.
    sizes = {'none': (0.225, 0.225), 'species': (0.22, 0.24),
             'chemical': (0.23, 0.23), 'both': (0.45, 0.47)}
    SAMPLING = args.sampling
    if args.CREATE_DATA:
        valid_size, test_size = sizes[SAMPLING]

        X, y = load_data(DATA_FILE)
        train, valid, test = train_test_split_custom(
            X, y,
            valid_size=valid_size,
            test_size=test_size,
            sampling=SAMPLING,
            random_state=RANDOM_SEED)
        total = sum(map(len, [train[1], valid[1], test[1]]))
        print(len(valid[1]) / total, len(test[1]) / total)
        save_data('data/%s_data_train.csv' % SAMPLING, train)
        save_data('data/%s_data_valid.csv' % SAMPLING, valid)
        save_data('data/%s_data_test.csv' % SAMPLING, test)
    try:
        train = load_data('data/%s_data_train.csv' % SAMPLING)
        valid = load_data('data/%s_data_valid.csv' % SAMPLING)
        test = load_data('data/%s_data_test.csv' % SAMPLING)
        total = sum(map(len, [train[1], valid[1], test[1]]))
        print('Train split', len(train[1]) / total)
        print('Valid split', len(valid[1]) / total)
        print('Test split', len(test[1]) / total)
        # Oversample the minority class in the training data, then shuffle every split
        oversample = RandomOverSampler(sampling_strategy='minority')
        train = oversample.fit_resample(*train)
        train = shuffle(*train)
        test = shuffle(*test)
        valid = shuffle(*valid)
        
    except (FileNotFoundError, IOError):
        # The split files do not exist yet; create them and retry
        args.CREATE_DATA = True
        return main(args, params)
    
    
    params['cw'] = None
    
    if args.SIMPLE:
        SAMPLING += '_simple'
    else:
        SAMPLING += '_complex'
    
    if args.model == "onehot":
        fit_onehot(train, valid, test,
                   results_file='results/%s_one_hot.csv' % SAMPLING, 
                   hp_file = 'pred_hp/%s_one_hot.csv' % SAMPLING,
                   params=params)
        
    if args.model == "hier": 
        fit_hier_embeddings(train, valid, test,
                            chemical_hier_embeddings_files,
                            taxonomy_hier_embeddings_files,
                            results_file='results/%s_hierarchy_embedding.csv' % SAMPLING,
                            hp_file='pred_hp/%s_hierarchy_embedding.csv' % SAMPLING,
                            params=params)
        
 
    if args.model == "pretrained":
        for model1 in models:
            for model2 in models:
                fit_pretrained(train, valid, test,
                               KGE_EMBEDDINGS_DIR + model1,
                               KGE_EMBEDDINGS_DIR + model2,
                               results_file='results/%s_pretrained_%s_%s.csv' % (SAMPLING, model1, model2),
                               hp_file='pred_hp/%s_pretrained_%s_%s.csv' % (SAMPLING, model1, model2),
                               params=params)
            
    if args.model == "allpretrained":
        fit_pretrained(train, valid, test,
                            [KGE_EMBEDDINGS_DIR+m for m in models],
                            [KGE_EMBEDDINGS_DIR+m for m in models],
                            results_file='results/%s_all_pretrained_' % SAMPLING+'.csv',
                            hp_file='pred_hp/%s_all_pretrained_' % SAMPLING +'.csv',
                            params=params)
        
    # Select the best model pairs from the pretrained runs and rerun them using the sim embedding.
    if args.model in ['pretrainedensemble', 'sim']:
        best_models_auc = {}
        for model1 in models:
            for model2 in models:
                df = pd.read_csv('results/%s_pretrained_%s_%s.csv' % (SAMPLING, model1, model2),
                                 index_col='metric')
                best_models_auc[(model1, model2)] = df.loc['ba', 'value']

        # Rank the model pairs by the 'ba' metric, best first
        best_models_auc = sorted(best_models_auc.items(), key=lambda x: x[1], reverse=True)

    if args.model == "sim":
        m,_ = best_models_auc[args.num_models-1]
        model1, model2 = m
        if args.MAX_TRIALS < 1:
            hp_file = "sim_hp/%s_joint_finetune_" % SAMPLING + model1+"_"+model2+".csv.json"
        else:
            hp_file = None
            
        # Collect hyperparameters from the pretrained KG runs, if available
        hps = {}
        try:
            with open('pretrained_hp/%s_chemical_kg.json' % model1, 'r') as f:
                tmp = json.load(f)
                for k in tmp:
                    hps[k + '1'] = tmp[k]
        except (FileNotFoundError, json.JSONDecodeError):
            pass
        try:
            with open('pretrained_hp/%s_taxonomy_kg.json' % model2, 'r') as f:
                tmp = json.load(f)
                for k in tmp:
                    hps[k + '2'] = tmp[k]
        except (FileNotFoundError, json.JSONDecodeError):
            pass

        try:
            with open('pred_hp/%s_pretrained_%s_%s.csv' % (SAMPLING, model1, model2), 'r') as f:
                tmp = json.load(f)
                hps = {**hps, **tmp}
        except (FileNotFoundError, json.JSONDecodeError):
            pass
        
        if hp_file:
            try:
                with open(hp_file, 'r') as f:
                    tmp = json.load(f)
                    hps = {**hps, **tmp}
            except (FileNotFoundError, json.JSONDecodeError):
                print(model1, model2, 'Missing HP file. Using defaults.')

        params['use_pretrained'] = args.USE_PRETRAINED
        if not args.USE_PRETRAINED:
            SAMPLING += '_non_init'

        fit_sim_model(train, valid, test,
                      model1,
                      model2,
                      results_file='results/%s_joint_finetune_%s_%s.csv' % (SAMPLING, model1, model2),
                      embedding_file='sim_embeddings/%s_joint_finetune_%s_%s' % (SAMPLING, model1, model2),
                      hps=hps,
                      params=params)
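# A minimal sketch of a command-line entry point that supplies the attributes main()
# reads from `args`. The attribute names mirror the code above; the defaults, choices,
# and the empty params dict are illustrative assumptions, not taken from the original project.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--sampling', default='none',
                        choices=['none', 'chemical', 'species', 'both'])
    parser.add_argument('--model', default='onehot',
                        choices=['onehot', 'hier', 'pretrained', 'allpretrained',
                                 'pretrainedensemble', 'sim'])
    parser.add_argument('--CREATE_DATA', action='store_true')
    parser.add_argument('--SIMPLE', action='store_true')
    parser.add_argument('--USE_PRETRAINED', action='store_true')
    parser.add_argument('--MAX_TRIALS', type=int, default=20)
    parser.add_argument('--num_models', type=int, default=1)
    args = parser.parse_args()

    params = {}  # model hyperparameter defaults would be filled in here
    main(args, params)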