Example #1
def task3():
    sgd_params = {
        'GMM': {
            'alpha': 1.0,
            'mb_num': 200
        },
        'Peaks': {
            'alpha': 0.1,
            'mb_num': 250
        },
        'SwissRoll': {
            'alpha': 0.08,
            'mb_num': 250
        }
    }

    for data_set, params in sgd_params.items():
        X_tr, y_tr, X_te, y_te = get_data(data_set)
        _ = sgd(X_tr,
                y_tr,
                X_te,
                y_te,
                alpha=params['alpha'],
                mb_num=params['mb_num'],
                max_epochs=200,
                data_set=data_set)
Example #2
 def feature_to_vector(self):
     """
     Create and initialize one-hot vectors: read the medicine data from the CSV file,
     then convert the efficacy features into one-hot vectors.
     :return:
     """
     series = get_data(medicine_path)
     # print("series", series)
     root_2_word = root_to_word(thesaurus_path)  # mapping from synonym roots to words
     # print("root_2_word", len(root_2_word), root_2_word)
     word_2_root = word_to_root(thesaurus_path)  # mapping from words to synonym roots
     # print("word_2_root", len(word_2_root), word_2_root)
     # Create and initialize a DataFrame to store the one-hot vectors; the columns are indexed by the synonym roots
     self.df = pd.DataFrame(np.zeros((len(series), len(root_2_word))),
                            columns=root_2_word.keys())
     for indexs in series.index:  # NaN values were dropped from series, so the index is not contiguous; iterate over series.index directly
         item_str = series[indexs]
         if item_str == '':
             continue
         # item_list = item_str.strip().split()  # for symptom data separated by whitespace
         item_list = re.split("、", item_str)  # for efficacy data separated by "、"
         for item in item_list:
             if item in word_2_root:
                 # Find the root of each efficacy feature word and set the corresponding one-hot entry
                 self.df.loc[indexs, word_2_root[item]] = 1
             else:
                 print(item)  # print unmatched words for manual handling
     # Drop roots that were never matched
     max_value = self.df.max()  # column-wise maximum of df
     # print("max_value:", max_value)
     drop_list = list(
         max_value[max_value == 0].index)  # columns whose max is 0, i.e. roots that never appeared
     # print("drop_list:", len(drop_list))
     self.df = self.df.drop(drop_list, axis=1)  # drop the roots that never appeared
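The root_to_word and word_to_root helpers above are project utilities that are not shown in this example. A minimal sketch of what they might look like, assuming each line of the thesaurus file lists a root followed by its synonym words separated by whitespace (the real file format may differ):

def root_to_word(path):
    # Hypothetical reader: {root: [synonym, ...]} from a whitespace-separated thesaurus.
    mapping = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if parts:
                mapping[parts[0]] = parts[1:]
    return mapping


def word_to_root(path):
    # Inverse mapping: {synonym: root}.
    mapping = {}
    for root, words in root_to_word(path).items():
        for word in words:
            mapping[word] = root
    return mapping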
Example #3
def evaluate(model, batch):
    """
    Evaluate the model on one batch of data
    :return: mean F1 score over the batch
    """
    # Here goes one batch of training
    q_seq, q_mask, d_seq, d_mask, target_span = get_data(
        batch,
        config.mode.lower() == 'train')
    with torch.no_grad():
        # The loss is individual loss for each pair of question, context and answer
        loss, start_pos_pred, end_pos_pred = model(q_seq, q_mask, d_seq,
                                                   d_mask, target_span)
        start_pos_pred = start_pos_pred.tolist()
        end_pos_pred = end_pos_pred.tolist()
        f1 = 0
        for i, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(start_pos_pred, end_pos_pred, batch.ans_tokens)):
            pred_ans_tokens = batch.context_tokens[i][
                pred_ans_start:pred_ans_end + 1]
            prediction = " ".join(pred_ans_tokens)
            ground_truth = " ".join(true_ans_tokens)
            f1 += f1_score(prediction, ground_truth)
        f1 = f1 / (i + 1)
        return f1
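The f1_score helper used above is not shown; assuming it is the usual SQuAD-style token-overlap F1, a minimal sketch could be:

from collections import Counter


def f1_score(prediction, ground_truth):
    # Token-overlap F1 between two whitespace-tokenized answer strings
    # (a sketch of the standard metric, not this project's own helper).
    pred_tokens = prediction.split()
    true_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(true_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(true_tokens)
    return 2 * precision * recall / (precision + recall)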
Example #4
def load_data(train_size, test_size):
    keep_channels = ['C3']
    trial_len = 1.5

    # X, y = get_data("../data/CLASubjectA1601083StLRHand.mat", trial_len, keep_channels)
    X, y = get_data("../data/CLASubjectB1512153StLRHand.mat", trial_len,
                    keep_channels)

    X = X[y != 3]
    y = y[y != 3]
    # 0 is left hand
    y[y == 1] = 0
    # 1 is right hand
    y[y == 2] = 1
    interval_len = .45
    X = trim_intervals(X, .15, interval_len)

    num_channels = len(keep_channels)
    d2 = np.ceil(num_channels * interval_len / 0.005).astype(int)
    X = X.reshape(642, d2)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        test_size=test_size)

    return X_train, X_test, y_train, y_test
Example #5
    def load_data(self):
        keep_channels = ['C3']
        trial_len = 1.5

        # X, y = get_data("../data/CLASubjectA1601083StLRHand.mat", trial_len, keep_channels)
        # "../data/CLASubjectB1512153StLRHand.mat"
        X, y = get_data(self.filename, trial_len, keep_channels)

        X = X[y != 3]
        y = y[y != 3]
        # 0 is left hand
        y[y == 1] = 0
        # 1 is right hand
        y[y == 2] = 1
        interval_len = .45
        X = trim_intervals(X, .15, interval_len)

        num_channels = len(keep_channels)
        d2 = np.ceil(num_channels * interval_len / 0.005).astype(int)
        X = X.reshape(642, d2)

        if self.features == 'pca':
            X = self.PCA(X).numpy()

        if self.features == 'nn':
            self.get_nn_features()

        self.X = X
        self.y = y
Example #6
def task7():
    nn_sgd_params = {
        'GMM': {
            'layers_nums': (1, 3, 5, 10, 15),
            'alpha': 1.0,
            'mb_num': 200
        },
        'Peaks': {
            'layers_nums': (1, 3, 5, 10, 15),
            'alpha': 1.0,
            'mb_num': 200
        },
        'SwissRoll': {
            'layers_nums': (1, 3, 5, 10),
            'alpha': 1.0,
            'mb_num': 200
        }
    }

    for data_set, params in nn_sgd_params.items():
        X_tr, y_tr, X_te, y_te = get_data(data_set)
        n = X_tr.shape[0]
        layers_nums = params['layers_nums']
        for layers_num in layers_nums:
            layers = [n + 5 * i for i in range(layers_num)]
            _ = nn_sgd(X_tr,
                       y_tr,
                       X_te,
                       y_te,
                       layers=layers,
                       alpha=params['alpha'],
                       mb_num=params['mb_num'],
                       max_epochs=200,
                       data_set=data_set)
Example #7
def main():
	# Prepare dataset.
	raw_file = './data/y_n_all' # raw data file.
	prepared_dir = './data/' # directory for the prepared data (sentences segmented, labels converted to numbers)
	# cut_mode = 'character'
	cut_mode = 'jieba'
	# cut_mode = '2-gram'
	vocab_dir = './data/Bayes_vocabulary'
	prepared_data,prepared_label = data_utils.prepare_data(raw_file,prepared_dir,cut_method = cut_mode, vocab_dir = vocab_dir)
	print('Get prepared dataset.')
	# pdb.set_trace()
	# print(prepared_data)

	# Get training and test dataset.
	training_dir = './data/train/'
	test_dir = './data/test/'
	ratio = 0
	train,test = data_utils.get_data(list(zip(prepared_data,prepared_label)),training_dir,test_dir,ratio = ratio, cut_method = cut_mode)
	print('Get training and test dataset.')
	# pdb.set_trace()
	# print(train.data)
	# train
	model_name = 'Bayes'
	config_dir = './model/'
	train_model = init_model(model_name,config_dir)
	print('Initialize the model.')
	training(train_model,train)
	print('Training finished.')
	# store the variable.
	v = {'model':train_model}
	model_file = train_model.model_path + 'model.pickle'
	with open(model_file,'wb') as f:
		pickle.dump(v, f)
Example #8
 def test(self, num_test=5):
     x_train, x_test, y_train, y_test = get_data()
     rand = np.random.randint(low=0, high=x_test.shape[0], size=num_test)
     x_test = x_test[rand]
     y_test = y_test[rand]
     predicts = self.model.predict(x_test)
     for i in range(num_test):
         compare([x_test[i], predicts[i], y_test[i]])
Example #9
 def train(self):
     x_train, x_test, y_train, y_test = get_data()
     self.model.fit(x_train,
                    y_train,
                    epochs=36,
                    batch_size=32,
                    shuffle=True,
                    validation_data=(x_test, y_test))
     self.model.save('pretrained/model.h5')
Example #10
def main():
    training_images, training_labels, testing_images, testing_labels = get_data(CIFAR10_FOLDER)

    (training_set, training_labels,
     testing_set, testing_labels) = subsample_data(training_images, training_labels,
                                                   testing_images, testing_labels,
                                                   training_num=5000, testing_num=500)
    normalize_data(training_set, testing_set)
    cross_validate_all_classifiers(training_set, training_labels,
                                   testing_set, testing_labels)
Example #11
def evaluate_model(weight_name):
    batch_size = 10
    x_train, y_train = get_data(aug=True, name='train')
    x_test, y_test = get_data(aug=False, name='test')
    num_data = len(x_test)
    [x_test] = img_standardization(x_train, x_test)
    x_test = _parse_function(x_test, im_size=224)
    dataset_test = get_dataset(x_test, y_test, batch_size, resize=False)

    model = tf.keras.models.load_model('./weight/' + weight_name, compile=True)
    # evaluate() computes the loss and the 'accuracy' metric, so the loaded model must be compiled (hence compile=True above)

    [loss, acc] = model.evaluate(dataset_test,
                                 steps=math.ceil(num_data / batch_size))

    print('TEST loss: ', loss)
    print('TEST acc: ', acc)

    return
Example #12
def main(argv):
    conf = configparser.ConfigParser()
    print(conf.read(argv))
    max_smi_len = conf.getint('model', 'max_smi_len')
    max_seq_len = conf.getint('model', 'max_seq_len')

    data_path = conf.get('model', 'data_path')

    ligands = pd.read_csv(data_path + 'ligands.csv',
                          header=None,
                          names=['id', 'smi'])
    proteins = pd.read_csv(data_path + 'proteins.csv',
                           header=None,
                           names=['id', 'seq'])
    pairs = pd.read_csv(data_path + 'pairs.csv', header=None)

    print(ligands.shape, proteins.shape, pairs.shape)

    char_smi_set = json.load(open(conf.get('model', 'char_smi')))
    char_seq_set = json.load(open(conf.get('model', 'char_seq')))

    smi_feature, seq_feature = get_data(ligands, proteins, max_smi_len,
                                        max_seq_len, char_smi_set,
                                        char_seq_set)
    print(smi_feature.shape, seq_feature.shape)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))
    model = CNN(filter_num=conf.getint('model', 'filter_num'),
                smi_window_len=conf.getint('model', 'smi_window_len'),
                seq_window_len=conf.getint('model', 'seq_window_len'),
                max_smi_len=max_smi_len,
                max_seq_len=max_seq_len,
                char_smi_set_size=len(char_smi_set),
                char_seq_set_size=len(char_seq_set),
                embed_dim=conf.getint('model', 'embed_dim'))

    trainy = np.asarray(pairs.iloc[:, 2]).reshape([-1, 1])
    trainX = []
    for idx, row in pairs.iterrows():
        ligand_index = ligands[ligands.id == row[0]].index.values[0]
        protein_index = proteins[proteins.id == row[1]].index.values[0]
        trainX.append([smi_feature[ligand_index], seq_feature[protein_index]])
    trainX = np.asarray(trainX)

    model_path = os.path.join(conf.get('model', 'path', fallback='tmp'),
                              'all.model')
    model.train(sess,
                trainX,
                trainy,
                nb_epoch=conf.getint('model', 'num_epoch'),
                batch_size=conf.getint('model', 'batch_size'),
                model_path=model_path)
Example #13
def main(argv):
    conf = configparser.ConfigParser()
    print(conf.read(argv))
    max_smi_len = conf.getint('model', 'max_smi_len')
    max_seq_len = conf.getint('model', 'max_seq_len')

    char_smi_set = json.load(open(conf.get('model', 'char_smi')))
    char_seq_set = json.load(open(conf.get('model', 'char_seq')))

    data_path = conf.get('data', 'path')
    data_predicted = conf.get('data', 'prediction').split(',')

    sess = tf.InteractiveSession(
        config=tf.ConfigProto(allow_soft_placement=True))

    ''' SMILES + seq '''
    model = CNN(filter_num=conf.getint('model', 'filter_num'),
                smi_window_len=conf.getint('model', 'smi_window_len'),
                seq_window_len=conf.getint('model', 'seq_window_len'),
                max_smi_len=max_smi_len,
                max_seq_len=max_seq_len,
                char_smi_set_size=len(char_smi_set),
                char_seq_set_size=len(char_seq_set),
                embed_dim=conf.getint('model', 'embed_dim'))
    ''' ECFP + seq '''
    # model = ECFPCNN(filter_num=conf.getint('model', 'filter_num'),
    #                 seq_window_len=conf.getint('model', 'seq_window_len'),
    #                 char_seq_set_size=len(char_seq_set),
    #                 embed_dim=conf.getint('model', 'embed_dim'),
    #                 max_smi_len=max_smi_len,
    #                 max_seq_len=max_seq_len)

    model_path = os.path.join(
        conf.get('model', 'path', fallback='tmp'), 'all.model')

    for data_name in data_predicted:
        path = data_path + data_name + '/'

        ligands = pd.read_csv(path + 'ligands.csv', header=None)
        proteins = pd.read_csv(path + 'proteins.csv', header=None)

        smi_feature, seq_feature = get_data(
            ligands, proteins, max_smi_len, max_seq_len, char_smi_set, char_seq_set)

        inputs = []
        for smif in smi_feature:
            inputs.append([smif, seq_feature[0]])
        res = model.predict(sess, np.asarray(inputs), batch_size=conf.getint(
            'model', 'batch_size'), model_path=model_path)
        names = [x.split('.')[0] for x in list(ligands.iloc[:, 0])]
        final_data = pd.DataFrame(np.asarray(list(zip(names, res))))
        final_data.to_csv(path + 'res.csv', index=None, header=None)
Example #14
def main(input_file_path, output_dir_path, main_task, protect_att):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    df = get_data(input_file_path)

    logger.info('read all tweets and removed duplicates')

    if main_task == 'sentiment':
        if protect_att == 'race':
            logger.info('making sentiment-race')
            pos_pos = get_attr_sentiments(df, happy, sad, 'aa',
                                          MIN_SENTENCE_LEN)
            pos_neg = get_attr_sentiments(df, happy, sad, 'wh',
                                          MIN_SENTENCE_LEN)
            neg_pos = get_attr_sentiments(df, sad, happy, 'aa',
                                          MIN_SENTENCE_LEN)
            neg_neg = get_attr_sentiments(df, sad, happy, 'wh',
                                          MIN_SENTENCE_LEN)
        else:
            logger.error('not supporting this task...')
            exit(-1)
    elif main_task == 'mention':
        if protect_att == 'race':
            logger.info('making mention-race')
            wh, aa = get_race(df, MIN_SENTENCE_LEN)
            pos_pos, neg_pos = mention_split(aa, MIN_SENTENCE_LEN)
            pos_neg, neg_neg = mention_split(wh, MIN_SENTENCE_LEN)
        else:
            logger.error('not supporting this task...')
            exit(-1)
    else:
        logger.error('not supporting this task...')
        exit(-1)

    logger.info('done collecting data')

    size = 100000
    sentences = (pos_pos[:size] + pos_neg[:size] +
                 neg_pos[:size] + neg_neg[:size])
    vocab = list(set([item for sublist in sentences for item in sublist]))
    id2voc = dict(enumerate(vocab))
    voc2id = {v: k for k, v in id2voc.items()}

    to_file(output_dir_path, voc2id, vocab, pos_pos[:size], pos_neg[:size],
            neg_pos[:size], neg_neg[:size])
    logger.info('written to file. exiting.')
Example #15
def step(model, optimizer, batch):
    """
    One batch of training
    :return: loss
    """
    # Here goes one batch of training
    q_seq, q_mask, d_seq, d_mask, target_span = get_data(
        batch,
        config.mode.lower() == 'train')
    model.zero_grad()
    # The loss is individual loss for each pair of question, context and answer
    loss, _, _ = model(q_seq, q_mask, d_seq, d_mask, target_span)
    loss.backward(retain_graph=True)
    optimizer.step()
    return loss
Example #16
def step(model, optimizer, batch, params):
    """
    One batch of training
    :return: loss
    """
    # Here goes one batch of training
    q_seq, q_mask, d_seq, d_mask, target_span = get_data(
        batch,
        config.mode.lower() == 'train')
    model.zero_grad()
    # The loss is individual loss for each pair of question, context and answer
    loss, start_pos, end_pos = model(q_seq, q_mask, d_seq, d_mask, target_span)
    loss = torch.sum(loss)
    loss.backward(retain_graph=True
                  )  # TODO : Is this causing memory consumption to increase
    clip_grad_norm_(params, config.max_grad_norm)
    optimizer.step()

    del start_pos, end_pos
    del q_mask, q_seq, d_mask, d_seq, target_span
    return loss
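For context, a training loop might drive step() roughly as follows; batch_iterator, model, optimizer and config.num_epochs are assumed to be set up elsewhere, and the names here are illustrative only:

# Hypothetical driver loop for step(); only sketches the calling convention.
params = [p for p in model.parameters() if p.requires_grad]
for epoch in range(config.num_epochs):
    for batch in batch_iterator:
        loss = step(model, optimizer, batch, params)
    print('epoch {}: last batch loss {:.4f}'.format(epoch, loss.item()))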
Example #17
def objective(params):
    train_data, val_data, test_data = get_data(params)
    # define a keras model only based on DNA

    try:
        K.clear_session()
        model = Janggu.create(get_model, params, train_data[0], train_data[1], name=params['name'])
        model.compile(optimizer=get_opt(params['opt']), loss='binary_crossentropy',
                      metrics=['acc'])
        hist = model.fit(train_data[0], train_data[1], epochs=params['epochs'], batch_size=64,
                         validation_data=val_data,
                         callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])
    except ValueError:
        traceback.print_stack()
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(repr(traceback.extract_tb(exc_traceback)))
        return {'status': 'fail'}
    print('#' * 40)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print('#' * 40)
    pred_test = model.predict(test_data[0])
    pred_val = model.predict(val_data[0])

    model.evaluate(val_data[0], val_data[1], callbacks=['auprc', 'auroc'], datatags=['val'])
    model.evaluate(test_data[0], test_data[1], callbacks=['auprc', 'auroc'], datatags=['test'])

    auprc_val = average_precision_score(val_data[1][:], pred_val)
    auprc_test = average_precision_score(test_data[1][:], pred_test)
    model.summary()
    print('auprc_val: {:.2%}'.format(auprc_val))
    print('auprc_test: {:.2%}'.format(auprc_test))
    return {'loss': hist.history['val_loss'][-1], 'status': 'ok', 'all_losses': hist.history,
            'auprc_val': auprc_val,
            'auprc_test': auprc_test,
            'model_config': model.kerasmodel.to_json(),
            'model_weights': model.kerasmodel.get_weights(),
            'concrete_params': params,
            'modelname': model.name}
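The returned dictionary follows the hyperopt objective convention ('loss', 'status', plus arbitrary extra keys). A minimal sketch of how such an objective might be driven, with a purely illustrative search space (the real space depends on get_model and get_data):

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space only; keys must match what objective() expects.
space = {
    'name': 'dna_model',
    'opt': hp.choice('opt', ['adam', 'sgd']),
    'epochs': 30,
}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print(best)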
Example #18
def infer(data_filepath='data/flowers.hdf5',
          z_dim=128,
          out_dir='gan',
          n_steps=10):

    G = load_model(out_dir)
    val_data = get_data(data_filepath, 'train')
    val_data = next(iterate_minibatches(val_data, 1))
    emb_fixed, txt_fixed = val_data[1], val_data[2]

    z_start = np.random.uniform(-1, 1, size=(1, z_dim))
    z_end = np.random.uniform(-1, 1, size=(1, z_dim))

    G.trainable = False
    for i in range(n_steps + 1):
        p = i / float(n_steps)
        z = z_start * (1 - p) + z_end * p
        fake_image = G.predict([z, emb_fixed])[0]
        img = ((fake_image + 1) * 0.5)
        plt.imsave("{}/fake_z_interpolation_i{}".format(out_dir, i), img)
        print(i,
              str(txt_fixed[0]).strip(),
              file=open("{}/fake_z_interpolation.txt".format(out_dir), "a"))
Example #19
def train(data_filepath='data/flowers.hdf5',
          ndf=64,
          ngf=128,
          z_dim=128,
          emb_dim=128,
          lr_d=5e-5,
          lr_g=5e-5,
          n_iterations=int(1e6),
          batch_size=64,
          iters_per_checkpoint=100,
          n_checkpoint_samples=16,
          out_dir='wgan_gp_lr5e-5'):
    global BATCH_SIZE

    BATCH_SIZE = batch_size
    logger = SummaryWriter(out_dir)
    logger.add_scalar('d_lr', lr_d, 0)
    logger.add_scalar('g_lr', lr_g, 0)
    train_data = get_data(data_filepath, 'train')
    val_data = get_data(data_filepath, 'valid')
    data_iterator = iterate_minibatches(train_data, batch_size)
    val_data_iterator = iterate_minibatches(val_data, n_checkpoint_samples)
    val_data = next(val_data_iterator)
    img_fixed = images_from_bytes(val_data[0])
    emb_fixed = val_data[1]
    txt_fixed = val_data[2]

    img_shape = img_fixed[0].shape
    emb_shape = emb_fixed[0].shape
    print("emb shape {}".format(img_shape))
    print("img shape {}".format(emb_shape))
    z_shape = (z_dim, )

    # plot real text for reference
    log_images(img_fixed, 'real', '0', logger)
    log_text(txt_fixed, 'real', '0', logger)

    # build models
    D = build_discriminator(img_shape, emb_shape, emb_dim, ndf)
    G = build_generator(z_shape, emb_shape, emb_dim, ngf)

    # build model outputs
    real_inputs = Input(shape=img_shape)
    txt_inputs = Input(shape=emb_shape)
    z_inputs = Input(shape=(z_dim, ))

    fake_samples = G([z_inputs, txt_inputs])
    averaged_samples = RandomWeightedAverage()([real_inputs, fake_samples])
    D_real = D([real_inputs, txt_inputs])
    D_fake = D([fake_samples, txt_inputs])
    D_averaged = D([averaged_samples, txt_inputs])

    # The gradient penalty loss function requires the input averaged samples to
    # get gradients. However, Keras loss functions can only have two arguments,
    # y_true and y_pred. We get around this by making a partial() of the
    # function with the averaged samples here.
    loss_gp = partial(loss_gradient_penalty,
                      averaged_samples=averaged_samples,
                      gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT)
    # Functions need names or Keras will throw an error
    loss_gp.__name__ = 'loss_gradient_penalty'

    # define D graph and optimizer
    G.trainable = False
    D.trainable = True
    D_model = Model(inputs=[real_inputs, txt_inputs, z_inputs],
                    outputs=[D_real, D_fake, D_averaged])
    D_model.compile(optimizer=Adam(lr_d, beta_1=0.5, beta_2=0.9),
                    loss=[loss_wasserstein, loss_wasserstein, loss_gp])

    # define D(G(z)) graph and optimizer
    G.trainable = True
    D.trainable = False
    G_model = Model(inputs=[z_inputs, txt_inputs], outputs=D_fake)
    G_model.compile(Adam(lr=lr_g, beta_1=0.5, beta_2=0.9),
                    loss=loss_wasserstein)

    ones = np.ones((batch_size, 1), dtype=np.float32)
    minus_ones = -ones
    dummy = np.zeros((batch_size, 1), dtype=np.float32)

    # fix a z vector for training evaluation
    z_fixed = np.random.uniform(-1, 1, size=(n_checkpoint_samples, z_dim))

    for i in range(n_iterations):
        D.trainable = True
        G.trainable = False
        for j in range(N_CRITIC_ITERS):
            z = np.random.normal(0, 1, size=(batch_size, z_dim))
            real_batch = next(data_iterator)
            losses_d = D_model.train_on_batch(
                [images_from_bytes(real_batch[0]), real_batch[1], z],
                [ones, minus_ones, dummy])

        D.trainable = False
        G.trainable = True
        z = np.random.normal(0, 1, size=(batch_size, z_dim))
        real_batch = next(data_iterator)
        loss_g = G_model.train_on_batch([z, real_batch[1]], ones)

        print("iter", i)
        if (i % iters_per_checkpoint) == 0:
            G.trainable = False
            fake_image = G.predict([z_fixed, emb_fixed])
            log_images(fake_image, 'val_fake', i, logger)
            log_images(img_fixed, 'val_real', i, logger)
            log_text(txt_fixed, 'val_fake', i, logger)

        log_losses(losses_d, loss_g, i, logger)
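The loss_gradient_penalty function referenced above is defined elsewhere in the project. Assuming the standard WGAN-GP formulation, a minimal Keras-style sketch might look like this (not the project's own implementation):

import numpy as np
from keras import backend as K


def loss_gradient_penalty(y_true, y_pred, averaged_samples,
                          gradient_penalty_weight):
    # Standard WGAN-GP penalty: (||grad D(x_hat)||_2 - 1)^2 evaluated on the
    # randomly weighted real/fake averages.
    gradients = K.gradients(y_pred, averaged_samples)[0]
    gradients_sqr = K.square(gradients)
    gradients_sqr_sum = K.sum(gradients_sqr,
                              axis=np.arange(1, len(gradients_sqr.shape)))
    gradient_l2_norm = K.sqrt(gradients_sqr_sum)
    penalty = gradient_penalty_weight * K.square(1. - gradient_l2_norm)
    return K.mean(penalty)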
Example #20
# --- get settings --- #
# parse command line arguments, or use defaults
parser = utils.rgan_options_parser()
settings = vars(parser.parse_args())
# if a settings file is specified, it overrides command line arguments/defaults
if settings['settings_file']:
    settings = utils.load_settings_from_file(settings)

# --- get data, split --- #
data_path = './experiments/data/' + settings['data_load_from'] + '.data.npy'
print('Loading data from', data_path)
settings["eval_single"] = False
settings["eval_an"] = False
samples, labels, index = data_utils.get_data(
    settings["data"], settings["seq_length"], settings["seq_step"],
    settings["num_signals"], settings["sub_id"], settings["eval_single"],
    settings["eval_an"], data_path)
# --- save settings, data --- #
# no need
print('Ready to run with settings:')
for (k, v) in settings.items():
    print(v, '\t', k)
# add the settings to local environment
# WARNING: at this point a lot of variables appear
locals().update(settings)
json.dump(settings,
          open('./experiments/settings/' + identifier + '.txt', 'w'),
          indent=0)


class myADclass():
Example #21
import os
import model
import tensorflow as tf
import data_utils
import configuration

data, labels, w2idx = data_utils.get_data(configuration.config['dataset'])

configuration.config['n_words'] = len(w2idx) + 1

with tf.Session() as sess:
    net = model.CNN(configuration.config, sess, w2idx)
    net.train(data, labels)
Example #22
import sys
sys.path.append("F:\学习\python项目")
from matplotlib import pyplot
import data_utils
import numpy


def draw_pic(data):
    x = [i for i in range(len(data))]
    y = data
    pyplot.scatter(x, y, 1, "red")
    pyplot.show()


if __name__ == "__main__":
    data = data_utils.get_data("F:/工作/正常.txt")
    data = data_utils.change_bad_data(data)
    draw_pic(data)
Example #23
def create_model(session, forward_only, beam_search):
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
    # Load the pretrained word-vector files
    vec_post, vec_response = data_utils.get_data(FLAGS.data_dir, FLAGS.post_vocab_size, FLAGS.response_vocab_size)
    print('============-===============', vec_post)
    print(len(vec_post[1]))
    model = seq2seq_model.Seq2SeqModel(
            FLAGS.post_vocab_size,
            FLAGS.response_vocab_size,
            _buckets,
            FLAGS.size,
            FLAGS.num_layers,
            FLAGS.max_gradient_norm,
            FLAGS.batch_size,
            FLAGS.learning_rate,
            FLAGS.learning_rate_decay_factor,
            wordEmbedding=vec_post,
            embedding_size=FLAGS.embedding_size,
            forward_only=forward_only,
            beam_search=beam_search,
            beam_size=FLAGS.beam_size,
            category=FLAGS.category,
            use_emb=FLAGS.use_emb,
            use_imemory=FLAGS.use_imemory,
            use_ememory=FLAGS.use_ememory,
            emotion_size=FLAGS.emotion_size,
            imemory_size=FLAGS.imemory_size,
            dtype=dtype)
    see_variable = True
    if see_variable == True:
        for i in tf.all_variables():
            print(i.name, i.get_shape())
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    pre_ckpt = tf.train.get_checkpoint_state(FLAGS.pretrain_dir)
    # Check whether a trained model checkpoint already exists
    if ckpt: #and tf.gfile.Exists(ckpt.model_checkpoint_path+".index"):
        if FLAGS.load_model == 0:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(session, ckpt.model_checkpoint_path)
        else:
            path = ckpt.model_checkpoint_path[:ckpt.model_checkpoint_path.find('-')+1]+str(FLAGS.load_model)
            print("Reading model parameters from %s" % path)
            model.saver.restore(session, path)
    else:
        # Initialize and retrain from scratch
        if pre_ckpt:
            session.run(tf.initialize_variables(model.initial_var))
            if FLAGS.pretrain > -1:
                path = pre_ckpt.model_checkpoint_path[:pre_ckpt.model_checkpoint_path.find('-')+1]+str(FLAGS.pretrain)
                print("Reading pretrain model parameters from %s" % path)
                model.pretrain_saver.restore(session, path)
            else:
                print("Reading pretrain model parameters from %s" % pre_ckpt.model_checkpoint_path)
                model.pretrain_saver.restore(session, pre_ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.global_variables_initializer())
            # vec_post, vec_response = data_utils.get_data(FLAGS.data_dir, FLAGS.post_vocab_size, FLAGS.response_vocab_size)
            # print('vec_post:', vec_post.shape)
            # print('vec_res:', vec_response)
            # initvec_post = tf.constant(vec_post, dtype=dtype, name='init_wordvector_post')
            # Initialize the decoder embedding by replacing it with the pretrained word vectors
            initvec_response = tf.constant(vec_response, dtype=dtype, name='init_wordvector_response')
            # embedding_post = [x for x in tf.trainable_variables() if x.name == 'embedding_attention_seq2seq/rnn/embedding_wrapper/embedding:0'][0]
            embedding_response = [x for x in tf.trainable_variables() if x.name == 'embedding_attention_seq2seq/embedding_attention_decoder/embedding:0'][0]
            print(type(embedding_response))
            print(embedding_response)
             # session.run(tf.assign(embedding_post, initvec_post))
             # session.run(tf.assign(embedding_response, initvec_response))
            # session.run(embedding_post.assign(initvec_post))
            session.run(embedding_response.assign(initvec_response))
    return model
Example #24
import data_utils
import numpy as np
from sklearn.model_selection import train_test_split
import GAN

keep_channels = ['C3']
trial_len = 1.5
X, y = data_utils.get_data("../data/CLASubjectB1512153StLRHand.mat", trial_len,
                           keep_channels)

X = X[y != 3]
y = y[y != 3]
# 0 is left hand
y[y == 1] = 0
# 1 is right hand
y[y == 2] = 1

interval_len = .45
X = data_utils.trim_intervals(X, .15, interval_len)

num_channels = len(keep_channels)
d2 = np.ceil(num_channels * interval_len / 0.005).astype(int)
X = X.reshape(642, d2)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    test_size=0.2)

gan = GAN.GAN((X, y),
              g_in=X.shape[1],
Example #25
import time
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform

from data_utils import get_data, split_data
from utils import DataFrameSelector, CombinedAttributesAdder, CustomLabelBinarizer

# loading the dataset
housing = get_data()

# split into train_set and test_set
train_set, test_set = split_data(housing)

housing_train = train_set.drop(
    "median_house_value",
    axis=1)  # median_house_value column contains the target values
housing_test = test_set.drop("median_house_value", axis=1)
housing_train_labels = train_set["median_house_value"].copy()
housing_test_labels = test_set["median_house_value"].copy()

# data preparation and prediction going to be done in a pipeline

# pipeline to preprocess numerical features
numerical_attributes = train_set.drop(
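The snippet is cut off at this point. A sketch of how the numerical preprocessing pipeline is commonly assembled from the imported helpers, assuming DataFrameSelector picks columns by name and CombinedAttributesAdder appends derived features (both are project utilities, so their exact signatures may differ):

num_attribs = list(housing_train.select_dtypes(include=[np.number]).columns)

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),   # pick numeric columns
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values
    ('attribs_adder', CombinedAttributesAdder()),   # add combined features
    ('std_scaler', StandardScaler()),               # standardize
])
housing_train_num = num_pipeline.fit_transform(housing_train)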
Example #26
def main(unused_argv):
  train_data, valid_data = data_utils.get_data()
  trainer = Trainer(train_data, valid_data, data_utils.IMAGE_NEW_SIZE ** 2)
  trainer.run()
Example #27
 arg_parser = ArgumentParser(description='train network')
 arg_parser.add_argument('--train', required=True,
                         help='HDF5 training data')
 arg_parser.add_argument('--test', required=True,
                         help='HDF5 testing data')
 arg_parser.add_argument('--epochs', type=int, default=100,
                         help='epochs for the training process')
 arg_parser.add_argument('--batch', type=int, default=64,
                         help='training batch size')
 arg_parser.add_argument('--seed', type=int, default=1234,
                         help='seed for the RNG')
 arg_parser.add_argument('file', help='HDF5 to store network')
 options = arg_parser.parse_args()
 np.random.seed(options.seed)
 (x_train, x_val, x_test,
  y_train, y_val, y_test) = get_data(options.train, options.test)
 input_shape = x_train.shape[1:]
 output_shape = y_train.shape[1:]
 model = config_model(input_shape, output_shape)
 history = model.fit(x_train, y_train, epochs=options.epochs,
                     batch_size=options.batch, verbose=0,
                     validation_data=(x_val, y_val))
 model.save(options.file)
 hist_filename = change_path_suffix(options.file, '_hist.h5')
 store_history(hist_filename, history)
 loss, accuracy = model.evaluate(x_train, y_train, verbose=0)
 print(f'training: loss = {loss:.3f}, accuracy = {accuracy:.3f}')
 loss, accuracy = model.evaluate(x_val, y_val, verbose=0)
 print(f'validation: loss = {loss:.3f}, accuracy = {accuracy:.3f}')
 loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
 print(f'test: loss = {loss:.3f}, accuracy = {accuracy:.3f}')
Example #28
import tensorflow as tf
from tensorflow.contrib import layers
from tqdm import *

import data_utils


ENCODER_UNITS = 10
DECODER_UNITS = 10
EMBEDDING_SIZE = 10


tf.app.flags.DEFINE_boolean('train_mode', True, 'Run in a training mode')
tf.app.flags.DEFINE_integer('num_epochs', 5000, 'Number of epochs')
FLAGS = tf.app.flags.FLAGS

sess = tf.Session()

questions, q_seq_length, answers_inputs, answers_targets, a_seq_length = data_utils.get_data('questions_tokenized.txt',
                                                                                             'answers_tokenized.txt')
inputs = tf.placeholder(tf.int32, [None, None], name='encoder_inputs')
decoder_inputs = tf.placeholder(tf.int32, [None, None], name='decoder_inputs')
decoder_targets = tf.placeholder(tf.int32, [None, None], name='decoder_targets')

vocabulary, rev_vocabulary, vocabulary_size = data_utils.initialize_vocabulary('vocabulary.txt')
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_SIZE], -1.0, 1.0), dtype=tf.float32,
                         name='embeddings')

inputs_embed = tf.nn.embedding_lookup(embeddings, inputs)
decoder_inputs_embed = tf.nn.embedding_lookup(embeddings, decoder_inputs)

questions_seq_length_pc = tf.placeholder(tf.int32, [None], name='questions_sequence_length')
answers_seq_length_pc = tf.placeholder(tf.int32, [None], name='answers_sequence_length')

with tf.variable_scope('encoder'):
Example #29
import json

dataset_used = 'MNIST'
device = 'cuda'
log_interval = 100
epochs = 5
batch_size = 32

model = Classifier(n_class=10).to(device)
model_name = 'classifier' + '_' + dataset_used + '_' + str(session_num)
if session_num > 1:
    last_model_name = 'classifier' + '_' + dataset_used + '_' + str(
        session_num - 1)
    model.load_state_dict(torch.load('models/' + last_model_name + '.pth'))
train_data_loader, val_data_loader, train_size = get_data(dataset_used,
                                                          batch_size,
                                                          get_mean_std=False)

optimizer = optim.Adam([{
    'params': model.encoder.parameters(),
    'lr': 1e-5
}, {
    'params': model.fc1.parameters()
}, {
    'params': model.fc2.parameters()
}],
                       lr=1e-3)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
losses = []
for epoch in range(epochs):
    epoch_loss = 0.0
Example #30
def main(unused_argv):
    train_data, valid_data = data_utils.get_data()
    trainer = Trainer(train_data, valid_data, data_utils.IMAGE_NEW_SIZE**2)
    trainer.run()
Example #31
*  initialize weights for generator and discriminator
*  learning rate scheduler
'''

# LOAD DATA
data_type = 'sine'
options = {
    'seq_length': 20,
    'num_samples': 16000,
    'num_signals': 1,
    'freq_low': 1,
    'freq_high': 5,
    'amplitude_low': 0.3,
    'amplitude_high': 0.9
}
data, _, _ = get_data(data_type, options)

print(data.shape)

batch_size = 8
shuffle = True
dataloader = DataLoader(data, batch_size=batch_size, shuffle=shuffle)

# INITIALIZE GEN AND DISC
z_dim = 10
device = 'cpu'  # GPU
hidden_dim_g = 200
hidden_dim_d = 100
in_dim = options['num_signals']

num_layers = 2
Example #32
class_names = ['no break', 'break']  # 0 is no break and 1 is break
frame_range = list(range(0, 56, 4))
num_classes = len(class_names)
part_frame_range = list(chunks(frame_range, 2))
for i, sub_range in enumerate(part_frame_range):
    print()
    print('PULLING PARTITION %d OF %d' % (i, len(part_frame_range) - 1))
    num_train = 34 * len(sub_range)  # 3400
    num_val = 2 * len(sub_range)  # 200
    num_test = 10 * len(sub_range)  # 188
    (X_train, y_train,
     X_val, y_val, X_test, y_test) = \
         data_utils.get_data(frame_range=sub_range,
                             num_train=num_train,
                             num_validation=num_val,
                             num_test=num_test,
                             feature_list=None,
                             reshape_frames=False,
                             crop_at_constr=True,
                             blur_im=True)

    # Create tensor objects, normalize and zero-center the data, and pass it
    # into data loaders (pixel means and standard deviations are hardcoded)
    X_train = torch.from_numpy(X_train)
    y_train = torch.from_numpy(y_train)
    X_val = torch.from_numpy(X_val)
    y_val = torch.from_numpy(y_val)
    X_test = torch.from_numpy(X_test)
    y_test = torch.from_numpy(y_test)
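The snippet ends before the normalization described in the comment above. A minimal sketch with purely illustrative mean and standard deviation values (the project hardcodes its own):

# Hypothetical normalization; replace PIXEL_MEAN/PIXEL_STD with the real values.
PIXEL_MEAN, PIXEL_STD = 0.5, 0.25
X_train = (X_train.float() - PIXEL_MEAN) / PIXEL_STD
X_val = (X_val.float() - PIXEL_MEAN) / PIXEL_STD
X_test = (X_test.float() - PIXEL_MEAN) / PIXEL_STD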
Example #33
def test_mlp(
        initial_learning_rate,
        learning_rate_decay,
        squared_filter_length_limit,
        n_epochs,
        batch_size,
        dropout,
        results_file_name,
        dataset,
        use_bias,
        get_mlp=get_mlp_default):
    """
    The dataset is the one from the mlp demo on deeplearning.net.  This training
    function is lifted from there almost exactly.

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    datasets = get_data(True)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    data_y = datasets[3]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    learning_rate = theano.shared(np.asarray(initial_learning_rate,
                                             dtype=theano.config.floatX))

    rng = np.random.RandomState()

    # construct the MLP class
    classifier = get_mlp(rng, use_bias, x)

    # Build the expression for the cost function.
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
                                 outputs=classifier.errors(y),
                                 givens={
                                     x: test_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: test_set_y[index * batch_size:(index + 1) * batch_size]})
    # theano.printing.pydotprint(test_model, outfile="test_file.png")
    test_auc = theano.function([index],
                               (classifier.layers[-1]).p_y_given_x,
                               givens={
                                   x: test_set_x[index * batch_size: (index + 1) * batch_size],
                                   # y: valid_set_y[index * batch_size: (index + 1) * batch_size]
                               })

    # Compile theano function for validation.
    validate_model = theano.function(inputs=[index],
                                     outputs=classifier.errors(y),
                                     givens={
                                         x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                                         y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # validate_auc = theano.function([index], classifier.layers[-1].p_y_given_x,
    validate_auc = theano.function([index],
                                   (classifier.layers[-1]).p_y_given_x,
                                   givens={
                                       x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                                       # y: valid_set_y[index * batch_size: (index + 1) * batch_size]
                                   })

    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    # ... and allocate memory for momentum'd versions of the gradient
    gparams_mom = []
    for param in classifier.params:
        gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,
                                            dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)
    T_mom = 50.
    # Compute momentum for the current epoch
    mom = ifelse(epoch < T_mom,
                 # bug fix...
                 (epoch / T_mom) * 0.5 + (1 - epoch / T_mom) * 0.5,
                 0.99)

    # Update the step direction using momentum
    updates = {}
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(classifier.params, gparams_mom):
        stepped_param = param - learning_rate * updates[gparam_mom]

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            squared_norms = T.sum(stepped_param ** 2, axis=1).reshape((stepped_param.shape[0], 1))
            scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    output = dropout_cost if dropout else cost
    # grads = T.grad(output, classifier.params)
    # updates = []
    # for param_i, grad_i in zip(classifier.params, grads):
    #     updates.append((param_i, param_i - learning_rate * grad_i))

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.

    train_model = theano.function(inputs=[epoch, index], outputs=output,
                                  updates=updates,
                                  on_unused_input='warn',
                                  givens=OrderedDict({
                                      x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                      y: train_set_y[index * batch_size:(index + 1) * batch_size]
                                  }))
    # theano.printing.pydotprint(train_model, outfile="train_file.png")

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
                                          updates={learning_rate: learning_rate * learning_rate_decay})

    if True:
        for loc, param in enumerate(classifier.params):
            if param.get_value().shape == (32,5,4,5,5):
                print 'saving images...'
                ff = param.get_value()[:,0,0,:,:].reshape((32,25))
                img = PIL.Image.fromarray(tile_raster_images(ff, (5, 5), (3, 5), tile_spacing=(1, 1)))
                img.save("ff-before" + str(loc) + ".png")
                ft = param.get_value()[0, :, 2, :, :].reshape((5, 25))
                img = PIL.Image.fromarray(tile_raster_images(ft, (5, 5), (1, 5), tile_spacing = (1,1)))
                img.save("ft-before" + str(loc) + ".png")


    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    best_params = None
    best_validation_errors = np.inf
    best_iter = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()
    patient = True
    patience_limit = 15

    # results_file = open(results_file_name, 'wb')

    while epoch_counter < n_epochs and patient:
        # Train this epoch
        epoch_counter = epoch_counter + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index)

        # Compute loss on validation set
        # validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
        # this_validation_errors = np.sum(validation_losses)
        probs = numpy.concatenate([validate_auc(i) for i
                                   in xrange(n_valid_batches)])[:, 1]

        fpr, tpr, _ = metrics.roc_curve(data_y[test_end:test_end + len(probs)], probs)
        this_validation_errors = 1 - metrics.auc(fpr, tpr)

        # only calc test error if needed
        if this_validation_errors < best_validation_errors:
            t_probs = numpy.concatenate([test_auc(i) for i
                                         in xrange(n_test_batches)])[:, 1]
            fpr, tpr, _ = metrics.roc_curve(data_y[train_end:train_end + len(t_probs)], t_probs)
            test_score = 1 - metrics.auc(fpr, tpr)

        # Report and save progress.
        print "epoch {}, valid error {}, learning_rate={}{}".format(
            epoch_counter, this_validation_errors * 100,
            learning_rate.get_value(borrow=True),
            " ** " + str(test_score) if this_validation_errors < best_validation_errors else "")

        if this_validation_errors < best_validation_errors:
            best_iter = epoch_counter
            best_validation_errors = this_validation_errors
            if False:
                for loc, param in enumerate(classifier.params):
                    if param.get_value().shape == (32, 5, 4, 5, 5):
                        print 'saving images...'
                        ff = param.get_value()[:, 0, 0, :, :].reshape((32, 25))
                        img = PIL.Image.fromarray(tile_raster_images(ff, (5, 5), (3, 5), tile_spacing=(1, 1)))
                        img.save("ff-after" + str(epoch_counter) + ".png")
                        ft = param.get_value()[0, :, 2, :, :].reshape((5, 25))
                        img = PIL.Image.fromarray(tile_raster_images(ft, (5, 5), (1, 5), tile_spacing=(1, 1)))
                        img.save("ft-after" + str(epoch_counter) + ".png")

        # results_file.write("{0}\n".format(this_validation_errors))
        # results_file.flush()

        decay_learning_rate()
        if epoch_counter - best_iter > patience_limit:
            patient = False
    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_errors * 100., best_iter, test_score * 100.))
    return test_score