def preprocess(self, query):
    """Validate an evaluation query and transform it into model features.

    Args:
        query: dict or pandas DataFrame holding the feature columns plus
            a 'y' target column.

    Returns:
        tuple: (subset_query, subset_indices, y) — the transformed feature
        matrix of a bootstrap subset whose encoded column count matches the
        full-query transform, the raw sampled rows, and the target column.

    Raises:
        Exception: if `query` is neither a dict nor a DataFrame.
    """
    ## input checks
    if isinstance(query, dict):
        query = pd.DataFrame(query)
    elif isinstance(query, pd.DataFrame):
        pass
    else:
        raise Exception(f"ERROR - FAIL:(model_evaluation) - invalid input. {type(query)} was given")

    X = query.drop(['y'], axis=1)
    y = query[['y']]
    # Reference transform on the full query; fixes the expected encoded
    # feature width a valid subset must reproduce.
    query_train = DataLoader().feature_pipeline(self.numerical, self.categorical).fit(X).transform(X)  # get from model trainer log

    # Grow the bootstrap sample until its encoded matrix is as wide as the
    # full transform (i.e. every category is represented). Fall back to the
    # full query if no sample size matches (also covers len(X) <= 10, which
    # previously left n_samples unbound).
    subset_indices = X
    subset_query = query_train
    for n_samples in range(10, len(X), 20):
        candidate_indices = X.sample(n=n_samples, replace=True)
        candidate_query = DataLoader().feature_pipeline(self.numerical, self.categorical).fit(candidate_indices).transform(candidate_indices)
        if candidate_query.shape[1] == query_train.shape[1]:
            subset_indices = candidate_indices
            subset_query = candidate_query
            break

    print(f'n_ : {subset_indices.shape[0]}')
    print(f'y_ : {subset_query.shape[1]}')

    return subset_query, subset_indices, y
# Exemplo n.º 2
def train(epochs=10, batch_size=50, lr=1e-3):
    """Train the unconditional handwriting model and save its weights.

    Args:
        epochs: number of training iterations (one generated batch each).
        batch_size: kept for interface compatibility; the dataloader
            controls batch generation here.
        lr: learning rate for RMSprop.

    Side effects:
        Writes the trained weights to 'unconditional.pt'.
    """
    dataloader = DataLoader('/data/strokes.npy')
    Model = model.Model()
    # NOTE(review): assumes the custom Model exposes .params(); a plain
    # torch nn.Module would use .parameters() — confirm against model.py.
    optimizer = torch.optim.RMSprop(Model.params(), lr=lr)

    for epoch in range(epochs):
        x, y = dataloader.generate_batch()
        x = torch.from_numpy(np.array(x))
        y = torch.from_numpy(np.array(y))

        # Split targets into the three per-point components.
        y1 = y[:, :, 0]
        y2 = y[:, :, 1]
        y3 = y[:, :, 2]  # bugfix: previously reassigned y2, leaving y3 undefined

        hidden = autograd.Variable(torch.randn(1, x.size(0), 121))

        e, pi, mu1, mu2, sigma1, sigma2, corr, hidden = Model(x, hidden)

        loss_val = loss(e, pi, mu1, mu2, sigma1, sigma2, corr, y1, y2, y3)

        optimizer.zero_grad()
        loss_val.backward()  # bugfix: was loss.backward() — loss is the function, not the value
        optimizer.step()

    # NOTE(review): assumes Model implements .save_dict(); torch modules
    # use .state_dict() — confirm against model.py.
    torch.save(Model.save_dict(), 'unconditional.pt')
def test_dataset():
    """Smoke-test QM9Dataset sampling and batching, with and without bond order."""

    def _check_first_sample(dataset, expected_adj_max):
        # Fix the RNG so the masked sample is reproducible.
        np.random.seed(0)
        item = dataset[0]

        assert item.length == 19
        assert item.targets == np.array(['N'])
        assert np.array_equal(item.target_mask, np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
        assert item.adj.max() == expected_adj_max

        loader = DataLoader(dataset, batch_size=10)
        _ = next(iter(loader))

    # Plain adjacency: entries are 0/1.
    _check_first_sample(QM9Dataset('data/adjacency_matrix_train.pkl'), 1)
    # With bond order the adjacency encodes multiplicity (max 2 here).
    _check_first_sample(QM9Dataset('data/adjacency_matrix_train.pkl', bond_order=True), 2)
# Exemplo n.º 4
def run_nn_dmi(args):
    """Train and evaluate an MLP with the DMI loss on the configured dataset."""
    set_global_seeds(args['seed'])

    loader = DataLoader(args['dataset'], args)
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)

    network = MLP(feature_dim=X_train.shape[-1],
                  hidsizes=args['hidsize'],
                  dropout=args['dropout'],
                  outputs=2)
    classifier = DMIClassifier(model=network, learning_rate=args['lr'])

    # Only log single-seed runs; multi-seed sweeps stay quiet.
    run_logger = logger if args['seeds'] == 1 else None
    return classifier.fit(X_train,
                          y_train,
                          X_test,
                          y_test,
                          batchsize=args['batchsize'],
                          episodes=args['episodes'],
                          logger=run_logger)
# Exemplo n.º 5
def run_pam(args):
    """Train a margin perceptron and return its test-set score."""
    set_global_seeds(args['seed'])

    loader = DataLoader(args['dataset'])
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)

    perceptron = Perceptron(feature_dim=X_train.shape[-1], margin=args['margin'])
    perceptron.fit(X_train, y_train)
    return perceptron.score(X_test, y_test)
# Exemplo n.º 6
def run_c_svm(args):
    """Train a class-weighted SVM and return its test-set score."""
    set_global_seeds(args['seed'])

    loader = DataLoader(args['dataset'], args)
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)

    # Class 0 keeps weight 1; the positive-class weight comes from args.
    svm = SVC(gamma='auto', class_weight={0: 1., 1: args['C1']})
    svm.fit(X_train, y_train)
    return svm.score(X_test, y_test)
def main_experiment(train_data, valid_data):
    """
    Question 8:
    """
    # Training in mini-batches of 32; the whole validation set in one batch.
    train_loader = DataLoader(train_data, batch_size=32)
    valid_loader = DataLoader(valid_data, batch_size=len(valid_data))

    classifier = MLPClassifier(constants.Circles.INPUT_DIM, constants.Circles.N_CLASSES, 10, 0.05, 50)
    classifier.train(train_loader, valid_loader,
                     log=os.path.join(constants.Circles.RESULTS_DIR, 'circles_log.txt'))
# Exemplo n.º 8
def find_best_c1(args):
    """Grid-search the positive-class weight; return validation scores per weight."""
    set_global_seeds(args['seed'])

    loader = DataLoader(args['dataset'], args)
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)

    def _validation_score(weight):
        # A fresh SVM per candidate weight; class 0 always keeps weight 1.
        svm = SVC(gamma='auto', class_weight={0: 1., 1: weight})
        svm.fit(X_train, y_train)
        return svm.score(X_val, y_val)

    return [_validation_score(c1) for c1 in CLASS_WEIGHTS]
# Exemplo n.º 9
def find_best_margin(args):
    """ return `best_margin / 0.1` """
    set_global_seeds(args['seed'])

    loader = DataLoader(args['dataset'])
    X_train, X_test, X_val, y_train, y_test, y_val = loader.prepare_train_test_val(args)

    def _validation_score(margin):
        # A fresh perceptron per candidate margin.
        perceptron = Perceptron(feature_dim=X_train.shape[-1], margin=margin)
        perceptron.fit(X_train, y_train)
        return perceptron.score(X_val, y_val)

    return [_validation_score(m) for m in MARGINS]
# Exemplo n.º 10
    def load_data(self, subset=False):
        """Load, validate, split and preprocess the configured dataset.

        Populates self.dataset, self.train_dataset/self.test_dataset, and
        the transformed self.X_train/self.y_train/self.X_test/self.y_test.

        Args:
            subset: when True, bootstrap-sample a fraction of the training
                data (self.subset_n_frac) to speed up unit tests; sampling
                repeats until the encoded feature width matches the full
                training transform.

        Raises:
            Exception: if schema validation of the loaded data fails.
        """
        LOG.info(f'loading {self.config.data.path} dataset .....')

        self.dataset = DataLoader().load_data(self.config.data)

        LOG.info("..... validating all data")

        try:
            validate = DataLoader().validate_schema(self.dataset)
            if validate is None:
                LOG.info("PASS: data validation passed.")
        except Exception as exc:  # was a bare except: keep the cause chained
            LOG.critical("FAIL: data validation failed.")
            raise Exception(
                "CRITICAL - FAIL:(dataloader) - invalid data schema") from exc

        self.train_dataset, self.test_dataset = DataLoader().preprocess_data(
            self.dataset, self.test_size, self.random_state)

        # Reference transform: fixes the expected encoded feature width.
        train_shape = DataLoader().feature_pipeline(self.numerical, self.categorical) \
            .fit(self.train_dataset).transform(self.train_dataset)

        # subset the data to enable faster unittests
        if subset:
            subset_query = np.empty(shape=(1, 1), dtype=object)
            # NOTE(review): loops until a bootstrap sample covers every
            # category; could spin for a long time if subset_n_frac is tiny.
            while subset_query.shape[1] != train_shape.shape[1]:
                subset_indices = self.train_dataset.sample(
                    frac=self.subset_n_frac, replace=True)
                subset_query = DataLoader().feature_pipeline(
                    self.numerical, self.categorical).fit(
                        subset_indices).transform(subset_indices)
                self.train_dataset = subset_indices

        self.X_train = DataLoader().feature_pipeline(self.numerical, self.categorical) \
            .fit(self.train_dataset).transform(self.train_dataset)
        self.y_train = DataLoader().target_pipeline(self.target).fit(self.train_dataset[self.target]) \
            .transform(self.train_dataset[self.target])

        self.X_test = DataLoader().feature_pipeline(self.numerical, self.categorical).fit(self.test_dataset) \
            .transform(self.test_dataset)
        self.y_test = DataLoader().target_pipeline(self.target).fit(self.test_dataset[self.target]) \
            .transform(self.test_dataset[self.target])
# Exemplo n.º 11
def main(argv):
    """Run prediction (or evaluation) over the input file with saved models."""
    log.info('Beginning prediction')
    funcs = pd.read_pickle(
        os.path.join(FLAGS.resources,
                     '{}.pkl'.format(FLAGS.function)))['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)

    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')

    data = DataLoader(filename=FLAGS.inputfile)

    # Both modes share the same iterator configuration; pure prediction
    # additionally flags the iterator as test data.
    iter_kwargs = dict(batchsize=FLAGS.batchsize,
                       size=FLAGS.testsize,
                       dataloader=data,
                       functype=FLAGS.function,
                       featuretype='ngrams')
    if FLAGS.evaluate:
        test_dataiter = DataIterator(**iter_kwargs)
        predict_evaluate(test_dataiter, 0.2, FLAGS.modelsdir)
    else:
        test_dataiter = DataIterator(test=True, **iter_kwargs)
        predict(test_dataiter, 0.2, FLAGS.modelsdir, funcs)
    def test_data_validation(self):
        """Check that the held-out test set matches the training schema.

        Validates the schema via DataLoader, coerces dict input to a
        DataFrame, and verifies the feature columns (excluding 'y') match
        the training dataset's columns.

        Raises:
            Exception: on schema validation failure, unsupported input
                type, or mismatched feature sets.
        """
        ## schema checks
        try:
            validate = DataLoader().validate_schema(self.test_dataset)
            if validate is None:
                LOG.info("PASS: Test data validation passed.")
        except Exception as exc:  # was a bare except: keep the cause chained
            raise Exception(
                "ERROR - FAIL:(model_evaluation) - invalid input schema.") from exc

        ## input checks
        if isinstance(self.test_dataset, dict):
            self.test_dataset = pd.DataFrame(self.test_dataset)
        elif isinstance(self.test_dataset, pd.DataFrame):
            pass
        else:
            # Report the offending type, not the whole payload (consistent
            # with the preprocess() error message).
            raise Exception(
                f"ERROR - FAIL:(model_evaluation) - invalid input. {type(self.test_dataset)} was given"
            )

        ## features check
        test_features = sorted(self.test_dataset.columns.drop(['y']).tolist())
        data_features = sorted(self.dataset.columns.drop(['y']).tolist())
        if test_features != data_features:
            print(f"test features: {','.join(test_features)}")
            raise Exception(
                "ERROR - FAIL:(model_evaluation) - invalid features present")
# Exemplo n.º 13
 def data_loader(self,
                 batch_size=10,
                 num_workers=4,
                 shuffle=False,
                 pin_memory=False):
     """Wrap this dataset in a DataLoader configured with the given options."""
     loader_opts = dict(batch_size=batch_size,
                        shuffle=shuffle,
                        num_workers=num_workers,
                        pin_memory=pin_memory)
     return DataLoader(self, **loader_opts)
def decision_boundaries(train_data, valid_data):
    """
    Question 5: Train the neural network using gradient descent on the two circles dataset.
    Plot the decision regions for several different values of the hyperparameters
    (weight decay, number of hidden units, early stopping) so as to illustrate their
    effect on the capacity of the model.
    """

    # raw data is only used to plot the decision boundary
    raw_data = np.loadtxt(open(constants.Circles.DATA_PATH, 'r'))
    X = raw_data[:, :2]
    y = raw_data[:, -1]

    # hyperparameters
    HIDDEN_DIM_SET = [8, 14]
    NUM_EPOCH_SET = [30]
    LEARNING_RATE_SET = [0.05]
    L1_WEIGH_DECAY = [0, 0.005]
    L2_WEIGH_DECAY = [0, 0.005]

    trainloader = DataLoader(train_data, batch_size=32)
    devloader = DataLoader(valid_data, batch_size=len(valid_data))

    # Enumerate the full hyperparameter grid up front, then train one model
    # per configuration and save its decision-region plot.
    grid = [(h, lr, l1, l2, n_epoch)
            for h in HIDDEN_DIM_SET
            for lr in LEARNING_RATE_SET
            for l1 in L1_WEIGH_DECAY
            for l2 in L2_WEIGH_DECAY
            for n_epoch in NUM_EPOCH_SET]

    for i, (h, lr, l1, l2, n_epoch) in enumerate(grid):
        print('\nhidden_dim: {}, lr: {}, l1: {}, l2: {}'.format(h, lr, l1, l2))
        mlp = MLPClassifier(constants.Circles.INPUT_DIM, constants.Circles.N_CLASSES, h, lr, n_epoch, l1, l2, l1, l2)
        mlp.train(trainloader, devloader)

        figure_name = 'decision_boundaries_{}.png'.format(i)

        visualize.plot_decision(
            X, y,
            path=os.path.join(constants.Circles.FIGURES_DIR, figure_name),
            model=mlp,
            param=[h, lr, n_epoch, l1, l2, l1, l2]
        )
# Exemplo n.º 15
    def __init__(self, args):
        """Load train/dev token data, build vocabularies, vectorizers,
        label encoders and the sequence-labelling model.

        Args:
            args: namespace with trainpath, devpath, pretrained_path,
                dropout and lr attributes.
        """
        self.args = args
        train = DataLoader(self.args.trainpath)
        dev = DataLoader(self.args.devpath)

        # Raw token/POS/chunk/label streams plus the padded lengths they imply.
        self.train_words, self.train_poss, self.train_chunks, self.train_labels = train.get_all_train_tokens(
        )
        self.train_max_sentence_len, self.train_max_word_len = train.get_required_max_len(
        )
        self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = dev.get_all_train_tokens(
        )
        self.dev_max_sentence_len, self.dev_max_word_len = dev.get_required_max_len(
        )

        # Word and character vocabularies come from the training split only.
        vocabulary = Vocabulary(self.train_words)
        self.vocab = vocabulary.get_word_vocab()
        self.char_vocab = vocabulary.get_char_vocab()

        # Both splits are vectorized with the *training* max lengths so the
        # model sees consistent tensor shapes.
        self.train_vect = Vectorizer(self.train_max_sentence_len,
                                     self.train_max_word_len, self.vocab,
                                     self.char_vocab, self.train_words)
        self.dev_vect = Vectorizer(self.train_max_sentence_len,
                                   self.train_max_word_len, self.vocab,
                                   self.char_vocab, self.dev_words)

        self.poss_vect = LabelEncoderModel(self.train_poss,
                                           self.train_max_sentence_len)
        self.chunks_vect = LabelEncoderModel(self.train_chunks,
                                             self.train_max_sentence_len)
        self.labels_vect = LabelEncoderModel(self.train_labels,
                                             self.train_max_sentence_len)

        # NOTE(review): the original author flagged this section as suspect
        # ("st wrong here") — verify the embedding weights below.
        self.pos_emb_weights = self.poss_vect.get_emb_weights()
        self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
        self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
            self.vocab, self.args.pretrained_path).pretrained_embedder()
        # len(set(sum(xs, []))) counts the distinct tags across all sentences.
        self.model = ModelTraining(
            self.args.dropout,
            self.args.lr,
            len(set(sum(self.train_labels, []))),
            len(self.vocab),
            len(self.char_vocab),
            self.train_max_word_len,
            len(set(sum(self.train_poss, []))),
            len(set(sum(self.train_chunks, []))),
            word_emb_dimensions=self.word_emb_dimensions,
            word_emb_weights=self.word_emb_weights,
            pos_emb_weights=self.pos_emb_weights,
            chunk_emb_weights=self.chunk_emb_weights).model_build()
# Exemplo n.º 16
def test_transformer_forward_cuda():
    """Forward one masked molecule through the transformer on the GPU."""
    qm9 = QM9Dataset('data/adjacency_matrix_train.pkl')

    # Deterministic masking and weight initialization.
    np.random.seed(0)
    torch.manual_seed(0)

    batch = next(iter(DataLoader(qm9, batch_size=1)))
    net = TransformerModel().cuda()
    # NOTE(review): assumes batch.cuda() moves its tensors in place — confirm.
    batch.cuda()
    result = net(batch)

    assert torch.equal(result['prediction'][batch.target_mask], torch.tensor([0]).cuda())
# Exemplo n.º 17
def deploy(path):
    """Load an exported inference model from `path`, run it on one test image
    and display the input next to the translated output."""
    assert os.path.exists(path), f'{path} not found : ('
    dataset = 'YOUR_DATASET_NAME'

    def img_postprocess(img):
        # Map from [-1, 1] back to [0, 1] and move channels last.
        assert isinstance(img, np.ndarray), type(img)
        img = img * 0.5 + 0.5
        img = img.squeeze(0).transpose((1, 2, 0))
        # BGR to RGB
        img = img[:, :, ::-1]
        return img

    img_size = 256
    test_transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    testA = ImageFolder(os.path.join('dataset', dataset, 'testA'), test_transform)
    with fluid.dygraph.guard():
        testA_loader = DataLoader(testA, batch_size=1, shuffle=False)
        real_A, _ = next(iter(testA_loader))
        in_np = real_A.numpy()

    # load model
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    program, feed_vars, fetch_vars = fluid.io.load_inference_model(path, exe)

    # inference
    fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars)

    in_img = img_postprocess(in_np)
    out_img = img_postprocess(fetch)
    # Side-by-side: input on the left, translation on the right.
    for position, title, image in ((121, 'real A', in_img), (122, 'A to B', out_img)):
        plt.subplot(position)
        plt.title(title)
        plt.imshow(image)
    plt.show()
def finite_difference_check(dataset, batch_size):
    """
    Computes the gradients for a single example, and
    check that the gradient is correct using the finite
    difference method.

    Answers to questions 1, 2, and 4.
    """
    dataloader = DataLoader(dataset, batch_size)
    # NOTE(review): next() is called on the loader itself — assumes this
    # custom DataLoader implements __next__.
    inputs, targets = next(dataloader)

    classifier = MLPClassifier(constants.Circles.INPUT_DIM, constants.Circles.N_CLASSES)
    gradHats, grads, param_names = classifier.finite_difference_check(inputs, targets)

    figure_path = os.path.join(
        constants.Circles.FIGURES_DIR,
        'finite_difference_check_batch_size_{}.png'.format(batch_size))

    visualize.plot_gradient(
        gradHats, grads,
        param_names,
        legend=['finite differences approx.', 'backpropagation'],
        path=figure_path
    )
# Exemplo n.º 19
def test_transformer_forward_cpu():
    """Forward a two-molecule batch on CPU; check predictions and the loss."""
    qm9 = QM9Dataset('data/adjacency_matrix_train.pkl', epsilon_greedy=0.5)

    # Deterministic masking and weight initialization.
    np.random.seed(0)
    torch.manual_seed(0)

    batch = next(iter(DataLoader(qm9, batch_size=2)))
    net = TransformerModel()
    result = net(batch)

    assert torch.equal(result['prediction'][batch.target_mask], torch.tensor([0, 4, 0, 4, 0, 0, 0]))

    criterion = CrossEntropyLoss()
    targets = batch.targets_num
    assert torch.equal(targets, torch.tensor([[2, 1, 3, 2, 1, 1], [1, 0, 0, 0, 0, 0]]))

    # Drop the padding entries (0) and shift labels to start at 0.
    targets = targets[targets != 0]
    targets -= 1
    assert torch.equal(targets, torch.tensor([1, 0, 2, 1, 0, 0, 0]))

    loss = criterion(result['out'][batch.target_mask], targets)
    assert torch.equal(loss, cross_entropy(result['out'][batch.target_mask], targets, reduction='none').mean())
# Exemplo n.º 20
    default='Transformer')
# Remaining CLI options for the training run.
parser.add_argument('--gamma', default=1, type=float)
# NOTE(review): argparse type=bool treats any non-empty string as True —
# passing "--bond_order False" still yields True; confirm intended usage.
parser.add_argument('--bond_order', default=False, type=bool)
parser.add_argument('--dataset', default='zinc', choices=['qm9', 'zinc'])
args = parser.parse_args()

# Choose scaffold-split or random-split pickle files for the chosen dataset.
train_file = f'data/{args.dataset}/adjacency_matrix_train_scaffold.pkl' if args.scaffold else f'data/{args.dataset}/adjacency_matrix_train.pkl'
validation_file = f'data/{args.dataset}/adjacency_matrix_validation_scaffold.pkl' if args.scaffold else f'data/{args.dataset}/adjacency_matrix_validation.pkl'

training = QM9Dataset(data=train_file,
                      num_masks=args.num_masks,
                      epsilon_greedy=args.epsilon_greedy,
                      num_fake=args.num_fake,
                      bond_order=args.bond_order)

train_dl = DataLoader(training, batch_size=args.batch_size)

# Create multiple validation dataloaders, one per masked-atom count (1..5).
val_dls = []
if args.num_fake == 0:
    for masks in range(1, 6):

        val_set = QM9Dataset(data=validation_file,
                             num_masks=masks,
                             bond_order=args.bond_order)
        val_dl = DataLoader(val_set, batch_size=args.batch_size)
        val_dls.append(val_dl)
if args.num_masks == 0:
    for fakes in range(1, 6):
# Exemplo n.º 21
    def __init__(self, isInjector=True):
        """Build the pix2pix-style 3D GAN: generator, discriminator and the
        combined generator-training model.

        Args:
            isInjector: when True, configure the injection model (unhealthy
                samples); otherwise the removal model (healthy samples).
        """
        self.isInjector = isInjector
        # Input shape
        cube_shape = config['cube_shape']
        self.img_rows = config['cube_shape'][1]
        self.img_cols = config['cube_shape'][2]
        self.img_depth = config['cube_shape'][0]
        self.channels = 1
        self.num_classes = 5
        self.img_shape = (self.img_rows, self.img_cols, self.img_depth,
                          self.channels)

        # Configure data loader
        if self.isInjector:
            self.dataset_path = config['unhealthy_samples']
            self.modelpath = config['modelpath_inject']
        else:
            self.dataset_path = config['healthy_samples']
            self.modelpath = config['modelpath_remove']

        self.dataloader = DataLoader(dataset_path=self.dataset_path,
                                     normdata_path=self.modelpath,
                                     img_res=(self.img_rows, self.img_cols,
                                              self.img_depth))

        # Calculate output shape of D (PatchGAN)
        patch = int(self.img_rows / 2**4)
        self.disc_patch = (patch, patch, patch, 1)

        # Number of filters in the first layer of G and D
        self.gf = 100
        self.df = 100

        # Two optimizers: the discriminator uses a much smaller LR.
        optimizer = Adam(0.0002, 0.5)
        optimizer_G = Adam(0.000001, 0.5)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.summary()
        self.discriminator.compile(loss='mse',
                                   optimizer=optimizer_G,
                                   metrics=['accuracy'])

        # -------------------------
        # Construct Computational
        #   Graph of Generator
        # -------------------------

        # Build the generator
        self.generator = self.build_generator()
        self.generator.summary()

        # Input images and their conditioning images
        img_A = Input(shape=self.img_shape)
        img_B = Input(shape=self.img_shape)

        # By conditioning on B generate a fake version of A
        fake_A = self.generator([img_B])

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # Discriminators determines validity of translated images / condition pairs
        valid = self.discriminator([fake_A, img_B])

        # Combined model: adversarial (mse) + reconstruction (mae, weight 100).
        self.combined = Model(inputs=[img_A, img_B], outputs=[valid, fake_A])
        self.combined.compile(loss=['mse', 'mae'],
                              loss_weights=[1, 100],
                              optimizer=optimizer)
# Exemplo n.º 22
class Trainer:
    """Trains a conditional 3D GAN (pix2pix-style U-Net generator plus
    PatchGAN discriminator) for injecting or removing structures in
    volumetric medical images, depending on `isInjector`."""

    def __init__(self, isInjector=True):
        """Build the generator, discriminator and combined training model.

        Args:
            isInjector: when True, configure the injection model (unhealthy
                samples); otherwise the removal model (healthy samples).
        """
        self.isInjector = isInjector
        # Input shape
        cube_shape = config['cube_shape']
        self.img_rows = config['cube_shape'][1]
        self.img_cols = config['cube_shape'][2]
        self.img_depth = config['cube_shape'][0]
        self.channels = 1
        self.num_classes = 5
        self.img_shape = (self.img_rows, self.img_cols, self.img_depth,
                          self.channels)

        # Configure data loader
        if self.isInjector:
            self.dataset_path = config['unhealthy_samples']
            self.modelpath = config['modelpath_inject']
        else:
            self.dataset_path = config['healthy_samples']
            self.modelpath = config['modelpath_remove']

        self.dataloader = DataLoader(dataset_path=self.dataset_path,
                                     normdata_path=self.modelpath,
                                     img_res=(self.img_rows, self.img_cols,
                                              self.img_depth))

        # Calculate output shape of D (PatchGAN)
        patch = int(self.img_rows / 2**4)
        self.disc_patch = (patch, patch, patch, 1)

        # Number of filters in the first layer of G and D
        self.gf = 100
        self.df = 100

        # Two optimizers: the discriminator uses a much smaller LR.
        optimizer = Adam(0.0002, 0.5)
        optimizer_G = Adam(0.000001, 0.5)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.summary()
        self.discriminator.compile(loss='mse',
                                   optimizer=optimizer_G,
                                   metrics=['accuracy'])

        # -------------------------
        # Construct Computational
        #   Graph of Generator
        # -------------------------

        # Build the generator
        self.generator = self.build_generator()
        self.generator.summary()

        # Input images and their conditioning images
        img_A = Input(shape=self.img_shape)
        img_B = Input(shape=self.img_shape)

        # By conditioning on B generate a fake version of A
        fake_A = self.generator([img_B])

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # Discriminators determines validity of translated images / condition pairs
        valid = self.discriminator([fake_A, img_B])

        # Combined model: adversarial (mse) + reconstruction (mae, weight 100).
        self.combined = Model(inputs=[img_A, img_B], outputs=[valid, fake_A])
        self.combined.compile(loss=['mse', 'mae'],
                              loss_weights=[1, 100],
                              optimizer=optimizer)

    def build_generator(self):
        """U-Net Generator"""
        def get_crop_shape(target, refer):
            # Per-axis crop amounts to make `target` match `refer`.
            # NOTE(review): `.value` on the shape difference implies the
            # TF1.x Dimension API — confirm the backend version.

            # depth, the 4rth dimension
            cd = (target.get_shape()[3] - refer.get_shape()[3]).value
            assert (cd >= 0)
            if cd % 2 != 0:
                cd1, cd2 = int(cd / 2), int(cd / 2) + 1
            else:
                cd1, cd2 = int(cd / 2), int(cd / 2)
            # width, the 3rd dimension
            cw = (target.get_shape()[2] - refer.get_shape()[2]).value
            assert (cw >= 0)
            if cw % 2 != 0:
                cw1, cw2 = int(cw / 2), int(cw / 2) + 1
            else:
                cw1, cw2 = int(cw / 2), int(cw / 2)
            # height, the 2nd dimension
            ch = (target.get_shape()[1] - refer.get_shape()[1]).value
            assert (ch >= 0)
            if ch % 2 != 0:
                ch1, ch2 = int(ch / 2), int(ch / 2) + 1
            else:
                ch1, ch2 = int(ch / 2), int(ch / 2)

            return (ch1, ch2), (cw1, cw2), (cd1, cd2)

        def conv3d(layer_input, filters, f_size=4, bn=True):
            """Layers used during downsampling"""
            d = Conv3D(filters, kernel_size=f_size, strides=2,
                       padding='same')(layer_input)
            d = LeakyReLU(alpha=0.2)(d)
            if bn:
                d = BatchNormalization(momentum=0.8)(d)
            return d

        def deconv3d(layer_input,
                     skip_input,
                     filters,
                     f_size=4,
                     dropout_rate=0.5):
            """Layers used during upsampling"""
            u = UpSampling3D(size=2)(layer_input)
            u = Conv3D(filters,
                       kernel_size=f_size,
                       strides=1,
                       padding='same',
                       activation='relu')(u)
            if dropout_rate:
                u = Dropout(dropout_rate)(u)
            u = BatchNormalization(momentum=0.8)(u)

            # u = Concatenate()([u, skip_input])
            # Crop the upsampled tensor so it aligns with the skip connection.
            ch, cw, cd = get_crop_shape(u, skip_input)
            crop_conv4 = Cropping3D(cropping=(ch, cw, cd),
                                    data_format="channels_last")(u)
            u = Concatenate()([crop_conv4, skip_input])
            return u

        # Image input
        d0 = Input(shape=self.img_shape, name="input_image")

        # Downsampling
        d1 = conv3d(d0, self.gf, bn=False)
        d2 = conv3d(d1, self.gf * 2)
        d3 = conv3d(d2, self.gf * 4)
        d4 = conv3d(d3, self.gf * 8)
        d5 = conv3d(d4, self.gf * 8)
        # Upsampling with skip connections back to the downsampling path.
        u3 = deconv3d(d5, d4, self.gf * 8)
        u4 = deconv3d(u3, d3, self.gf * 4)
        u5 = deconv3d(u4, d2, self.gf * 2)
        u6 = deconv3d(u5, d1, self.gf)

        u7 = UpSampling3D(size=2)(u6)
        output_img = Conv3D(self.channels,
                            kernel_size=4,
                            strides=1,
                            padding='same',
                            activation='tanh')(u7)

        return Model(inputs=[d0], outputs=[output_img])

    def build_discriminator(self):
        """PatchGAN discriminator over (image, condition) pairs."""
        def d_layer(layer_input, filters, f_size=4, bn=True):
            """Discriminator layer"""
            d = Conv3D(filters, kernel_size=f_size, strides=2,
                       padding='same')(layer_input)
            d = LeakyReLU(alpha=0.2)(d)
            if bn:
                d = BatchNormalization(momentum=0.8)(d)
            return d

        img_A = Input(shape=self.img_shape)
        img_B = Input(shape=self.img_shape)

        # Concatenate image and conditioning image by channels to produce input
        model_input = Concatenate(axis=-1)([img_A, img_B])

        d1 = d_layer(model_input, self.df, bn=False)
        d2 = d_layer(d1, self.df * 2)
        d3 = d_layer(d2, self.df * 4)
        d4 = d_layer(d3, self.df * 8)

        validity = Conv3D(1, kernel_size=4, strides=1, padding='same')(d4)

        return Model([img_A, img_B], validity)

    def train(self, epochs, batch_size=1, sample_interval=50):
        """Alternate discriminator/generator updates over `epochs` passes,
        saving models each epoch and sample images every `sample_interval`
        batches."""
        start_time = datetime.datetime.now()
        # Adversarial loss ground truths
        # NOTE(review): real targets are zeros and fake targets ones —
        # inverted from the usual GAN convention; confirm this label
        # flipping is intentional.
        valid = np.zeros((batch_size, ) + self.disc_patch)
        fake = np.ones((batch_size, ) + self.disc_patch)

        for epoch in range(epochs):
            # save model
            if epoch > 0:
                print("Saving Models...")
                self.generator.save(os.path.join(
                    self.modelpath, "G_model.h5"))  # creates a HDF5 file
                self.discriminator.save(
                    os.path.join(
                        self.modelpath,
                        "D_model.h5"))  # creates a HDF5 file 'my_model.h5'

            for batch_i, (imgs_A, imgs_B) in enumerate(
                    self.dataloader.load_batch(batch_size)):
                # ---------------------
                #  Train Discriminator
                # ---------------------
                # Condition on B and generate a translated version
                fake_A = self.generator.predict([imgs_B])

                # Train the discriminators (original images = real / generated = Fake)
                d_loss_real = self.discriminator.train_on_batch(
                    [imgs_A, imgs_B], valid)
                d_loss_fake = self.discriminator.train_on_batch(
                    [fake_A, imgs_B], fake)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # -----------------
                #  Train Generator
                # -----------------

                # Train the generators
                g_loss = self.combined.train_on_batch([imgs_A, imgs_B],
                                                      [valid, imgs_A])
                elapsed_time = datetime.datetime.now() - start_time
                # Plot the progress
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [D loss: %f, acc: %3d%%] [G loss: %f] time: %s"
                    % (epoch, epochs, batch_i, self.dataloader.n_batches,
                       d_loss[0], 100 * d_loss[1], g_loss[0], elapsed_time))

                # If at save interval => save generated image samples
                if batch_i % sample_interval == 0:
                    self.show_progress(epoch, batch_i)

    def show_progress(self, epoch, batch_i):
        """Save a 3x3 grid of (condition, generated, original) mid-slices."""
        filename = "%d_%d.png" % (epoch, batch_i)
        if self.isInjector:
            savepath = os.path.join(config['progress'], "injector")
        else:
            savepath = os.path.join(config['progress'], "remover")
        os.makedirs(savepath, exist_ok=True)
        r, c = 3, 3

        imgs_A, imgs_B = self.dataloader.load_data(batch_size=3,
                                                   is_testing=True)
        fake_A = self.generator.predict([imgs_B])

        gen_imgs = np.concatenate([imgs_B, fake_A, imgs_A])

        # Rescale images 0 - 1
        gen_imgs = 0.5 * gen_imgs + 0.5

        titles = ['Condition', 'Generated', 'Original']
        fig, axs = plt.subplots(r, c)
        cnt = 0
        for i in range(r):
            for j in range(c):
                # Show the central (index 16) slice of each volume.
                axs[i, j].imshow(gen_imgs[cnt].reshape(
                    (self.img_rows, self.img_cols, self.img_depth))[16, :, :])
                axs[i, j].set_title(titles[i])
                axs[i, j].axis('off')
                cnt += 1
        fig.savefig(os.path.join(savepath, filename))
        plt.close()
# Exemplo n.º 23
        path=constants.TRAIN_PATH,
        input_features=constants.INPUT_FEATURES,
        output_features=constants.OUTPUT_FEATURES,
        header=0,
        transform=lambda X: [x / 255 for x in X]
    )

    valid_data = Dataset(
        path=constants.VALID_PATH,
        input_features=constants.INPUT_FEATURES,
        output_features=constants.OUTPUT_FEATURES,
        header=0,
        transform=lambda X: [x / 255 for x in X]
    )

    trainloader = DataLoader(train_data, batch_size=constants.BATCH_SIZE)
    devloader = DataLoader(valid_data, batch_size=1000)

    mlp = MLPClassifier(
        input_size=constants.INPUT_DIM,
        hidden_size=constants.HIDDEN_DIM,
        output_size=constants.N_CLASSES,
        learning_rate=constants.LEARNING_RATE,
        num_epochs=constants.NUM_EPOCHS
    )

    loss_storage, acc_storage = mlp.train(
        trainloader,
        devloader,
        log=os.path.join(constants.RESULTS_DIR, 'mnist_log.txt')
    )
# Exemplo n.º 24
def main(argv):
    """Train a character-CNN encoder with a hierarchical GO decoder, then evaluate.

    Two modes, selected by ``FLAGS.predict``:
      * non-empty: skip training and evaluate the saved model named by the
        flag with a fixed decision threshold of 0.1;
      * empty: train for ``FLAGS.num_epochs`` epochs with early stopping on
        validation F1, checkpointing whenever F1 improves.
    """
    # Load the pickled function list for the chosen GO sub-ontology and let
    # GODAG rebuild its id mapping from it.
    funcs = pd.read_pickle(os.path.join(FLAGS.resources, '{}.pkl'.format(FLAGS.function)))['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)

    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')

    data = DataLoader(filename=FLAGS.inputfile)
    modelsavename = 'savedmodels_{}_{}'.format(__processor__, int(time.time()))
    if FLAGS.predict != '':
        # Prediction-only mode: reuse the saved-model directory given in the flag.
        modelsavename = FLAGS.predict
        bestthres = 0.1  # fixed threshold used by predict_evaluate below
        log.info('no training')
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                      dataloader=data, functype=FLAGS.function, featuretype='onehot')

        train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen, dataloader=data,
                                  numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                  functype=FLAGS.function, featuretype='onehot')
        # Advance both iterators once — presumably primes their internal
        # state/files before evaluation; confirm against DataIterator.
        next(valid_dataiter)
        next(train_iter)
    else:
        with tf.Session() as sess:
            valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                          dataloader=data, functype=FLAGS.function, featuretype='onehot')

            train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                      seqlen=FLAGS.maxseqlen, dataloader=data,
                                      numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                      functype=FLAGS.function, featuretype='onehot')

            # Vocabulary is the amino-acid map plus one extra slot
            # (presumably padding/unknown — confirm).
            encoder = CHARCNNEncoder(vocab_size=len(FeatureExtractor.aminoacidmap) + 1,
                                     inputsize=train_iter.expectedshape).build()
            log.info('built encoder')
            decoder = HierarchicalGODecoder(funcs, encoder.outputs, FLAGS.function).build(GODAG)
            log.info('built decoder')
            init = tf.global_variables_initializer()
            init.run(session=sess)
            chkpt = tf.train.Saver(max_to_keep=4)
            train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train',
                                              sess.graph)

            test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
            step = 0
            maxwait = 1  # early-stopping patience, in validation rounds
            wait = 0
            bestf1 = -1
            metagraphFlag = True  # write the meta graph only with the first checkpoint
            log.info('starting epochs')
            for epoch in range(FLAGS.num_epochs):
                for x, y in train_iter:
                    if x.shape[0] != y.shape[0]:
                        raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))

                    _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                                 feed_dict={decoder.ys_: y, encoder.xs_: x,
                                                            decoder.threshold: [0.2]})
                    train_writer.add_summary(summary, step)
                    log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                    step += 1

                if True:
                    log.info('beginning validation')
                    # validate returns per-threshold arrays; pick the index
                    # with the best (rounded) F1.
                    prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
                    thres = np.argmax(np.round(f1, 2))
                    log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                                     np.round(prec, 2)[thres],
                                                                                     np.round(recall, 2)[thres],
                                                                                     np.round(f1, 2)[thres]))
                    log.info('precision mat {}'.format(str(np.round(prec, 2))))
                    log.info('recall mat {}'.format(str(np.round(recall, 2))))
                    log.info('f1 mat {}'.format(str(np.round(f1, 2))))

                    # thres/10 + 0.1 maps the argmax index onto a threshold grid;
                    # presumably mirrors THRESHOLD_RANGE — TODO confirm.
                    log.info('selected threshold is {}'.format(thres/10 + 0.1))
                    # Checkpoint only on a meaningful (>1e-3) F1 improvement.
                    # NOTE(review): bestthres is assigned only inside this branch;
                    # if FLAGS.num_epochs were 0, the predict_evaluate call below
                    # would hit an unbound name — confirm flags guarantee >= 1 epoch.
                    if f1[thres] > (bestf1 + 1e-3):
                        bestf1 = f1[thres]
                        bestthres = THRESHOLD_RANGE[thres]
                        wait = 0
                        chkpt.save(sess, os.path.join(FLAGS.outputdir, modelsavename,
                                                        'model_{}_{}'.format(FLAGS.function, step)),
                                    global_step=step, write_meta_graph=metagraphFlag)
                        metagraphFlag = False
                    else:
                        wait += 1
                        if wait > maxwait:
                            log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                            break

                    step += 1  # the validation pass also advances the global step

                train_iter.reset()

    log.info('testing model')
    test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                 dataloader=data, functype=FLAGS.function, featuretype='onehot')
    # Evaluate the saved model at the selected threshold.
    prec, recall, f1 = predict_evaluate(test_dataiter, [bestthres], os.path.join(FLAGS.outputdir, modelsavename))
    log.info('test results')
    log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
    data.close()
def main(argv):
    """Train a MultiCharCNN encoder with a GORNN label-embedding decoder, then evaluate.

    Loads GO label embeddings from ``goEmbeddings.txt``, prepends a zero row
    for a synthetic STOPGO/NOGO label (shifting every GO id by one), trains
    with early stopping on validation F1 unless ``FLAGS.predict`` names a
    saved model, and finally evaluates on the test split.
    """
    goids = GODAG.initialize_idmap(None, None)

    labelembedding = load_labelembedding(os.path.join(FLAGS.resources, 'goEmbeddings.txt'), goids)
    # Embedding rows must match the known GO ids exactly (before the zero row is added).
    assert(labelembedding.shape[0] == (len(goids))) , 'label embeddings and known go ids differ'

    ## Add a row of zeros to refer to NOGO or STOPGO
    labelembedding = np.vstack([np.zeros(labelembedding.shape[1]), labelembedding]).astype(np.float32)
    labelembeddingsize = labelembedding.shape[1]  # NOTE(review): assigned but not used below

    # shift all goids by 1, to allow STOPGO
    GODAG.idmap = {key: (val + 1) for key, val in GODAG.idmap.items()}
    log.info('min go index - {}'.format(min(list(GODAG.idmap.values()))))
    GODAG.idmap['STOPGO'] = 0
    GODAG.GOIDS.insert(0, 'STOPGO')
    # Sanity log: the same GO id should be reachable through all four mappings.
    log.info('first from main-{}, from goids-{},  from idmap-{}, by reversemap-{}'.format(goids[0], GODAG.GOIDS[1], GODAG.id2node(1), GODAG.get_id(goids[0])))

    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')

    data = DataLoader(filename=FLAGS.inputfile)
    modelsavename = FLAGS.predict
    if FLAGS.predict == "":
        # No saved model given: train from scratch under a timestamped directory.
        modelsavename = 'savedmodels_{}'.format(int(time.time()))
        with tf.Session() as sess:
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                          dataloader=data, functype=FLAGS.function, featuretype='onehot',
                                          onlyLeafNodes=True, numfuncs=FLAGS.maxnumfuncs)


            train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                      seqlen=FLAGS.maxseqlen, dataloader=data,
                                      numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                      functype=FLAGS.function, featuretype='onehot', onlyLeafNodes=True, numfuncs=FLAGS.maxnumfuncs)

            #encoder = CNNEncoder(vocab_size=len(FeatureExtractor.ngrammap) + 1, inputsize=train_iter.expectedshape).build()

            # Vocabulary is the amino-acid map plus one extra slot
            # (presumably padding/unknown — confirm).
            encoder = MultiCharCNN(vocab_size=len(FeatureExtractor.aminoacidmap) + 1,
                                   inputsize=train_iter.expectedshape, with_dilation=False, charfilter=32,
                                   poolsize=80, poolstride=48).build()

            log.info('built encoder')
            decoder = GORNNDecoder(encoder.outputs, labelembedding, numfuncs=FLAGS.maxnumfuncs,
                                   trainlabelEmbedding=FLAGS.trainlabel, distancefunc=FLAGS.distancefunc, godag=GODAG).build()
            log.info('built decoder')

            init = tf.global_variables_initializer()
            init.run(session=sess)
            chkpt = tf.train.Saver(max_to_keep=4)
            train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train',
                                              sess.graph)

            test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
            step = 0
            maxwait = 2  # early-stopping patience, in validation rounds
            wait = 0
            bestf1 = 0
            bestthres = 0  # NOTE(review): never reassigned in this function
            metagraphFlag = True
            log.info('starting epochs')
            log.info('params - trainsize-{}, validsie-{}, rootfunc-{}, batchsize-{}'.format(FLAGS.trainsize, FLAGS.validationsize,
                                                                                            FLAGS.function, FLAGS.batchsize))
            for epoch in range(FLAGS.num_epochs):
                for x, y in train_iter:
                    if x.shape[0] != y.shape[0]:
                        raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))

                    # 10 negative label samples per example for the decoder
                    # (presumably a negative-sampling loss — see get_negatives).
                    negatives = get_negatives(y, 10)
                    _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                                 feed_dict={decoder.ys_: y[:, :FLAGS.maxnumfuncs], encoder.xs_: x,
                                                    decoder.negsamples: negatives, decoder.istraining: [True]})
                    train_writer.add_summary(summary, step)
                    log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                    step += 1

                log.info('beginning validation')
                prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
                log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                                 np.round(prec, 2),
                                                                                 np.round(recall, 2),
                                                                                 np.round(f1, 2)))
                # >= keeps the most recent checkpoint even on ties; the meta
                # graph is rewritten on every save (metagraphFlag stays True).
                if np.round(f1,2) >= (bestf1):
                    bestf1 = np.round(f1,2)
                    wait = 0
                    log.info('saving meta graph')
                    #ipdb.set_trace()
                    chkpt.save(sess, os.path.join(FLAGS.outputdir, modelsavename,
                                                    'model_{}_{}'.format(FLAGS.function, step)),
                                global_step=step, write_meta_graph=metagraphFlag)
                    metagraphFlag = True
                else:
                    wait += 1
                    if wait > maxwait:
                        log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                        break

                # Also measure training-set error using the same validate routine.
                train_iter.reset()
                prec, recall, f1 = validate(train_iter, sess, encoder, decoder, None)
                log.info('training error,epoch-{}, precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                                             np.round(prec, 2),
                                                                                             np.round(recall, 2),
                                                                                             np.round(f1, 2)))


                train_iter.reset()

    log.info('testing model')
    test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                 dataloader=data, functype=FLAGS.function, featuretype='onehot',
                                 onlyLeafNodes=True, numfuncs=FLAGS.maxnumfuncs)
    prec, recall, f1 = predict_evaluate(test_dataiter, os.path.join(FLAGS.outputdir, modelsavename))
    log.info('test results')
    log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
    data.close()
Exemplo n.º 26
0
from utils.bert import get_config, BertModel, BertForEmoji, set_learned_params
from torch import optim, nn
import torch
from utils.dataloader import DataLoader
from utils.train import train_model

# Build the data pipeline: 256-token sequences, mini-batches of 32.
train_dl, val_dl, TEXT, dataloaders_dict = DataLoader(max_length=256, batch_size=32)

# Read the model-configuration JSON into a config object.
config = get_config(file_path="./weights/bert_config.json")

# Instantiate the base BERT model from that configuration.
net_bert = BertModel(config)

# Copy the pretrained weights into the base BERT model.
net_bert = set_learned_params(net_bert, weights_path="./weights/pytorch_model.bin")

# Wrap BERT with the emoji-classification head.
net = BertForEmoji(net_bert)

# Put the network into training mode.
net.train()

# Freeze every parameter, then re-enable gradients for the last BertLayer only.
# NOTE(review): the original comment also mentioned the added classification
# adapter, but no code here unfreezes it — confirm this is intended.
for _name, parameter in net.named_parameters():
    parameter.requires_grad = False

for _name, parameter in net.bert.encoder.layer[-1].named_parameters():
    parameter.requires_grad = True
Exemplo n.º 27
0
def main(argv):
    """Train a CNN (ngram) encoder with a GORNN label-embedding decoder, then evaluate.

    Loads GO label embeddings (expected to already contain one extra row
    beyond the known GO ids), trains with early stopping on validation F1,
    and evaluates the saved model on the test split.
    """
    goids = GODAG.initialize_idmap(None, None)
    # GO_MAT = GODAG.get_fullmat(goids)
    # log.info('GO Matrix shape - {}'.format(GO_MAT.shape))
    # GO_MAT = np.vstack([np.zeros(GO_MAT.shape[1]), GO_MAT])
    labelembedding = load_labelembedding(os.path.join(FLAGS.data, 'goEmbeddings.txt'), goids)
    # The embedding matrix must carry exactly one extra row beyond the known ids.
    assert(labelembedding.shape[0] == (len(goids) + 1)) , 'label embeddings and known go ids differ'
    labelembeddingsize = labelembedding.shape[1]  # NOTE(review): assigned but not used below
    FeatureExtractor.load(FLAGS.data)
    log.info('Loaded amino acid and ngram mapping data')

    data = DataLoader()
    modelsavename = 'savedmodels_{}'.format(int(time.time()))
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.validationsize,
                                      dataloader=data, functype=FLAGS.function, featuretype='ngrams',
                                      onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)


        train_iter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen, dataloader=data,
                                  numfiles=np.floor((FLAGS.trainsize * FLAGS.batchsize) / 250000),
                                  functype=FLAGS.function, featuretype='ngrams', onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)

        # Vocabulary is the ngram map plus one extra slot (presumably padding — confirm).
        encoder = CNNEncoder(vocab_size=len(FeatureExtractor.ngrammap) + 1, inputsize=train_iter.expectedshape).build()
        log.info('built encoder')
        decoder = GORNNDecoder(encoder.outputs, labelembedding, numfuncs=FLAGS.maxnumfuncs).build()
        log.info('built decoder')
        init = tf.global_variables_initializer()
        init.run(session=sess)
        chkpt = tf.train.Saver(max_to_keep=4)
        train_writer = tf.summary.FileWriter(FLAGS.outputdir + '/train',
                                          sess.graph)

        test_writer = tf.summary.FileWriter(FLAGS.outputdir + '/test')
        step = 0
        maxwait = 1  # early-stopping patience, in validation rounds
        wait = 0
        bestf1 = 0
        bestthres = 0  # NOTE(review): never reassigned; predict_evaluate below always gets [0] — confirm intended
        metagraphFlag = True  # the meta graph is written only with the first checkpoint
        log.info('starting epochs')
        log.info('params - trainsize-{}, validsie-{}, rootfunc-{}, batchsize-{}'.format(FLAGS.trainsize, FLAGS.validationsize,
                                                                                        FLAGS.function, FLAGS.batchsize))
        for epoch in range(FLAGS.num_epochs):
            for x, y in train_iter:
                if x.shape[0] != y.shape[0]:
                    raise Exception('invalid, x-{}, y-{}'.format(str(x.shape), str(y.shape)))

                # 10 negative label samples per example for the decoder
                # (presumably a negative-sampling loss — see get_negatives).
                negatives = get_negatives(y, 10)
                _, loss, summary = sess.run([decoder.train, decoder.loss, decoder.summary],
                                            feed_dict={decoder.ys_: y, encoder.xs_: x,
                                                decoder.negsamples: negatives})
                train_writer.add_summary(summary, step)
                log.info('step-{}, loss-{}'.format(step, round(loss, 2)))
                step += 1

            log.info('beginning validation')
            prec, recall, f1 = validate(valid_dataiter, sess, encoder, decoder, test_writer)
            log.info('epoch: {} \n precision: {}, recall: {}, f1: {}'.format(epoch,
                                                                             np.round(prec, 2),
                                                                             np.round(recall, 2),
                                                                             np.round(f1, 2)))
            # Checkpoint only on a meaningful (>1e-3) F1 improvement.
            if f1 > (bestf1 + 1e-3):
                bestf1 = f1
                wait = 0
                chkpt.save(sess, os.path.join(FLAGS.outputdir, modelsavename,
                                                'model_{}_{}'.format(FLAGS.function, step)),
                            global_step=step, write_meta_graph=metagraphFlag)
                metagraphFlag = False
            else:
                wait += 1
                if wait > maxwait:
                    log.info('f1 didnt improve for last {} validation steps, so stopping'.format(maxwait))
                    break

            train_iter.reset()

    log.info('testing model')
    test_dataiter = DataIterator(batchsize=FLAGS.batchsize, size=FLAGS.testsize,
                                 dataloader=data, functype=FLAGS.function, featuretype='ngrams',
                                 onlyLeafNodes=True, limit=FLAGS.maxnumfuncs)
    prec, recall, f1 = predict_evaluate(test_dataiter, [bestthres], os.path.join(FLAGS.outputdir, modelsavename))
    log.info('test results')
    log.info('precision: {}, recall: {}, F1: {}'.format(round(prec, 2), round(recall, 2), round(f1, 2)))
    data.close()
Exemplo n.º 28
0
from utils.dataloader import DataLoader
import torch
from model import model_utils
from optimizer.optimizer import NoamOpt
from train.trainer import Trainer

# Transformer hyper-parameters.
hidden_size = 256
num_encoder = 6
num_decoder = 6
n_head = 8
pf_dim = 1024
drop_out = 0.5

# Device selection. NOTE(review): the CUDA choice is immediately overridden
# with 'cpu' — looks like a debugging leftover; confirm before changing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

# Build the data iterators and the seq2seq model.
dataloader = DataLoader(device)
train_iterator, valid_iterator, test_iterator = dataloader.load_data(64)
model = model_utils.create_model(
    dataloader.src_vocab_size(),
    dataloader.trg_vocab_size(),
    hidden_size,
    num_encoder,
    num_decoder,
    n_head,
    pf_dim,
    drop_out,
    dataloader.get_pad_idx(),
    device,
)

print(model_utils.count_parameters(model))
model_utils.init(model)

# Noam learning-rate schedule wrapped around Adam (lr driven by the schedule).
optimizer = NoamOpt(hidden_size, 1, 2000,
                    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

# Train for 5 epochs.
trainer = Trainer(train_iterator, valid_iterator, model, optimizer, dataloader.get_pad_idx(), device)
trainer.train(5)
Exemplo n.º 29
0
                               targets_global, predictions_global)
    return metric_per_length, lengths, metric_global


# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 248
val_iters_mask = []

# Corruption levels to evaluate against.
num_corrupted = [1, 2, 3, 4, 5, 20]

# One test loader per number of masked entries.
for masks in num_corrupted:
    test_set = QM9Dataset(data='data/adjacency_matrix_test.pkl', num_masks=masks)
    test_dl = DataLoader(test_set, batch_size=batch_size)
    val_iters_mask.append(test_dl)

# One test loader per number of fake entries.
val_iters_fake = []
for fakes in num_corrupted:
    test_set = QM9Dataset(data='data/adjacency_matrix_test.pkl', num_fake=fakes)
    test_dl = DataLoader(test_set, batch_size=batch_size)
    val_iters_fake.append(test_dl)

model_names = [
    "Transformer_num_masks=1_num_fake=0_num_same=0_num_layers=4_num_heads=3_embedding_dim=64_dropout=0.0_lr=0.001_edge_encoding=1_epsilon_greedy=0.2.pt",
    "BagOfWords_num_masks=1_num_fake=0_num_same=0_num_layers=4_embedding_dim=64_lr=0.0005_epsilon_greedy=0.2_bow_type=1.pt",
    "BagOfWords_num_masks=1_num_fake=0_num_same=0_num_layers=4_embedding_dim=64_lr=0.0005_epsilon_greedy=0.2_bow_type=2.pt",
Exemplo n.º 30
0
def main(argv):
    """Train a KerasDeepGO sequence model for the selected GO function and evaluate it.

    Optionally seeds the encoder with a pretrained ngram embedding
    (``FLAGS.pretrained``). Trains with Keras callbacks (best-only weight
    checkpoint, early stopping on validation loss), persists the architecture
    as JSON and the weights as H5, and evaluates the saved model on the test
    split.
    """
    # Function list for the chosen GO sub-ontology.
    funcs = pd.read_pickle(
        os.path.join(FLAGS.resources,
                     '{}.pkl'.format(FLAGS.function)))['functions'].values
    funcs = GODAG.initialize_idmap(funcs, FLAGS.function)

    log.info('GO DAG initialized. Updated function list-{}'.format(len(funcs)))
    FeatureExtractor.load(FLAGS.resources)
    log.info('Loaded amino acid and ngram mapping data')
    pretrained = None
    featuretype = 'onehot'
    if FLAGS.pretrained != '':
        # A pretrained embedding implies ngram features and replaces the ngram map.
        log.info('loading pretrained embedding')
        pretrained, ngrammap = load_pretrained_embedding(FLAGS.pretrained)
        FeatureExtractor.ngrammap = ngrammap
        featuretype = 'ngrams'

    with tf.Session() as sess:
        data = DataLoader(filename=FLAGS.inputfile)
        log.info('initializing validation data')
        # NOTE(review): featuretype is hard-coded to 'ngrams' for every iterator
        # in this function, even when the flag-derived `featuretype` above is
        # 'onehot' (which drives vocabsize) — confirm this mismatch is intentional.
        valid_dataiter = DataIterator(batchsize=FLAGS.batchsize,
                                      size=FLAGS.validationsize,
                                      dataloader=data,
                                      functype=FLAGS.function,
                                      featuretype='ngrams',
                                      numfuncs=len(funcs),
                                      all_labels=False,
                                      autoreset=True)

        log.info('initializing train data')
        train_iter = DataIterator(batchsize=FLAGS.batchsize,
                                  size=FLAGS.trainsize,
                                  seqlen=FLAGS.maxseqlen,
                                  dataloader=data,
                                  numfiles=4,
                                  numfuncs=len(funcs),
                                  functype=FLAGS.function,
                                  featuretype='ngrams',
                                  all_labels=False,
                                  autoreset=True)

        # Vocabulary size follows the selected feature type, plus one extra
        # slot (presumably padding/unknown — confirm).
        vocabsize = ((len(FeatureExtractor.ngrammap) +
                      1) if featuretype == 'ngrams' else
                     (len(FeatureExtractor.aminoacidmap) + 1))

        model = KerasDeepGO(funcs,
                            FLAGS.function,
                            GODAG,
                            train_iter.expectedshape,
                            vocabsize,
                            pretrained_embedding=pretrained).build()
        log.info('built encoder')
        log.info('built decoder')
        keras.backend.set_session(sess)
        log.info('starting epochs')

        # Save best weights only; stop after 10 epochs without val_loss improvement.
        model_path = FLAGS.outputdir + 'models/model_seq_' + FLAGS.function + '.h5'
        checkpointer = keras.callbacks.ModelCheckpoint(filepath=model_path,
                                                       verbose=1,
                                                       save_best_only=True,
                                                       save_weights_only=True)
        earlystopper = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                     patience=10,
                                                     verbose=1)

        # Persist the architecture separately from the checkpointed weights.
        model_jsonpath = FLAGS.outputdir + 'models/model_{}.json'.format(
            FLAGS.function)
        f = open(model_jsonpath, 'w')
        f.write(model.to_json())
        f.close()

        model.fit_generator(train_iter,
                            steps_per_epoch=FLAGS.trainsize,
                            epochs=5,
                            validation_data=valid_dataiter,
                            validation_steps=FLAGS.validationsize,
                            max_queue_size=128,
                            callbacks=[checkpointer, earlystopper])

        valid_dataiter.close()
        train_iter.close()

    log.info('initializing test data')
    test_dataiter = DataIterator(batchsize=FLAGS.batchsize,
                                 size=FLAGS.testsize,
                                 seqlen=FLAGS.maxseqlen,
                                 dataloader=data,
                                 numfiles=4,
                                 numfuncs=len(funcs),
                                 functype=FLAGS.function,
                                 featuretype='ngrams',
                                 all_labels=True)

    # Evaluate from the saved JSON architecture + H5 weights.
    prec, recall, f1 = predict_evaluate(test_dataiter, model_jsonpath,
                                        model_path)
    log.info('testing error, prec-{}, recall-{}, f1-{}'.format(
        np.round(prec, 3), np.round(recall, 3), np.round(f1, 3)))
    data.close()