Example No. 1
    def __init__(self, L, cell_line, descriptor='jtvae', n_fold=5, random_state=0, random_genes=False,
                 csv_file="", useChirality=False):
        """
            Parameters
            -----------
            L: dictionary from L1000CDS_subset.json

            cell_line: cell_id
                params:
                    string: 'VCAP', 'A549', 'A375', 'PC3', 'MCF7', 'HT29', etc.

            descriptor: descriptor for chemical compounds
                params:
                    string: 'ecfp', 'ecfp_autoencoder', 'maccs', 'topological', 'shed', 'cats2d', 'jtvae' (default)

            n_fold: number of folds
                params:
                    int: 5 (default)

            random_state: random state for KFold
                params:
                    int: 0 (default)

            random_genes: if True, only the following 20 randomly selected genes are returned from the target values
                params:
                    bool: False (default)
                    list of random genes: [118, 919, 274, 866, 354, 253, 207, 667, 773, 563,
                                           553, 918, 934, 81, 56, 232, 892, 485, 30, 53]

            csv_file: if not empty, the representation data is read from this file instead of being computed
                params:
                    string: "<csv_file_path>"

            useChirality: if True, chirality is taken into account for the descriptor
                (only usable with the 'ecfp' descriptor)
                params:
                    bool: False (default)
        """
        
        self.L = L
        self.cell_line = cell_line
        self.descriptor = descriptor
        self.n_fold = n_fold
        self.random_state = random_state
        self.random_genes = random_genes
        self.csv_file = csv_file
        self.useChirality = useChirality
        
        if self.useChirality and self.descriptor != 'ecfp':
            sys.exit('useChirality parameter is only usable with ecfp descriptor.')
            
        self.random_index_list = [118, 919, 274, 866, 354, 253, 207, 667, 773, 563,
                                  553, 918, 934, 81, 56, 232, 892, 485, 30, 53]

        self.LmGenes = []
        self.meta_smiles = pd.read_csv('meta_SMILES.csv')

        file_path = 'LmGenes.txt'
        with open(file_path) as fp:
            line = fp.readline()
            while line:
                self.LmGenes.append(line.strip())
                line = fp.readline()
        
        self.rep = Representation(self.descriptor)
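
A minimal usage sketch of the constructor documented above. It is hedged: it assumes the enclosing class is named GetData (as shown in Example No. 15), that L1000CDS_subset.json is available locally, and that meta_SMILES.csv and LmGenes.txt sit in the working directory, since the constructor reads both files.

import json

# Hypothetical usage; the file path and cell line are placeholders.
with open('L1000CDS_subset.json') as f:
    L = json.load(f)

getter = GetData(L, cell_line='MCF7', descriptor='ecfp', n_fold=5)
x, y, folds = getter.get_regression_data()  # method defined in Example No. 15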
Example No. 2
class BasicPipeline:

    def __init__(self, view, genes):
        self.representation = Representation(view)
        self.genes = genes

    def pca_view(self):
        pca = self.representation.pca()
        plt.plot(pca[2])
        plt.show()
        pdb.set_trace()
        plt.clf()
        return pca

    def pca_view_diff(self):
        tmp_view = self.representation.view
        self.representation.view = self.time_diff()
        pca = self.representation.pca()
        plt.plot(pca[2])
        plt.show()
        pdb.set_trace()
        plt.clf()
        self.representation.view = tmp_view
        return pca

    def time_diff(self):
        num_t = self.representation.view.shape[1]
        num_g = self.representation.view.shape[0]
        d_matrix = np.ndarray((num_g, num_t - 1))
        for i in range(1, num_t):
            d_matrix[:, i - 1] = self.representation.view[:, i] - self.representation.view[:, i - 1]
        return d_matrix

    def clean(self):
        print "Cleaning data..."
        U, S, V = self.representation.svd()
        new_view = np.ndarray(self.representation.view.shape)
        loadings = U[:, 0:param['clean_components']]
        for i in range(self.representation.view.shape[1]):
            feature_vector = self.representation.view[:, i]
            model = LinearRegression(fit_intercept=False)
            model.fit(loadings, feature_vector)
            residual = feature_vector - model.predict(loadings)
            new_view[:, i] = residual

        self.representation.view = scale(new_view)
        return self.representation.view
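
As a side note, the column-wise differencing in BasicPipeline.time_diff above is the same operation as numpy's built-in diff along axis 1. A minimal sketch verifying the equivalence, assuming view is a 2-D (genes x time points) array:

import numpy as np

view = np.arange(12, dtype=float).reshape(3, 4)  # toy (genes x time) matrix
d_loop = np.ndarray((view.shape[0], view.shape[1] - 1))
for i in range(1, view.shape[1]):
    d_loop[:, i - 1] = view[:, i] - view[:, i - 1]
assert np.allclose(d_loop, np.diff(view, axis=1))  # identical result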
Example No. 3
    def render_interactive(self):
        random.seed(config.seed)
        screen = pygame.display.set_mode(config.size)
        clock = pygame.time.Clock()
        speedLimits = simulation.speedLimits.SpeedLimits(
            config.speedLimits, config.maxSpeed)
        road = simulation.road.Road(config.lanes, config.length, speedLimits)
        simulation_ = SimulationManager(road, config.trafficGenerator,
                                        config.updateFrame)
        representation = Representation(screen, road, simulation_, config.data)
        while simulation_.running:
            for event in pygame.event.get():
                if event.type == pygame.KEYDOWN:
                    simulation_.processKey(event.key)
            clock.tick_busy_loop(config.maxFps)  # tick_busy_loop updates the clock
            dt = clock.get_time()  # time used in the previous tick
            simulation_.update(dt)  # updates logistics
            representation.draw(dt * simulation_.timeFactor)  # updates graphics
            pygame.display.flip()
Example No. 4
    def __init__(self, view, genes):
        self.representation = Representation(view)
        self.genes = genes
    exit()

config = importlib.import_module(sys.argv[1])

random.seed(config.seed)
pygame.init()
screen = pygame.display.set_mode(config.size)

clock = pygame.time.Clock()

simulation.car.Car.slowDownProbability = config.slowDownProbability
simulation.car.Car.laneChangeProbability = config.laneChangeProbability
speedLimits = simulation.speedLimits.SpeedLimits(config.speedLimits,
                                                 config.maxSpeed)
road = simulation.road.Road(config.lanes, config.length, speedLimits)
simulation = SimulationManager(road, config.trafficGenerator,
                               config.updateFrame)
representation = Representation(screen, road, simulation)

while simulation.running:
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            simulation.processKey(event.key)
    clock.tick_busy_loop(config.maxFps)
    dt = clock.get_time()
    simulation.update(dt)
    representation.draw(dt * simulation.timeFactor)
    pygame.display.flip()

print("Goodbye")
# The different configurations are generated here
config = importlib.import_module(sys.argv[1])

random.seed(config.seed)
pygame.init()
screen = pygame.display.set_mode(config.size)

clock = pygame.time.Clock()

simulation.car.Car.slowDownProbability = config.slowDownProbability
simulation.car.Car.laneChangeProbability = config.laneChangeProbability
speedLimits = simulation.speedLimits.SpeedLimits(config.speedLimits, config.maxSpeed)
road = simulation.road.Road(config.lanes, config.length, speedLimits)
simulation = SimulationManager(road, config.trafficGenerator, config.updateFrame)
representation = Representation(screen, road, simulation)

#[M]DataScience
datascience = DataScience(config, road, simulation, representation)

datascience.writeInput()
#[M]End DataScience

while simulation.running:
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            simulation.processKey(event.key)
            
    clock.tick_busy_loop(config.maxFps)
    dt = clock.get_time()
    # The simulation data appears here.
Example No. 7
def predict():
    # Set mixed precision policy
    if FLAGS.mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)
    # Make target dir
    if not os.path.exists(FLAGS.predict_dir):
        os.makedirs(FLAGS.predict_dir)
    # Get dataset
    dataset = _get_dataset(dataset=FLAGS.dataset,
                           label_mode=FLAGS.label_mode,
                           input_mode=FLAGS.input_mode,
                           input_length=FLAGS.input_length,
                           seq_shift=FLAGS.seq_shift,
                           def_val=DEF_VAL)
    num_event_classes = dataset.num_event_classes()
    # Define representation
    rep = Representation(blank_index=BLANK_INDEX,
                         def_val=DEF_VAL,
                         loss_mode=FLAGS.loss_mode,
                         num_event_classes=num_event_classes,
                         pad_val=PAD_VAL,
                         use_def=FLAGS.use_def,
                         decode_fn=FLAGS.decode_fn,
                         beam_width=FLAGS.beam_width)
    num_classes = rep.get_num_classes()
    # Get model and infer seq_length
    model = _get_model(model=FLAGS.model,
                       dataset=FLAGS.dataset,
                       num_classes=num_classes,
                       input_length=FLAGS.input_length,
                       l2_lambda=L2_LAMBDA)
    seq_length = model.get_seq_length()
    rep.set_seq_length(seq_length)
    # Make sure that seq_shift is set corresponding to model SEQ_POOL
    assert FLAGS.seq_shift == model.get_out_pool(), \
      "seq_shift should be equal to model.get_out_pool() in predict"
    # Load weights
    model.load_weights(
        os.path.join(FLAGS.model_dir, "checkpoints", FLAGS.model_ckpt))
    # Set up metrics
    metrics = PredMetrics(rep)
    # Files for predicting
    filenames = gfile.Glob(os.path.join(FLAGS.eval_dir, "*.tfrecord"))
    # For each filename, export logits
    for filename in filenames:
        # Get video id
        video_id = os.path.splitext(os.path.basename(filename))[0]
        export_csv = os.path.join(FLAGS.predict_dir, str(video_id) + ".csv")
        export_tfrecord = os.path.join(FLAGS.predict_dir, "logits",
                                       str(video_id) + ".tfrecord")
        logging.info("Working on {0}.".format(video_id))
        if os.path.exists(export_csv) and os.path.exists(export_tfrecord):
            logging.info(
                "Export files already exist. Skipping {0}.".format(filename))
            continue
        # Get the dataset
        label_fn = model.get_label_fn(1)
        collapse_fn = rep.get_loss_collapse_fn()
        data = dataset(batch_size=1,
                       data_dir=filename,
                       is_predicting=True,
                       is_training=False,
                       label_fn=label_fn,
                       collapse_fn=collapse_fn)
        # Iterate to get n and v_seq_length
        n = len(list(data))
        v_seq_length = n + seq_length - 1
        # Get the aggregators
        labels_aggregator = aggregation.ConcatAggregator(n=n,
                                                         idx=seq_length - 1)
        if seq_length == 1:
            logits_aggregator = aggregation.ConcatAggregator(n=n,
                                                             idx=seq_length -
                                                             1)
        else:
            logits_aggregator = aggregation.AverageAggregator(
                num_classes=num_classes, seq_length=seq_length)
        preds_aggregator = _get_preds_aggregator(
            predict_mode=FLAGS.predict_mode,
            n=n,
            rep=rep,
            v_seq_length=v_seq_length)
        # Iterate through batches
        # Write logits and labels to TFRecord for analysis
        if not os.path.exists(os.path.join(FLAGS.predict_dir, "logits")):
            os.makedirs(os.path.join(FLAGS.predict_dir, "logits"))
        with tf.io.TFRecordWriter(export_tfrecord) as tfrecord_writer:
            for i, (b_features, b_labels) in enumerate(data):
                # Assert sizes
                assert b_labels.shape == [1, seq_length
                                          ], "Labels shape [1, seq_length]"
                # Prediction step
                b_logits = pred_step(model, b_features)
                assert b_logits.shape == [
                    1, seq_length, rep.get_num_classes()
                ], "Logits shape [1, seq_length, num_classes]"
                # Aggregation step
                labels_aggregator.step(i, b_labels)
                logits_aggregator.step(i, b_logits)
                if preds_aggregator is not None:
                    preds_aggregator.step(i, b_logits)
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'example/logits': _floats_feature(
                            b_logits.numpy().ravel()),
                        'example/labels': _int64_feature(
                            b_labels.numpy().ravel())
                    }))
                tfrecord_writer.write(example.SerializeToString())
        # Get aggregated data
        labels = labels_aggregator.result()
        logits = logits_aggregator.result()
        preds = None
        if preds_aggregator is not None:
            preds = preds_aggregator.result()
        # Collapse on video level
        if preds is not None:
            preds = rep.get_inference_collapse_fn(v_seq_length)(preds)
        # Remove empty batch dimensions
        labels = tf.squeeze(labels, axis=0)
        logits = tf.squeeze(logits, axis=0)
        if preds is not None:
            preds = tf.squeeze(preds, axis=0)
        # Export probs for two stage model
        ids = [video_id] * v_seq_length
        if FLAGS.predict_mode == "probs":
            logging.info("Saving labels and probs")
            probs = tf.nn.softmax(logits, axis=-1)
            save_array = np.column_stack(
                (ids, labels.numpy().tolist(), probs.numpy().tolist()))
            np.savetxt(export_csv, save_array, delimiter=",", fmt='%s')
            continue
        # Update metrics for single stage model
        metrics.update(labels, preds)
        # Save for single stage model
        logging.info("Writing {0} examples to {1}.csv...".format(
            len(ids), video_id))
        save_array = np.column_stack(
            (ids, labels.numpy().tolist(), logits.numpy().tolist(),
             preds.numpy().tolist()))
        np.savetxt(export_csv, save_array, delimiter=",", fmt='%s')
    if FLAGS.predict_mode == "probs":
        # Finish
        exit()
    # Print metrics
    metrics.finish()
Example No. 8
def train_and_evaluate():
    """Train the model with custom training loop, evaluating at given intervals."""

    # Set mixed precision policy
    if FLAGS.mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # Get dataset
    dataset = _get_dataset(dataset=FLAGS.dataset,
                           label_mode=FLAGS.label_mode,
                           input_mode=FLAGS.input_mode,
                           input_length=FLAGS.input_length,
                           seq_shift=FLAGS.seq_shift,
                           def_val=DEF_VAL)

    # Define representation
    rep = Representation(blank_index=BLANK_INDEX,
                         def_val=DEF_VAL,
                         loss_mode=FLAGS.loss_mode,
                         num_event_classes=dataset.num_event_classes(),
                         pad_val=PAD_VAL,
                         use_def=FLAGS.use_def,
                         decode_fn=FLAGS.decode_fn,
                         beam_width=FLAGS.beam_width)

    # Get model
    model = _get_model(model=FLAGS.model,
                       dataset=FLAGS.dataset,
                       num_classes=rep.get_num_classes(),
                       input_length=FLAGS.input_length,
                       l2_lambda=L2_LAMBDA)
    seq_length = model.get_seq_length()
    rep.set_seq_length(seq_length)

    # Instantiate learning rate schedule and optimizer
    if FLAGS.lr_decay_fn == "exponential":
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=FLAGS.lr_base,
            decay_steps=LR_DECAY_STEPS,
            decay_rate=FLAGS.lr_decay_rate,
            staircase=True)
    elif FLAGS.lr_decay_fn == "piecewise_constant":
        values = np.divide(FLAGS.lr_base, LR_VALUE_DIV)
        lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=LR_BOUNDARIES, values=values.tolist())
    elif FLAGS.lr_decay_fn == "constant":
        lr_schedule = ConstantLR(FLAGS.lr_base)
    optimizer = Adam(learning_rate=lr_schedule)
    # Get LossScaleOptimizer
    if FLAGS.mixed_precision:
        optimizer = LossScaleOptimizer(optimizer=optimizer,
                                       loss_scale='dynamic')

    # Get loss function
    train_loss_fn = rep.get_loss_fn(batch_size=FLAGS.batch_size)
    eval_loss_fn = rep.get_loss_fn(batch_size=FLAGS.eval_batch_size)

    # Get train and eval dataset
    collapse_fn = rep.get_loss_collapse_fn()
    train_dataset = dataset(batch_size=FLAGS.batch_size,
                            data_dir=FLAGS.train_dir,
                            is_predicting=False,
                            is_training=True,
                            label_fn=model.get_label_fn(FLAGS.batch_size),
                            collapse_fn=collapse_fn,
                            num_shuffle=FLAGS.num_shuffle)
    eval_dataset = dataset(batch_size=FLAGS.eval_batch_size,
                           data_dir=FLAGS.eval_dir,
                           is_predicting=False,
                           is_training=False,
                           label_fn=model.get_label_fn(FLAGS.eval_batch_size),
                           collapse_fn=collapse_fn,
                           num_shuffle=FLAGS.num_shuffle)

    # Load model
    if FLAGS.model_ckpt is not None:
        logging.info("Loading model from {}".format(FLAGS.model_ckpt))
        load_status = model.load_weights(
            os.path.join(FLAGS.model_dir, "checkpoints", FLAGS.model_ckpt))
        load_status.assert_consumed()

    # Set up log writer and metrics
    train_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.model_dir, "log/train"))
    eval_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.model_dir, "log/eval"))
    train_metrics = TrainMetrics(representation=rep, writer=train_writer)
    eval_metrics = EvalMetrics(representation=rep, writer=eval_writer)

    # Save best checkpoints in terms of f1
    model_saver = ModelSaver(os.path.join(FLAGS.model_dir, "checkpoints"),
                             compare_fn=lambda x, y: x.score > y.score,
                             sort_reverse=True)

    # Keep track of total global step
    global_step = 0

    # Iterate over epochs
    for epoch in range(FLAGS.train_epochs):
        logging.info('Starting epoch %d' % (epoch, ))

        # Iterate over training batches
        for step, (train_features, train_labels, train_labels_c,
                   train_labels_l) in enumerate(train_dataset):
            # Assert sizes
            assert train_labels.shape == [
                FLAGS.batch_size, seq_length
            ], "Labels shape [batch_size, seq_length]"
            # Run the train step
            train_logits, train_loss, train_l2_loss, train_grads = train_step(
                model, train_features, train_labels, train_labels_c,
                train_labels_l, train_loss_fn, optimizer)
            # Assert sizes
            assert train_logits.shape == [
                FLAGS.batch_size, seq_length,
                rep.get_num_classes()
            ], "Logits shape [batch_size, seq_length, num_classes]"
            # Log every FLAGS.log_steps steps.
            if global_step % FLAGS.log_steps == 0:
                logging.info("Memory used: {} GB".format(
                    psutil.virtual_memory().used / 2**30))
                # Decode logits into predictions
                train_predictions_u = None
                if FLAGS.loss_mode == "ctc":
                    train_predictions_u, _ = rep.get_decode_fn(
                        FLAGS.batch_size)(train_logits)
                    train_predictions_u = rep.get_inference_collapse_fn()(
                        train_predictions_u)
                # General logs
                logging.info('Step %s in epoch %s; global step %s' %
                             (step, epoch, global_step))
                logging.info('Seen this epoch: %s samples' %
                             ((step + 1) * FLAGS.batch_size))
                logging.info('Total loss (this step): %s' %
                             float(train_loss + train_l2_loss))
                with train_writer.as_default():
                    tf.summary.scalar("training/global_gradient_norm",
                                      data=tf.linalg.global_norm(train_grads),
                                      step=global_step)
                    tf.summary.scalar('training/loss',
                                      data=train_loss,
                                      step=global_step)
                    tf.summary.scalar('training/l2_loss',
                                      data=train_l2_loss,
                                      step=global_step)
                    tf.summary.scalar('training/total_loss',
                                      data=train_loss + train_l2_loss,
                                      step=global_step)
                    tf.summary.scalar('training/learning_rate',
                                      data=lr_schedule(epoch),
                                      step=global_step)
                # Update metrics
                train_metrics.update(train_labels, train_logits,
                                     train_predictions_u)
                # Log metrics
                train_metrics.log(global_step)
                # Save latest model
                model_saver.save_latest(model=model,
                                        step=global_step,
                                        file="model")
                # Flush TensorBoard
                train_writer.flush()

            # Evaluate every FLAGS.eval_steps steps.
            if global_step % FLAGS.eval_steps == 0:
                logging.info('Evaluating at global step %s' % global_step)
                # Keep track of eval losses
                eval_losses = []
                eval_l2_losses = []
                # Iterate through eval batches
                for i, (eval_features, eval_labels, eval_labels_c,
                        eval_labels_l) in enumerate(eval_dataset):
                    # Assert sizes
                    assert eval_labels.shape == [
                        FLAGS.eval_batch_size, seq_length
                    ], "Labels shape [batch_size, seq_length]"
                    # Run the eval step
                    eval_logits, eval_loss, eval_l2_loss = eval_step(
                        model, eval_features, eval_labels, eval_labels_c,
                        eval_labels_l, eval_loss_fn)
                    eval_losses.append(eval_loss.numpy())
                    eval_l2_losses.append(eval_l2_loss.numpy())
                    # Assert sizes
                    assert eval_logits.shape == [
                        FLAGS.eval_batch_size, seq_length,
                        rep.get_num_classes()
                    ], "Logits shape [batch_size, seq_length, num_classes]"
                    # Decode logits into predictions
                    eval_predictions_u = None
                    if FLAGS.loss_mode == "ctc":
                        eval_predictions_u, _ = rep.get_decode_fn(
                            FLAGS.eval_batch_size)(eval_logits)
                        eval_predictions_u = rep.get_inference_collapse_fn()(
                            eval_predictions_u)
                    # Update metrics for this batch
                    eval_metrics.update_i(eval_labels, eval_logits,
                                          eval_predictions_u)
                # Update mean metrics
                eval_score = eval_metrics.update()
                # General logs
                eval_loss = np.mean(eval_losses)
                eval_l2_loss = np.mean(eval_l2_losses)
                logging.info('Evaluation loss: %s' %
                             float(eval_loss + eval_l2_loss))
                with eval_writer.as_default():
                    tf.summary.scalar('training/loss',
                                      data=eval_loss,
                                      step=global_step)
                    tf.summary.scalar('training/l2_loss',
                                      data=eval_l2_loss,
                                      step=global_step)
                    tf.summary.scalar('training/total_loss',
                                      data=eval_loss + eval_l2_loss,
                                      step=global_step)
                # Log metrics
                eval_metrics.log(global_step)
                # Save best models
                model_saver.save_best(model=model,
                                      score=float(eval_score),
                                      step=global_step,
                                      file="model")
                # Flush TensorBoard
                eval_writer.flush()

            # Clean up memory
            tf.keras.backend.clear_session()
            gc.collect()

            # Increment global step
            global_step += 1

        # Save and keep latest model for every 10th epoch
        if epoch % 10 == 9:
            model_saver.save_keep(model=model, step=global_step, file="model")

        logging.info('Finished epoch %s' % (epoch, ))
        optimizer.finish_epoch()

    # Save final model
    model_saver.save_latest(model=model, step=global_step, file="model")
    # Finished training
    logging.info("Finished training")
Example No. 9
    return x_train, y_train, x_predict, y_predict

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--datapath', default=None, help='The path of dataset file')
    parser.add_argument('--train', '-t', action='store_true', help='Training ATCNN model?')
    parser.add_argument('--predict', '-p', action='store_true', help='Applying ATCNN to prediction?')
    parser.add_argument('--readlabel', '-r', action='store_true', help='Whether to read the label')
    parser.add_argument('--niter', type=int, default=2500, help='Number of epochs to train ATCNN')
    parser.add_argument('--ratio', type=float, default=0.8, help='The ratio of training set to data set')
    parser.add_argument('--batchsize', type=int, default=128, help='Batch Size')
    opt = parser.parse_args()
    
    model = ATCNN_Ef_model()
    Rep = Representation()
    print(model.summary())
    #sys.exit(0)
    
    if opt.train:
        opt.readlabel = True
        x_data, y_data, _ = read_input(opt.datapath, opt.readlabel)
        x_train, y_train, x_predict, y_predict = data_split(x_data, y_data, split_ratio=opt.ratio)
        x_train = np.reshape(x_train, (len(x_train), 10, 10, 1))
        x_predict = np.reshape(x_predict, (len(x_predict), 10, 10, 1))
        
        model.fit(x_train, y_train, validation_split=0.02, batch_size=opt.batchsize, epochs=opt.niter)
        model.save('model.h5')
        loss = model.evaluate(x_predict, y_predict, batch_size=opt.batchsize)
        y_calc = model.predict(x_predict, batch_size=opt.batchsize)
        print('test set loss:', loss)
Example No. 10
def main(arg=None):
    # Make target dir
    export_dir = os.path.join(FLAGS.predict_dir,
                              "beam_width_" + str(FLAGS.beam_width))
    if not os.path.exists(export_dir):
        os.makedirs(export_dir)
    # Get representation and metrics
    seq_length = FLAGS.seq_length
    num_classes = FLAGS.num_classes
    rep = Representation(blank_index=BLANK_INDEX,
                         def_val=DEF_VAL,
                         loss_mode=None,
                         num_event_classes=num_classes - 1,
                         pad_val=PAD_VAL,
                         use_def=False,
                         decode_fn=FLAGS.decode_fn,
                         beam_width=FLAGS.beam_width)
    rep.set_seq_length(seq_length)
    metrics = PredMetrics(rep)
    # Find files
    filenames = sorted(gfile.Glob(os.path.join(FLAGS.logits_dir,
                                               "*.tfrecord")))
    # For each file
    for filename in filenames:
        # Get video id
        video_id = os.path.splitext(os.path.basename(filename))[0]
        export_csv = os.path.join(FLAGS.predict_dir,
                                  "beam_width_" + str(FLAGS.beam_width),
                                  str(video_id) + ".csv")
        logging.info("Working on {0}.".format(video_id))
        # Get data information
        data = tf.data.TFRecordDataset(filename)
        n = len(list(data))
        v_seq_length = n + seq_length - 1
        # Get the aggregators
        labels_aggregator = aggregation.ConcatAggregator(n=n,
                                                         idx=seq_length - 1)
        logits_aggregator = aggregation.AverageAggregator(
            num_classes=num_classes, seq_length=seq_length)
        decode_fn = rep.get_decode_fn(1)
        preds_aggregator = aggregation.BatchLevelVotedPredsAggregator(
            num_classes=num_classes,
            seq_length=seq_length,
            def_val=DEF_VAL,
            decode_fn=decode_fn)
        # Iterate through batches
        for i, batch_data in enumerate(data):
            b_logits, b_labels = parse(batch_data)
            # Aggregation step
            labels_aggregator.step(i, b_labels)
            logits_aggregator.step(i, b_logits)
            preds_aggregator.step(i, b_logits)
        # Get aggregated data
        labels = labels_aggregator.result()
        logits = logits_aggregator.result()
        preds = preds_aggregator.result()
        # Collapse on video level
        preds = rep.get_inference_collapse_fn(v_seq_length)(preds)
        # Remove empty batch dimensions
        labels = tf.squeeze(labels, axis=0)
        logits = tf.squeeze(logits, axis=0)
        preds = tf.squeeze(preds, axis=0)
        # Update metrics for single stage model
        metrics.update(labels, preds)
        # Save
        ids = [video_id] * v_seq_length
        logging.info("Writing {0} examples to {1}.csv...".format(
            len(ids), video_id))
        save_array = np.column_stack(
            (ids, labels.numpy().tolist(), logits.numpy().tolist(),
             preds.numpy().tolist()))
        np.savetxt(export_csv, save_array, delimiter=",", fmt='%s')
    # Print metrics
    metrics.finish()
Example No. 11
#pygame.init()
screen = pygame.display.set_mode(config.size)

clock = pygame.time.Clock()  #object created to keep track of time

#simulation.car.Car.slowDownProbability = config.slowDownProbability
#simulation.car.Car.laneChangeProbability = config.laneChangeProbability

speedLimits = simulation.speedLimits.SpeedLimits(
    config.speedLimits,
    config.maxSpeed)  #takes speedLimits and maxSpeed input from source file
road = simulation.road.Road(
    config.lanes, config.length, speedLimits
)  #road takes lane and length input from source file and speed limit from ^
simulation = SimulationManager(
    road, config.trafficGenerator, config.updateFrame
)  #simulation takes input from road, trafficgen from source file, and update frame from source file
representation = Representation(screen, road,
                                simulation)  #from above functions

while simulation.running:
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            simulation.processKey(event.key)
    clock.tick_busy_loop(config.maxFps)  # tick_busy_loop updates the clock
    dt = clock.get_time()  # time used in the previous tick
    simulation.update(dt)  #updates logistics
    #  representation.draw(dt * simulation.timeFactor) #updates graphics
    representation.batch(dt * simulation.timeFactor)  #batch mode
#  pygame.display.flip()
Example No. 12
config = importlib.import_module(
    'config.case')  # sys.argv[1] = e.g. .case or .trafficlight

random.seed(config.seed)  #this too
pygame.init()
screen = pygame.display.set_mode(config.size)

clock = pygame.time.Clock()  #object created to keep track of time

speedLimits = simulation.speedLimits.SpeedLimits(
    config.speedLimits,
    config.maxSpeed)  #takes speedLimits and maxSpeed input from source file
road = simulation.road.Road(
    config.lanes, config.length, speedLimits
)  #road takes lane and length input from source file and speed limit from ^
simulation = SimulationManager(
    road, config.trafficGenerator, config.updateFrame
)  #simulation takes input from road, trafficgen from source file, and update frame from source file
representation = Representation(screen, road,
                                simulation)  #from above functions

while simulation.running:
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            simulation.processKey(event.key)
    clock.tick_busy_loop(config.maxFps)  # tick_busy_loop updates the clock
    dt = clock.get_time()  # time used in the previous tick
    simulation.update(dt)  #updates logistics
    representation.draw(dt * simulation.timeFactor)  #updates graphics
    #  representation.batch(dt * simulation.timeFactor) #batch mode
    pygame.display.flip()
Example No. 13
import pandas as pd
from representation import Representation
from sklearn.metrics.pairwise import cosine_similarity

k = 5
R = Representation()
df_bigram, df2_bigram = R.bigram_count()

sim = cosine_similarity(df_bigram, df2_bigram)
df_sim = pd.DataFrame(sim, index=df_bigram.index, columns=df2_bigram.index)
best = df_sim[0].nlargest(k)

print(R.reading.iloc[0, -1])
print()
for each in best.index:
    print(R.df_songs.iloc[each, 0])
Example No. 14
from data_preprocessing import DataPreprocessing
from representation import Representation
from sklearn.model_selection import GridSearchCV
import numpy as np

# load data
X_public, y_public, X_eval = DataPreprocessing.loadData()

# split data
X_public_h, X_public_t, y_public_h, y_public_t = DataPreprocessing.splitData(X_public, y_public, 0.70)

# fix data
X_public_t = DataPreprocessing.fixData(X_public_t)
X_public_h = DataPreprocessing.fixData(X_public_h)

# scale data
X_public_t = DataPreprocessing.scalerData(X_public_t)


# ------------------------------------------------------------------------------
from representation import Representation
from d_tree import _DecisionTree as dt

model = dt.initGrid(X_public_t, y_public_t)
Representation.evaluate(model, X_public_h, y_public_h, "DecisionTreeBoosted", "")


# ------------------------------------------------------------------------------
X_eval = DataPreprocessing.scalerData(X_eval)
DataPreprocessing.saveData(model, X_eval)
Example No. 15
class GetData:
    
    def __init__(self, L, cell_line, descriptor='jtvae', n_fold=5, random_state=0, random_genes=False,
                 csv_file="", useChirality=False):
        """
            Parameters
            -----------
            L: dictionary from L1000CDS_subset.json

            cell_line: cell_id
                params:
                    string: 'VCAP', 'A549', 'A375', 'PC3', 'MCF7', 'HT29', etc.

            descriptor: descriptor for chemical compounds
                params:
                    string: 'ecfp', 'ecfp_autoencoder', 'maccs', 'topological', 'shed', 'cats2d', 'jtvae' (default)

            n_fold: number of folds
                params:
                    int: 5 (default)

            random_state: random state for KFold
                params:
                    int: 0 (default)

            random_genes: if True, only the following 20 randomly selected genes are returned from the target values
                params:
                    bool: False (default)
                    list of random genes: [118, 919, 274, 866, 354, 253, 207, 667, 773, 563,
                                           553, 918, 934, 81, 56, 232, 892, 485, 30, 53]

            csv_file: if not empty, the representation data is read from this file instead of being computed
                params:
                    string: "<csv_file_path>"

            useChirality: if True, chirality is taken into account for the descriptor
                (only usable with the 'ecfp' descriptor)
                params:
                    bool: False (default)
        """
        
        self.L = L
        self.cell_line = cell_line
        self.descriptor = descriptor
        self.n_fold = n_fold
        self.random_state = random_state
        self.random_genes = random_genes
        self.csv_file = csv_file
        self.useChirality = useChirality
        
        if self.useChirality and self.descriptor != 'ecfp':
            sys.exit('useChirality parameter is only usable with ecfp descriptor.')
            
        self.random_index_list = [118, 919, 274, 866, 354, 253, 207, 667, 773, 563,
                                  553, 918, 934, 81, 56, 232, 892, 485, 30, 53]

        self.LmGenes = []
        self.meta_smiles = pd.read_csv('meta_SMILES.csv')

        file_path = 'LmGenes.txt'
        with open(file_path) as fp:
            line = fp.readline()
            while line:
                self.LmGenes.append(line.strip())
                line = fp.readline()
        
        self.rep = Representation(self.descriptor)

    def get_regression_data(self):
        X = []
        Y = []
        perts = []
        unique_smiles = []
        counter = 0
        length = len(self.L[self.cell_line])
        
        print('Getting data...')
        
        data = None
        if len(self.csv_file) != 0:
            data = pd.read_csv(self.csv_file)

        for pert_id in self.L[self.cell_line]:
            counter += 1
            if counter % 10 == 0:
                print('%.1f %%    \r' % (counter / length * 100), end="")
                
            smiles = self.meta_smiles[self.meta_smiles['pert_id'] == pert_id]['SMILES'].values[0]
            if str(smiles) == 'nan' or str(smiles) == '-666':
                continue

            if not self.useChirality:
                mol = Chem.MolFromSmiles(smiles)
                canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
            else:
                canonical_smiles = smiles

            if canonical_smiles in unique_smiles or len(canonical_smiles) > 120:
                continue
            if data is not None:
                if data[data['pert_id'] == pert_id].empty:
                    continue
                else:
                    feature = data[data['pert_id'] == pert_id].drop(['pert_id'], axis=1).values[0].tolist()
            else:
                feature = self.rep.get_representation(smiles=canonical_smiles, descriptor=self.descriptor,
                                                      useChirality=self.useChirality)

            unique_smiles.append(canonical_smiles)
            labels = self.L[self.cell_line][pert_id]['chdirLm']
            X.append(feature)
            Y.append(labels)
            perts.append(pert_id)

        x = np.asarray(X)
        y = np.asarray(Y)

        x_columns = ['SMILES']
        if self.descriptor == 'ecfp':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_' + str(i + 1))
        elif self.descriptor == 'ecfp_autoencoder':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_autoencoder_' + str(i + 1))
        elif self.descriptor == 'topological':
            for i in range(x.shape[1]-1):
                x_columns.append('topological_' + str(i + 1))
        elif self.descriptor == 'maccs':
            for i in range(x.shape[1]-1):
                x_columns.append('maccs_' + str(i + 1))
        elif self.descriptor == 'jtvae':
            for i in range(x.shape[1]-1):
                x_columns.append('jtvae_' + str(i + 1))
        elif self.descriptor == 'shed':
            for i in range(x.shape[1]-1):
                x_columns.append('shed_' + str(i + 1))
        elif self.descriptor == 'cats2d':
            for i in range(x.shape[1]-1):
                x_columns.append('cats2d_' + str(i + 1))

        x = pd.DataFrame(x, index=perts, columns=x_columns)
        y = pd.DataFrame(y, index=perts)
        folds = list(KFold(self.n_fold, shuffle=True, random_state=self.random_state).split(x))

        if self.random_genes:
            y_random = []
            for i in self.random_index_list:
                y_random.append(y.iloc[:, i:i + 1])
            df = y_random[0]
            for i in range(len(y_random) - 1):
                df = pd.concat([df, y_random[i + 1]], axis=1)
            y = df

        print('\nDone.')
        
        return x, y, folds

    def get_up_genes(self):
        X = []
        Y = []
        perts = []
        unique_smiles = []
        counter = 0
        length = len(self.L[self.cell_line])
        
        print('Getting data...')
        
        class_dict = {}
        data = None
        if len(self.csv_file) != 0:
            data = pd.read_csv(self.csv_file)

        for gene in self.LmGenes:
            class_dict.update({gene: 0})

        for pert_id in self.L[self.cell_line]:
            counter += 1
            if counter % 10 == 0:
                print('%.1f %%    \r' % (counter / length * 100), end="")
                
            if 'upGenes' not in self.L[self.cell_line][pert_id]:
                continue

            smiles = self.meta_smiles[self.meta_smiles['pert_id'] == pert_id]['SMILES'].values[0]
            if str(smiles) == 'nan' or str(smiles) == '-666':
                continue

            if not self.useChirality:
                mol = Chem.MolFromSmiles(smiles)
                canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
            else:
                canonical_smiles = smiles

            if canonical_smiles in unique_smiles or len(canonical_smiles) > 120:
                continue
            if data is not None:
                if data[data['pert_id'] == pert_id].empty:
                    continue
                else:
                    feature = data[data['pert_id'] == pert_id].drop(['pert_id'], axis=1).values[0].tolist()
            else:
                feature = self.rep.get_representation(smiles=canonical_smiles, descriptor=self.descriptor,
                                                      useChirality=self.useChirality)

            unique_smiles.append(canonical_smiles)
            up_genes = list(set(self.L[self.cell_line][pert_id]['upGenes']))
            class_dict = dict.fromkeys(class_dict, 0)

            for gene in up_genes:
                if gene in class_dict:
                    class_dict.update({gene: 1})

            labels = np.fromiter(class_dict.values(), dtype=int)
            X.append(feature)
            Y.append(labels)
            perts.append(pert_id)

        x = np.asarray(X)
        y = np.asarray(Y)

        x_columns = ['SMILES']
        if self.descriptor == 'ecfp':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_' + str(i + 1))
        elif self.descriptor == 'ecfp_autoencoder':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_autoencoder_' + str(i + 1))
        elif self.descriptor == 'topological':
            for i in range(x.shape[1]-1):
                x_columns.append('topological_' + str(i + 1))
        elif self.descriptor == 'maccs':
            for i in range(x.shape[1]-1):
                x_columns.append('maccs_' + str(i + 1))
        elif self.descriptor == 'jtvae':
            for i in range(x.shape[1]-1):
                x_columns.append('jtvae_' + str(i + 1))
        elif self.descriptor == 'shed':
            for i in range(x.shape[1]-1):
                x_columns.append('shed_' + str(i + 1))
        elif self.descriptor == 'cats2d':
            for i in range(x.shape[1]-1):
                x_columns.append('cats2d_' + str(i + 1))

        x = pd.DataFrame(x, index=perts, columns=x_columns)
        y = pd.DataFrame(y, index=perts)
        folds = list(KFold(self.n_fold, shuffle=True, random_state=self.random_state).split(x))

        if self.random_genes:
            y_random = []
            for i in self.random_index_list:
                y_random.append(y.iloc[:, i:i + 1])
            df = y_random[0]
            for i in range(len(y_random) - 1):
                df = pd.concat([df, y_random[i + 1]], axis=1)
            y = df

        print('\nDone.')
        
        return x, y, folds

    def get_down_genes(self):
        X = []
        Y = []
        perts = []
        unique_smiles = []
        counter = 0
        length = len(self.L[self.cell_line])
        
        print('Getting data...')
        
        class_dict = {}
        data = None
        if len(self.csv_file) != 0:
            data = pd.read_csv(self.csv_file)

        for gene in self.LmGenes:
            class_dict.update({gene: 0})

        for pert_id in self.L[self.cell_line]:
            counter += 1
            if counter % 10 == 0:
                print('%.1f %%    \r' % (counter / length * 100), end="")
                
            if 'dnGenes' not in self.L[self.cell_line][pert_id]:
                continue

            smiles = self.meta_smiles[self.meta_smiles['pert_id'] == pert_id]['SMILES'].values[0]
            if str(smiles) == 'nan' or str(smiles) == '-666':
                continue

            if not self.useChirality:
                mol = Chem.MolFromSmiles(smiles)
                canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
            else:
                canonical_smiles = smiles

            if canonical_smiles in unique_smiles or len(canonical_smiles) > 120:
                continue
            if data is not None:
                if data[data['pert_id'] == pert_id].empty:
                    continue
                else:
                    feature = data[data['pert_id'] == pert_id].drop(['pert_id'], axis=1).values[0].tolist()
            else:
                feature = self.rep.get_representation(smiles=canonical_smiles, descriptor=self.descriptor,
                                                      useChirality=self.useChirality)

            unique_smiles.append(canonical_smiles)
            dn_genes = list(set(self.L[self.cell_line][pert_id]['dnGenes']))
            class_dict = dict.fromkeys(class_dict, 0)
            for gene in dn_genes:
                if gene in class_dict:
                    class_dict.update({gene: 1})

            labels = np.fromiter(class_dict.values(), dtype=int)
            X.append(feature)
            Y.append(labels)
            perts.append(pert_id)

        x = np.asarray(X)
        y = np.asarray(Y)

        x_columns = ['SMILES']
        if self.descriptor == 'ecfp':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_' + str(i + 1))
        elif self.descriptor == 'ecfp_autoencoder':
            for i in range(x.shape[1]-1):
                x_columns.append('ecfp_autoencoder_' + str(i + 1))
        elif self.descriptor == 'topological':
            for i in range(x.shape[1]-1):
                x_columns.append('topological_' + str(i + 1))
        elif self.descriptor == 'maccs':
            for i in range(x.shape[1]-1):
                x_columns.append('maccs_' + str(i + 1))
        elif self.descriptor == 'jtvae':
            for i in range(x.shape[1]-1):
                x_columns.append('jtvae_' + str(i + 1))
        elif self.descriptor == 'shed':
            for i in range(x.shape[1]-1):
                x_columns.append('shed_' + str(i + 1))
        elif self.descriptor == 'cats2d':
            for i in range(x.shape[1]-1):
                x_columns.append('cats2d_' + str(i + 1))

        x = pd.DataFrame(x, index=perts, columns=x_columns)
        y = pd.DataFrame(y, index=perts)
        folds = list(KFold(self.n_fold, shuffle=True, random_state=self.random_state).split(x))

        if self.random_genes:
            y_random = []
            for i in self.random_index_list:
                y_random.append(y.iloc[:, i:i + 1])
            df = y_random[0]
            for i in range(len(y_random) - 1):
                df = pd.concat([df, y_random[i + 1]], axis=1)
            y = df

        print('\nDone.')    
            
        return x, y, folds
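
All three get_* methods above return the feature frame x (first column SMILES, remaining columns descriptor values), the target frame y, and a list of KFold train/test index pairs. A hedged sketch of how those folds might be consumed, reusing the hypothetical getter instance from the sketch after Example No. 1; the regressor choice is purely illustrative.

from sklearn.ensemble import RandomForestRegressor

x, y, folds = getter.get_regression_data()
# Keep only the numeric descriptor columns; cast because x was built from a
# mixed string/numeric array.
features = x.drop(columns=['SMILES']).astype(float)

for k, (train_idx, test_idx) in enumerate(folds):
    model = RandomForestRegressor(random_state=0)
    model.fit(features.iloc[train_idx], y.iloc[train_idx])
    score = model.score(features.iloc[test_idx], y.iloc[test_idx])
    print('fold %d R^2: %.3f' % (k, score))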