Exemplo n.º 1
0
def main(train_file, val_file, kmer, model_dir):
    """Fit an auto-sklearn classifier on joint sequence+error features.

    Args:
        train_file: path to the training feature file read by ``ut.get_data_jm``.
        val_file: path to the validation feature file.
        kmer: k-mer size forwarded to the feature loader.
        model_dir: path where the fitted classifier is pickled.

    Returns:
        Predictions of the fitted classifier on the validation set.
    """
    import pickle

    input_train_seq, input_train_err, label = ut.get_data_jm(train_file, kmer)
    input_val_seq, input_val_err, vy = ut.get_data_jm(val_file, kmer)

    cls = autosklearn.classification.AutoSklearnClassifier()
    cls.fit([input_train_seq, input_train_err], label)

    # Persist the fitted model: model_dir was previously accepted but ignored,
    # and the function stopped in leftover pdb.set_trace() calls instead of
    # returning its predictions.
    with open(model_dir, 'wb') as fh:
        pickle.dump(cls, fh)

    predictions = cls.predict([input_val_seq, input_val_err])
    return predictions
Exemplo n.º 2
0
def train_jm(train_file,
             val_file,
             log_dir,
             model_dir,
             batch_size,
             kmer,
             epochs,
             checkpoint_path=None):
    """Train the joint sequence+error model under a MirroredStrategy.

    Args:
        train_file: training feature file read by ``ut.get_data_jm``.
        val_file: validation feature file.
        log_dir: base directory for TensorBoard logs (timestamp appended).
        model_dir: base directory for saved models (timestamp appended).
        batch_size: per-fit batch size.
        kmer: k-mer size; inputs are built as (None, kmer, 9) tensors.
        epochs: number of training epochs.
        checkpoint_path: optional weights checkpoint to resume from.

    Returns:
        None. The best checkpoint (by val_accuracy) and the final model are
        both written to the timestamped model directory.
    """
    dist = tf.distribute.MirroredStrategy()

    # Timestamp the output directories so repeated runs never collide.
    log_dir += datetime.datetime.now().strftime("%Y%m%d-%H%M%S_jm")
    model_dir += datetime.datetime.now().strftime("%Y%m%d-%H%M%S_jm_model")

    train_seq, train_err, train_labels = ut.get_data_jm(train_file, kmer)
    val_seq, val_err, val_labels = ut.get_data_jm(val_file, kmer)

    with dist.scope():
        model = JointNN()
        model.compile(loss='binary_crossentropy',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.00125),
                      metrics=['accuracy'])
        model.build([(None, kmer, 9), (None, kmer, 9)])
        print(model.summary())

        if checkpoint_path:
            model.load_weights(checkpoint_path)

        callbacks = [
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
            # Keep only the best epoch by validation accuracy.
            tf.keras.callbacks.ModelCheckpoint(filepath=model_dir,
                                               monitor='val_accuracy',
                                               mode='max',
                                               save_best_only=True,
                                               save_weights_only=False),
        ]

        model.fit([train_seq, train_err],
                  train_labels,
                  batch_size=batch_size,
                  epochs=epochs,
                  callbacks=callbacks,
                  validation_data=([val_seq, val_err], val_labels))

    # NOTE(review): this overwrites the best checkpoint written above with the
    # final-epoch model — confirm that is intentional.
    model.save(model_dir)

    return None
Exemplo n.º 3
0
def joint_read_calling(test_file, kmer, trained_model, model_type):
    """Run per-read modification calling with the joint model.

    Returns the raw predictions, the inferred labels, and the read ids that
    ``ut.get_data_jm`` associates with each prediction.
    """
    seqs, errs, labels, read_ids = ut.get_data_jm(test_file, kmer, get_id=True)
    pred, inferred = test_single_read(
        [seqs, errs], trained_model, model_type, kmer)
    return pred, inferred, read_ids
Exemplo n.º 4
0
def main(features, model_type, model, kmer, output, err_features=False):
    """Evaluate a trained model and, for the joint model, plot feature figures.

    Args:
        features: path to the feature file.
        model_type: one of 'seq', 'err' or 'joint'; selects the loader and
            input layout.
        model: trained model handed to ``acc_test_single``.
        kmer: k-mer size forwarded to the feature loaders.
        output: directory where plots are written.
        err_features: forwarded to ``ut.get_data_sequence`` on the 'seq' path.
            Previously this name was read without ever being defined, so
            model_type == 'seq' always raised NameError; it is now a
            backward-compatible keyword parameter.
    """
    if model_type == 'seq':
        data_seq, labels = ut.get_data_sequence(features, kmer, err_features)
        acc, pred, inferred = acc_test_single(data_seq, labels, model)

    elif model_type == 'err':
        data_err, labels = ut.get_data_errors(features, kmer)
        acc, pred, inferred = acc_test_single(data_err, labels, model)

    elif model_type == 'joint':
        data_seq, data_err, labels = ut.get_data_jm(features, kmer)

        acc, pred, inferred = acc_test_single([data_seq, data_err], labels,
                                              model)

        # Per-feature summaries split by prediction outcome.
        mean, median, std, rang, len_sig = get_ind_feat_seq(
            data_seq, labels, pred, inferred)
        quality, mismatch, deletion, insertion = get_ind_feat_err(
            data_err, labels, pred, inferred)

        # (feature values, output filename, x-axis limits) per plot.
        to_plot = [(mean, 'mean.pdf', (-3, 3)), (median, 'median.pdf', (-3, 3)),
                   (std, 'std.pdf', (-3, 3)), (rang, 'rang.pdf', (-0.5, 3)),
                   (len_sig, 'len_sig.pdf', (0, 50)),
                   (quality, 'quality.pdf', (0, 20))]

        # NOTE(review): plot_err is built but never consumed below — confirm
        # whether the error-rate features were meant to be plotted as well.
        plot_err = [(mismatch, 'mismatch.pdf'), (deletion, 'deletion.pdf'),
                    (insertion, 'insertion.pdf')]

        for feat, fname, xlim in to_plot:
            pl.feature_exploration_plots(feat, kmer, output, fname, xlim)
            pl.do_PCA(feat, kmer, output, fname)
Exemplo n.º 5
0
def process_chunk(features, tmp_folder, output, model):
    """Score one feature chunk restricted to positions 1M-2M and print accuracy.

    Reads the chunk from *tmp_folder*, writes a temporary h5 test file into
    *output*, evaluates *model* on it, and prints the accuracy together with
    the label counts of the chunk.
    """
    chunk = pd.read_csv(os.path.join(tmp_folder, features),
                        sep='\t',
                        names=names_all)
    chunk = chunk[(chunk['pos'] >= 1000000) & (chunk['pos'] <= 2000000)]

    # Random suffix so concurrently processed chunks write distinct temp files.
    suffix = round(random(), 10)

    if chunk.shape[0] > 0:
        ut.preprocess_combined(chunk, output, 'all_{}'.format(suffix), 'test')
        test_file = os.path.join(output, 'test_all_{}.h5'.format(suffix))

        seqs, errs, labels = ut.get_data_jm(test_file, 17)
        acc, pred, inferred = acc_test_single([seqs, errs], labels, model)

        print(acc, Counter(chunk['methyl_label']))
Exemplo n.º 6
0
def call_mods_user(model_type,
                   test_file,
                   trained_model,
                   kmer,
                   output,
                   err_features=False,
                   pos_based=False,
                   pred_type='min_max',
                   figures=False):
    """Call modifications read-by-read and optionally aggregate per position.

    Args:
        model_type: 'seq', 'err' or 'joint'; selects the feature loader.
        test_file: input features; a .tsv is preprocessed to test_all.h5 first.
        trained_model: model handed to ``test_single_read``.
        kmer: k-mer size forwarded to the feature loaders.
        output: directory for the per-position results table.
        err_features: forwarded to ``ut.get_data_sequence`` ('seq' path only).
        pos_based: if True, run the per-position analysis and report
            precision/recall/F1 at each prediction threshold.
        pred_type: aggregation rule passed to ``do_per_position_analysis``.
        figures: currently unused; kept for interface compatibility.
    """
    ## process text file input
    if test_file.rsplit('.')[-1] == 'tsv':
        print("processing tsv file, this might take a while...")
        test = pd.read_csv(test_file, sep='\t', names=pr.names_all)
        ut.preprocess_combined(test, os.path.dirname(test_file), '',
                               'test_all')
        test_file = os.path.join(os.path.dirname(test_file), 'test_all.h5')

    ## read-based calling
    if model_type == 'seq':
        data_seq, labels = ut.get_data_sequence(test_file, kmer, err_features)
        pred, inferred = test_single_read(data_seq, trained_model)

    elif model_type == 'err':
        data_err, labels = ut.get_data_errors(test_file, kmer)
        pred, inferred = test_single_read(data_err, trained_model)

    elif model_type == 'joint':
        data_seq, data_err, labels, data_id = ut.get_data_jm(test_file,
                                                             kmer,
                                                             get_id=True)
        pred, inferred = test_single_read([data_seq, data_err], trained_model,
                                          labels)

    ## position-based calling
    # TODO store position info in test file
    if pos_based:
        # data_id only exists on the 'joint' path (get_id=True above).
        if 'data_id' in locals():
            test = build_test_df(data_id)
            #TODO DELETE
            test['methyl_label'] = labels

        all_preds = do_per_position_analysis(test, pred, inferred, output,
                                             pred_type)

        all_preds.to_csv(os.path.join(output, 'human_chr1_table.tsv'),
                         sep='\t',
                         index=None)

        # Report binary precision/recall/F1 for every thresholded prediction
        # column. Previously each score was bound to a throwaway variable,
        # never printed, and the 'pred_posterior' result was silently
        # clobbered by 'pred_beta' (both assigned to `pp`); the function then
        # dropped into a leftover pdb.set_trace().
        pred_cols = ['pred_005', 'pred_01', 'pred_02', 'pred_03', 'pred_04',
                     'pred_min_max', 'pred_posterior', 'pred_beta']
        for col in pred_cols:
            scores = precision_recall_fscore_support(all_preds['meth_label'],
                                                     all_preds[col],
                                                     average='binary')
            print(col, scores)