# Example #1
def main():
    """Entry point: score BED intervals with a trained model.

    Loads the genome and a serialized model, streams the BED file through
    the model's data generator, selects the requested factor's column from
    the predictions, and writes the scores as the fourth column of a
    gzipped, tab-separated copy of the input BED file.
    """
    arg_parser = make_argument_parser()
    cli_args = arg_parser.parse_args()

    data_dir = cli_args.inputdir
    trained_model_dir = cli_args.modeldir
    factor = cli_args.factor
    intervals_file = cli_args.bed
    predictions_file = cli_args.outputfile

    print('Loading genome')
    genome = utils.load_genome()
    print('Loading model')
    model_tfs, model_bigwig_names, features, model = utils.load_model(
        trained_model_dir)
    # The model fixes the input sequence length; share it with utils via
    # the module-level global it reads.
    seq_len = model.input_shape[0][1]
    utils.L = seq_len
    assert factor in model_tfs
    assert 'bigwig' in features
    use_meta = 'meta' in features
    use_gencode = 'gencode' in features
    print('Loading test data')
    # The BED file is expected to be pre-sorted here.
    bigwig_names, meta_names, datagen_bed, nonblacklist_bools = utils.load_beddata(
        genome, intervals_file, use_meta, use_gencode, data_dir, True)
    assert bigwig_names == model_bigwig_names
    if use_meta:
        # The metadata feature names saved with the model must match the
        # names produced for this data set.
        saved_meta_file = trained_model_dir + '/meta.txt'
        assert os.path.isfile(saved_meta_file)
        saved_meta_names = np.loadtxt(saved_meta_file, dtype=str)
        # np.loadtxt yields a 0-d array for a one-line file; normalize to a list.
        if len(saved_meta_names.shape) == 0:
            saved_meta_names = [str(saved_meta_names)]
        else:
            saved_meta_names = list(saved_meta_names)
        assert meta_names == saved_meta_names
    print('Generating predictions')
    factor_index = model_tfs.index(factor)
    model_predicts = model.predict_generator(datagen_bed,
                                             val_samples=len(datagen_bed),
                                             pickle_safe=True)
    # Multi-task models emit one column per TF; pick the requested one.
    if len(model_tfs) > 1:
        factor_scores = model_predicts[:, factor_index]
    else:
        factor_scores = model_predicts
    # Blacklisted intervals were dropped by the generator; they score 0.
    final_scores = np.zeros(len(nonblacklist_bools))
    final_scores[nonblacklist_bools] = factor_scores
    print('Saving predictions')
    df = pandas.read_csv(intervals_file, sep='\t', header=None)
    df[3] = final_scores
    df.to_csv(predictions_file,
              sep='\t',
              compression='gzip',
              float_format='%.3e',
              header=False,
              index=False)
# Example #2
def main():
    """
    The main executable function.

    Parses command-line arguments, prepares (or clobbers) the output
    directory, loads the genome, a trained model and the BED intervals,
    verifies that the data's bigWig/metadata tracks match those the model
    was trained with, and hands everything to output_results.
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dir = args.inputdir
    model_dir = args.modeldir
    bed_file = args.bed
    chrom = args.chrom

    # Choosing --outputdirc (instead of --outputdir) grants permission to
    # overwrite an existing output directory.
    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py by T. Bailey
        os.makedirs(output_dir)
    except OSError as exc:
        # NOTE(review): OSErrors other than EEXIST are silently swallowed
        # here, so a failed directory creation only surfaces later when a
        # write into the missing directory fails -- confirm intended.
        if exc.errno == errno.EEXIST:
            if not clobber:
                print >> sys.stderr, (
                    'output directory (%s) already exists '
                    'but you specified not to clobber it') % output_dir
                sys.exit(1)
            else:
                print >> sys.stderr, ('output directory (%s) already exists '
                                      'so it will be clobbered') % output_dir

    print 'Loading genome'
    genome = utils.load_genome()
    print 'Loading model'
    model_tfs, model_bigwig_names, features, model = utils.load_model(
        model_dir)
    # The model fixes the input sequence length; share it with utils via
    # the module-level global it reads.
    L = model.input_shape[0][1]
    utils.L = L
    use_meta = 'meta' in features
    use_gencode = 'gencode' in features
    print 'Loading BED data'
    is_sorted = False
    bigwig_names, meta_names, datagen_bed, nonblacklist_bools = utils.load_beddata(
        genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom)
    # The data's bigWig tracks must match those the model was trained on.
    assert bigwig_names == model_bigwig_names
    if use_meta:
        # Metadata feature names saved with the model must match the data's.
        model_meta_file = model_dir + '/meta.txt'
        assert os.path.isfile(model_meta_file)
        model_meta_names = np.loadtxt(model_meta_file, dtype=str)
        # np.loadtxt yields a 0-d array for a one-line file; normalize to a list.
        if len(model_meta_names.shape) == 0:
            model_meta_names = [str(model_meta_names)]
        else:
            model_meta_names = list(model_meta_names)
        assert meta_names == model_meta_names
    output_results(bigwig_names, datagen_bed, model, output_dir)
# Example #3
def main():
    """
    The main executable function.

    Parses command-line arguments, prepares the output directory, loads the
    genome, multi-task ChIP labels and bigWig signal tracks, builds the
    train/validation/test feature generators, and runs test() on the test
    generator (falling back to the validation generator when the test set
    is empty).

    Returns the model predictions produced by test().
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dirs = args.inputdirs
    tf = args.factor
    valid_chroms = args.validchroms
    valid_input_dirs = args.validinputdirs
    test_chroms = args.testchroms
    epochs = args.epochs
    patience = args.patience
    learningrate = args.learningrate
    seed = args.seed
    utils.set_seed(seed)
    dropout_rate = args.dropout
    L = args.seqlen
    w = args.motifwidth
    # Sequence length and motif width are shared with utils via module globals.
    utils.L = L
    utils.w = w
    # Bug fix: use floor division -- under Python 3, w / 2 yields a float.
    utils.w2 = w // 2
    negatives = args.negatives
    assert negatives > 0
    meta = args.meta
    gencode = args.gencode
    motif = args.motif

    num_motifs = args.kernels
    num_recurrent = args.recurrent
    num_dense = args.dense

    features = ['bigwig']

    if tf:
        print('Single-task training:', tf)
        # Bug fix: singleTask was never assigned on this branch, so every
        # `if singleTask:` test below raised NameError for single-task runs.
        singleTask = True
    else:
        print('Multi-task training')
        singleTask = False
        # Cannot use any metadata features in multi-task mode
        assert not meta
        assert not gencode

    # Choosing --outputdirc (instead of --outputdir) grants permission to
    # overwrite an existing output directory.
    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py and train.py by T. Bailey & Daniel Quang
        os.makedirs(output_dir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                # Bug fix: route the message to stderr via the file= keyword;
                # previously the stream object itself was printed as the
                # first positional argument.
                print(('output directory (%s) already exists '
                       'but you specified not to clobber it') % output_dir,
                      file=sys.stderr)
                sys.exit(1)
            else:
                print(('output directory (%s) already exists '
                       'so it will be clobbered') % output_dir,
                      file=sys.stderr)

    print('Loading genome')
    genome = utils.load_genome()
    if valid_input_dirs:
        print('You specified at least one validation input directory')
        assert singleTask  # This option only works for single-task training
    print('Loading ChIP labels')
    if singleTask:
        num_tfs = 1
    else:
        assert len(input_dirs) == 1  # multi-task training only supports one cell line
        input_dir = input_dirs[0]
        tfs, positive_windows, y_positive, nonnegative_regions_bed = \
            utils.load_chip_multiTask(input_dir)
        num_tfs = len(tfs)
    print('Loading bigWig data')
    bigwig_names, bigwig_files_list = utils.load_bigwigs(input_dirs)
    num_bigwigs = len(bigwig_names)
    if valid_input_dirs:
        valid_bigwig_names, valid_bigwig_files_list = utils.load_bigwigs(valid_input_dirs)
        assert valid_bigwig_names == bigwig_names
    if not singleTask:
        bigwig_files = bigwig_files_list[0]
    if meta:  # did not use in scFAN
        print('Loading metadata features')
    else:  # meta option was not selected, pass empty metadata features to the functions
        meta_list = [[] for bigwig_files in bigwig_files_list]
        if valid_input_dirs:
            valid_meta_list = [[] for bigwig_files in valid_bigwig_files_list]

    print('Making features')
    if singleTask:  # did not use in scFAN
        # NOTE(review): the single-task path builds no data generators here,
        # so the code below would fail with NameError; it is unused in scFAN.
        print('single Task feature')
    else:
        datagen_train, datagen_valid, datagen_test, data_valid, data_test = \
            utils.make_features_multiTask(positive_windows, y_positive,
            nonnegative_regions_bed, bigwig_files, bigwig_names,
            genome, epochs, valid_chroms, test_chroms)
    print('Building model')
    # Fall back to the validation generator when there are no test windows.
    if len(data_test) == 0:
        model_predicts, model_tfs = test(output_dir, datagen_valid)
        data_test = data_valid
    else:
        model_predicts, model_tfs = test(output_dir, datagen_test)

    ### test and evaluate: uncomment this part for evaluation
    '''
    test_label_all = []
    for _, item in enumerate(data_test):
        test_label_all.append(item[-1])
    test_label_all = np.array(test_label_all)
    atac = []
    atacpr = []
    atacrc = []
    for index in range(len(model_tfs)):
        truth = test_label_all[:, index]
        pred = model_predicts[:, index]
        atac.append(roc_auc_score(truth, pred))
        atacpr.append(average_precision_score(truth, pred))
        try:
            precision, recall, thresholds = precision_recall_curve(truth, pred)
            atacrc.append(recall[np.where(precision == 0.5)[0][0]])
        except:
            atacrc.append(0)
    print("Average AUC ROC (ATAC)", np.mean(atac))
    print("Average AUPR ROC (ATAC)", np.mean(atacpr))
    print("Average Recall ROC (ATAC)", np.mean(atacrc))
    '''
    return model_predicts
# Example #4
def main():
    """
    The main executable function.

    Parses command-line arguments, prepares the output directory, loads the
    genome, multi-task ChIP labels and bigWig signal tracks, builds the
    scFAN model, records the training configuration (TFs, features, bigWig
    names, model JSON) to the output directory, and trains the model.
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dirs = args.inputdirs
    tf = args.factor
    valid_chroms = args.validchroms
    valid_input_dirs = args.validinputdirs
    test_chroms = args.testchroms
    epochs = args.epochs
    patience = args.patience
    learningrate = args.learningrate
    seed = args.seed
    utils.set_seed(seed)
    dropout_rate = args.dropout
    L = args.seqlen
    w = args.motifwidth
    # Sequence length and motif width are shared with utils via module globals.
    utils.L = L
    utils.w = w
    # Bug fix: use floor division -- under Python 3, w / 2 yields a float.
    utils.w2 = w // 2
    negatives = args.negatives
    assert negatives > 0
    meta = args.meta
    gencode = args.gencode
    motif = args.motif

    num_motifs = args.kernels
    num_recurrent = args.recurrent
    num_dense = args.dense

    features = ['bigwig']

    if tf:
        print('Single-task training:', tf)
        # Bug fix: singleTask was never assigned on this branch, so every
        # `if singleTask:` test below raised NameError for single-task runs.
        singleTask = True
    else:
        print('Multi-task training')
        singleTask = False
        # Cannot use any metadata features in multi-task mode
        assert not meta
        assert not gencode

    # Choosing --outputdirc (instead of --outputdir) grants permission to
    # overwrite an existing output directory.
    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py and train.py by T. Bailey & Daniel Quang
        os.makedirs(output_dir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                # Bug fix: route the message to stderr via the file= keyword;
                # previously the stream object itself was printed as the
                # first positional argument.
                print(('output directory (%s) already exists '
                       'but you specified not to clobber it') % output_dir,
                      file=sys.stderr)
                sys.exit(1)
            else:
                print(('output directory (%s) already exists '
                       'so it will be clobbered') % output_dir,
                      file=sys.stderr)

    print('Loading genome')
    genome = utils.load_genome()
    if valid_input_dirs:
        print('You specified at least one validation input directory')
        assert singleTask  # This option only works for single-task training
    print('Loading ChIP labels')
    if singleTask:
        num_tfs = 1
    else:
        assert len(input_dirs) == 1  # multi-task training only supports one cell line
        input_dir = input_dirs[0]
        tfs, positive_windows, y_positive, nonnegative_regions_bed = \
            utils.load_chip_multiTask(input_dir)
        num_tfs = len(tfs)
    print('Loading bigWig data')
    bigwig_names, bigwig_files_list = utils.load_bigwigs(input_dirs)
    num_bigwigs = len(bigwig_names)
    if valid_input_dirs:
        valid_bigwig_names, valid_bigwig_files_list = utils.load_bigwigs(valid_input_dirs)
        assert valid_bigwig_names == bigwig_names
    if not singleTask:
        bigwig_files = bigwig_files_list[0]
    if meta:  # did not use in scFAN
        print('Loading metadata features')
    else:  # meta option was not selected, pass empty metadata features to the functions
        meta_list = [[] for bigwig_files in bigwig_files_list]
        if valid_input_dirs:
            valid_meta_list = [[] for bigwig_files in valid_bigwig_files_list]

    print('Making features')
    if singleTask:  # did not use in scFAN
        print('single Task feature')
    else:
        datagen_train, datagen_valid, datagen_test, data_valid, data_test = \
            utils.make_features_multiTask(positive_windows, y_positive,
            nonnegative_regions_bed, bigwig_files, bigwig_names,
            genome, epochs, valid_chroms, test_chroms)
    print('Building model')
    if num_recurrent == 0:
        print('You specified 0 LSTM units. Omitting BLSTM layer')
    if num_recurrent < 0:
        print('You specified less than 0 LSTM units. Replacing BLSTM layer with global max-pooling layer')
    if meta or gencode:
        num_meta = 0
        if meta:
            # NOTE(review): meta_names is never loaded in this variant
            # ("did not use in scFAN"); reaching this line raises NameError.
            num_meta = len(meta_names)
        if gencode:
            num_meta += 6
        model = utils.make_meta_model(num_tfs, num_bigwigs, num_meta, num_motifs, num_recurrent, num_dense, dropout_rate)
    else:
        model = utils.scFANet(num_tfs, num_recurrent, num_bigwigs)

    if motif:
        assert singleTask  # This option only works with single-task training

    # Record the configuration so predict-time code can verify its inputs
    # against the trained model; `with` ensures each file is closed.
    with open(output_dir + '/chip.txt', 'w') as output_tf_file:
        if singleTask:  # did not use in scFAN
            output_tf_file.write("%s\n" % tf)
        else:
            for tf in tfs:
                output_tf_file.write("%s\n" % tf)
    with open(output_dir + '/feature.txt', 'w') as output_feature_file:
        for feature in features:
            output_feature_file.write("%s\n" % feature)
    with open(output_dir + '/bigwig.txt', 'w') as output_bw_file:
        for bw in bigwig_names:
            output_bw_file.write("%s\n" % bw)

    with open(output_dir + '/model.json', 'w') as output_json_file:
        output_json_file.write(model.to_json())
    train(datagen_train, datagen_valid, model, epochs, patience, learningrate, output_dir)
# Example #5
def main():
    """
    The main executable function
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dirs = args.inputdirs
    tf = args.factor
    valid_chroms = args.validchroms
    valid_input_dirs = args.validinputdirs
    test_chroms = args.testchroms
    epochs = args.epochs
    patience = args.patience
    learningrate = args.learningrate
    seed = args.seed
    utils.set_seed(seed)
    dropout_rate = args.dropout
    L = args.seqlen
    w = args.motifwidth
    utils.L = L
    utils.w = w
    utils.w2 = w / 2
    negatives = args.negatives
    assert negatives > 0
    meta = args.meta
    gencode = args.gencode
    motif = args.motif

    num_motifs = args.kernels
    num_recurrent = args.recurrent
    num_dense = args.dense

    features = ['bigwig']

    if tf:
        print 'Single-task training:', tf
        singleTask = True
        if meta:
            print 'Including metadata features'
            features.append('meta')
        if gencode:
            print 'Including genome annotations'
            features.append('gencode')
    else:
        print 'Multi-task training'
        singleTask = False
        #Cannot use any metadata features
        assert not meta
        assert not gencode

    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py by T. Bailey
        os.makedirs(output_dir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                print >> sys.stderr, (
                    'output directory (%s) already exists '
                    'but you specified not to clobber it') % output_dir
                sys.exit(1)
            else:
                print >> sys.stderr, ('output directory (%s) already exists '
                                      'so it will be clobbered') % output_dir

    print 'Loading genome'
    genome = utils.load_genome()
    if valid_input_dirs:
        print 'You specified at least one validation input directory'
        assert singleTask  # This option only works for single-task training
    print 'Loading ChIP labels'
    if singleTask:
        chip_bed_list, nonnegative_regions_bed_list = \
            utils.load_chip_singleTask(input_dirs, tf)
        if valid_input_dirs:
            valid_chip_bed_list, valid_nonnegative_regions_bed_list = \
                utils.load_chip_singleTask(valid_input_dirs, tf)
        num_tfs = 1
    else:
        assert len(
            input_dirs) == 1  # multi-task training only supports one cell line
        input_dir = input_dirs[0]
        tfs, positive_windows, y_positive, nonnegative_regions_bed = \
            utils.load_chip_multiTask(input_dir)
        num_tfs = len(tfs)
    print 'Loading bigWig data'
    bigwig_names, bigwig_files_list = utils.load_bigwigs(input_dirs)
    num_bigwigs = len(bigwig_names)
    if valid_input_dirs:
        valid_bigwig_names, valid_bigwig_files_list = utils.load_bigwigs(
            valid_input_dirs)
        assert valid_bigwig_names == bigwig_names
    if not singleTask:
        bigwig_files = bigwig_files_list[0]
    if meta:
        print 'Loading metadata features'
        meta_names, meta_list = utils.load_meta(input_dirs)
        if valid_input_dirs:
            valid_meta_names, valid_meta_list = utils.load_load(
                valid_input_dirs)
            assert valid_meta_names == meta_names
    else:  # meta option was not selected, pass empty metadata features to the functions
        meta_list = [[] for bigwig_files in bigwig_files_list]
        if valid_input_dirs:
            valid_meta_list = [[] for bigwig_files in valid_bigwig_files_list]

    print 'Making features'
    if singleTask:
        if not valid_input_dirs:  #validation directories not used, must pass placeholder values
            valid_chip_bed_list = None
            valid_nonnegative_regions_bed_list = None
            valid_bigwig_files_list = None
            valid_meta_list = None
        datagen_train, datagen_valid = \
            utils.make_features_singleTask(chip_bed_list,
            nonnegative_regions_bed_list, bigwig_files_list, bigwig_names,
            meta_list, gencode, genome, epochs, negatives, valid_chroms, test_chroms,
            valid_chip_bed_list, valid_nonnegative_regions_bed_list,
            valid_bigwig_files_list, valid_meta_list)
    else:
        datagen_train, datagen_valid = \
            utils.make_features_multiTask(positive_windows, y_positive,
            nonnegative_regions_bed, bigwig_files, bigwig_names,
            genome, epochs, valid_chroms, test_chroms)
    print 'Building model'
    if num_recurrent == 0:
        print 'You specified 0 LSTM units. Omitting BLSTM layer'
    if num_recurrent < 0:
        print 'You specified less than 0 LSTM units. Replacing BLSTM layer with global max-pooling layer'
    if meta or gencode:
        num_meta = 0
        if meta:
            num_meta = len(meta_names)
        if gencode:
            num_meta += 6
        model = utils.make_meta_model(num_tfs, num_bigwigs, num_meta,
                                      num_motifs, num_recurrent, num_dense,
                                      dropout_rate)
    else:
        model = utils.make_model(num_tfs, num_bigwigs, num_motifs,
                                 num_recurrent, num_dense, dropout_rate)

    if motif:
        assert singleTask  # This option only works with single-task training
        motifs_db = utils.load_motif_db('resources/HOCOMOCOv9.meme')
        if tf in motifs_db:
            print 'Injecting canonical motif'
            pwm = motifs_db[tf]
            pwm += 0.001
            pwm = pwm / pwm.sum(axis=1)[:, np.newaxis]
            pwm = np.log2(pwm / 0.25)
            utils.inject_pwm(model, pwm)
    output_tf_file = open(output_dir + '/chip.txt', 'w')
    if singleTask:
        output_tf_file.write("%s\n" % tf)
    else:
        for tf in tfs:
            output_tf_file.write("%s\n" % tf)
    output_tf_file.close()
    output_feature_file = open(output_dir + '/feature.txt', 'w')
    for feature in features:
        output_feature_file.write("%s\n" % feature)
    output_feature_file.close()
    output_bw_file = open(output_dir + '/bigwig.txt', 'w')
    for bw in bigwig_names:
        output_bw_file.write("%s\n" % bw)
    output_bw_file.close()
    if meta:
        output_meta_file = open(output_dir + '/meta.txt', 'w')
        for meta_name in meta_names:
            output_meta_file.write("%s\n" % meta_name)
        output_meta_file.close()
    model_json = model.to_json()
    output_json_file = open(output_dir + '/model.json', 'w')
    output_json_file.write(model_json)
    output_json_file.close()
    train(datagen_train, datagen_valid, model, epochs, patience, learningrate,
          output_dir)