Exemplo n.º 1
0
def main():
    """Plot saturated mutagenesis heat maps for a Basset model.

    Command line: ``[options] <model_file> <input_file>``.

    The input file is first parsed as FASTA. If that raises IOError or
    IndexError it is re-opened as an HDF5 file holding ``test_in`` and
    ``test_out`` datasets (``test_headers``/``target_labels`` are optional).
    Unless pre-computed predictions are supplied with ``-d``,
    ``basset_sat_predict.lua`` is run to produce ``<out_dir>/model_out.h5``.

    For every sequence and every target selected with ``-t``, a sequence
    logo (EPS + PNG) and a combined logo / loss-gain / heat map PDF are
    written to the output directory. Per-position loss and gain values are
    written to ``<out_dir>/table.txt``.

    Exits through ``parser.error`` when the arguments are wrong, the input
    cannot be parsed, or the prediction script fails.
    """
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=0, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-p', dest='print_table_all', default=False, action='store_true', help='Print all targets to the table [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default:%default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage and exits with status 2
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)')
    model_file, input_file = args

    if not os.path.isdir(options.out_dir):
        # makedirs so that a nested output directory also works
        os.makedirs(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers; sequence text seen before any '>'
        # header raises IndexError, which sends us to the HDF5 branch
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == '>':
                    seq_headers.append(line[1:].rstrip())
                    seqs.append('')
                else:
                    seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names from the table's first line
            with open(options.input_activity_file) as activity_in:
                target_labels = activity_in.readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # headers and sequences are plain lists, so they cannot be
            # indexed with an index array; subset them element-wise and
            # keep them aligned with the sampled one-hot rows
            seq_headers = [seq_headers[i] for i in sample_i]
            seqs = [seqs[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, 'r') as hdf5_in:
                seqs_1hot = np.array(hdf5_in['test_in'])
                targets = np.array(hdf5_in['test_out'])
                try:
                    seq_headers = np.array(hdf5_in['test_headers'])
                    target_labels = np.array(hdf5_in['target_labels'])
                except KeyError:
                    # headers/labels are optional; both are dropped if
                    # either dataset is missing
                    seq_headers = None
                    target_labels = None

            # sample
            if options.sample:
                sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                with h5py.File(model_input_hdf5, 'w') as h5f:
                    h5f.create_dataset('test_in', data=seqs_1hot)

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        # argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through unchanged
        torch_cmd = ['basset_sat_predict.lua', '-center_nt', str(options.center_nt), model_file, model_input_hdf5, options.model_hdf5_file]
        print(' '.join(torch_cmd))
        if subprocess.call(torch_cmd) != 0:
            # do not fall through and plot a stale model_out.h5
            parser.error('basset_sat_predict.lua failed; no predictions to plot.')

    #################################################################
    # load modification predictions
    #################################################################
    with h5py.File(options.model_hdf5_file, 'r') as hdf5_in:
        seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])

    # trim seqs to match seq_mod_preds length (axis 2 is indexed by position)
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) // 2
        seqs = [s[delta_start:delta_start + delta_len] for s in seqs]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = range(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    # decide which cells go into the table
    print_targets = plot_targets
    if options.print_table_all:
        print_targets = range(seq_mod_preds.shape[3])

    #################################################################
    # plot
    #################################################################
    rdbu = sns.color_palette("RdBu_r", 10)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for si in range(seq_mod_preds.shape[0]):
            try:
                header = seq_headers[si]
            except TypeError:
                # seq_headers is None when the HDF5 input carried no headers
                header = 'seq%d' % si
            seq = seqs[si]

            # plot some descriptive heatmaps for each individual cell type
            for ci in plot_targets:
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                # compute matrices
                norm_matrix = seq_mod_preds_cell - real_pred_cell
                min_scores = seq_mod_preds_cell.min(axis=0)
                max_scores = seq_mod_preds_cell.max(axis=0)
                # row 0: largest decrease per position, row 1: largest increase
                minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

                # prepare figure
                sns.set(style='white', font_scale=0.5)
                # NOTE(review): axes_style() only returns style parameters
                # and its result is discarded here; set_style() may have
                # been intended - confirm before changing the plots.
                sns.axes_style({'axes.linewidth': 1})
                heat_cols = 400
                sad_start = 1
                sad_end = 323
                logo_start = 0
                logo_end = 324
                plt.figure(figsize=(20, 3))
                ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
                ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
                ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

                # print a WebLogo of the sequence
                vlim = max(options.min_limit, abs(minmax_matrix).max())
                if options.gain_height:
                    seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
                else:
                    seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
                logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
                seq_logo(seq, seq_heights, logo_eps)

                # add to figure
                logo_png = '%s.png' % logo_eps[:-4]
                subprocess.call(['convert', '-density', '300', logo_eps, logo_png])
                logo = Image.open(logo_png)
                ax_logo.imshow(logo)
                ax_logo.set_axis_off()

                # plot loss and gain SAD scores
                ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
                ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
                ax_sad.set_xlim(0, minmax_matrix.shape[1])
                ax_sad.legend()
                for axis in ['top', 'bottom', 'left', 'right']:
                    ax_sad.spines[axis].set_linewidth(0.5)

                # plot real-normalized scores
                vlim = max(options.min_limit, abs(norm_matrix).max())
                sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
                ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')

                # save final figure
                plt.tight_layout()
                plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci), dpi=300)
                plt.close()

            #################################################################
            # print table of nt variability for each cell
            #################################################################
            for ci in print_targets:
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
                gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

                for pos in range(seq_mod_preds_cell.shape[1]):
                    # positions are reported relative to the untrimmed sequence
                    cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                    table_out.write('\t'.join([str(c) for c in cols]) + '\n')
Exemplo n.º 2
0
def main():
    """Plot saturated mutagenesis heat maps for a Basset model.

    Command line: ``[options] <model_file> <input_file>``.

    The input file is first parsed as FASTA. If that raises IOError or
    IndexError it is re-opened as an HDF5 file holding ``test_in`` and
    ``test_out`` datasets (``test_headers``/``target_labels`` are optional).
    Unless pre-computed predictions are supplied with ``-d``,
    ``basset_sat_predict.lua`` is run to produce ``<out_dir>/model_out.h5``.

    For every sequence and every target selected with ``-t``, a sequence
    logo (EPS + PNG) and a combined logo / loss-gain / heat map PDF are
    written to the output directory. Per-position loss and gain values for
    all targets are written to ``<out_dir>/table.txt``.

    Exits through ``parser.error`` when the arguments are wrong, the input
    cannot be parsed, or the prediction script fails.
    """
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option(
        '-g',
        dest='gain_height',
        default=False,
        action='store_true',
        help=
        'Nucleotide heights determined by the max of loss and gain [Default: %default]'
    )
    parser.add_option('-m',
                      dest='min_limit',
                      default=0.1,
                      type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option(
        '-n',
        dest='center_nt',
        default=200,
        type='int',
        help='Center nt to mutate and plot in the heat map [Default: %default]'
    )
    parser.add_option('-o',
                      dest='out_dir',
                      default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default:%default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help=
        'Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage and exits with status 2
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)'
        )
    model_file, input_file = args

    if not os.path.isdir(options.out_dir):
        # makedirs so that a nested output directory also works
        os.makedirs(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers; sequence text seen before any '>'
        # header raises IndexError, which sends us to the HDF5 branch
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == '>':
                    seq_headers.append(line[1:].rstrip())
                    seqs.append('')
                else:
                    seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names from the table's first line
            with open(options.input_activity_file) as activity_in:
                target_labels = activity_in.readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(range(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # headers and sequences are plain lists, so they cannot be
            # indexed with an index array; subset them element-wise and
            # keep them aligned with the sampled one-hot rows
            seq_headers = [seq_headers[i] for i in sample_i]
            seqs = [seqs[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, 'r') as hdf5_in:
                seqs_1hot = np.array(hdf5_in['test_in'])
                targets = np.array(hdf5_in['test_out'])
                try:
                    seq_headers = np.array(hdf5_in['test_headers'])
                    target_labels = np.array(hdf5_in['target_labels'])
                except KeyError:
                    # headers/labels are optional; both are dropped if
                    # either dataset is missing
                    seq_headers = None
                    target_labels = None

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(range(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                with h5py.File(model_input_hdf5, 'w') as h5f:
                    h5f.create_dataset('test_in', data=seqs_1hot)

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        # argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through unchanged
        torch_cmd = [
            'basset_sat_predict.lua', '-center_nt',
            str(options.center_nt), model_file, model_input_hdf5,
            options.model_hdf5_file
        ]
        print(' '.join(torch_cmd))
        if subprocess.call(torch_cmd) != 0:
            # do not fall through and plot a stale model_out.h5
            parser.error(
                'basset_sat_predict.lua failed; no predictions to plot.')

    #################################################################
    # load modification predictions
    #################################################################
    with h5py.File(options.model_hdf5_file, 'r') as hdf5_in:
        seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])

    # trim seqs to match seq_mod_preds length (axis 2 is indexed by position)
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) // 2
        seqs = [s[delta_start:delta_start + delta_len] for s in seqs]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = range(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    rdbu = sns.color_palette("RdBu_r", 10)

    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for si in range(seq_mod_preds.shape[0]):
            try:
                header = seq_headers[si]
            except TypeError:
                # seq_headers is None when the HDF5 input carried no headers
                header = 'seq%d' % si
            seq = seqs[si]

            # plot some descriptive heatmaps for each individual cell type
            for ci in plot_targets:
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                # compute matrices
                norm_matrix = seq_mod_preds_cell - real_pred_cell
                min_scores = seq_mod_preds_cell.min(axis=0)
                max_scores = seq_mod_preds_cell.max(axis=0)
                # row 0: largest decrease per position, row 1: largest increase
                minmax_matrix = np.vstack(
                    [min_scores - real_pred_cell, max_scores - real_pred_cell])

                # prepare figure
                sns.set(style='white', font_scale=0.5)
                # NOTE(review): axes_style() only returns style parameters
                # and its result is discarded here; set_style() may have
                # been intended - confirm before changing the plots.
                sns.axes_style({'axes.linewidth': 1})
                heat_cols = 400
                sad_start = 1
                sad_end = 323
                logo_start = 0
                logo_end = 324
                plt.figure(figsize=(20, 3))
                ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start),
                                           colspan=(logo_end - logo_start))
                ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start),
                                          colspan=(sad_end - sad_start))
                ax_heat = plt.subplot2grid((3, heat_cols), (2, 0),
                                           colspan=heat_cols)

                # print a WebLogo of the sequence
                vlim = max(options.min_limit, abs(minmax_matrix).max())
                if options.gain_height:
                    seq_heights = 0.25 + 1.75 / vlim * (
                        abs(minmax_matrix).max(axis=0))
                else:
                    seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
                logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir,
                                                  header_filename(header), ci)
                seq_logo(seq, seq_heights, logo_eps)

                # add to figure
                logo_png = '%s.png' % logo_eps[:-4]
                subprocess.call(
                    ['convert', '-density', '300', logo_eps, logo_png])
                logo = Image.open(logo_png)
                ax_logo.imshow(logo)
                ax_logo.set_axis_off()

                # plot loss and gain SAD scores
                ax_sad.plot(-minmax_matrix[0],
                            c=rdbu[0],
                            label='loss',
                            linewidth=1)
                ax_sad.plot(minmax_matrix[1],
                            c=rdbu[-1],
                            label='gain',
                            linewidth=1)
                ax_sad.set_xlim(0, minmax_matrix.shape[1])
                ax_sad.legend()
                for axis in ['top', 'bottom', 'left', 'right']:
                    ax_sad.spines[axis].set_linewidth(0.5)

                # plot real-normalized scores
                vlim = max(options.min_limit, abs(norm_matrix).max())
                sns.heatmap(norm_matrix,
                            linewidths=0,
                            cmap='RdBu_r',
                            vmin=-vlim,
                            vmax=vlim,
                            xticklabels=False,
                            ax=ax_heat)
                ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')

                # save final figure
                plt.tight_layout()
                plt.savefig('%s/%s_c%d_heat.pdf' %
                            (options.out_dir, header.replace(':', '_'), ci),
                            dpi=300)
                plt.close()

            #################################################################
            # print table of nt variability for each cell
            #################################################################
            for ci in range(seq_mod_preds.shape[3]):
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
                gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

                for pos in range(seq_mod_preds_cell.shape[1]):
                    # positions are reported relative to the untrimmed sequence
                    cols = [
                        header, delta_start + pos, ci, loss_matrix[pos],
                        gain_matrix[pos]
                    ]
                    table_out.write('\t'.join([str(c) for c in cols]) + '\n')
Exemplo n.º 3
0
def main():
    """Plot saturated mutagenesis heat maps for a Basset model.

    Command line: ``[options] <model_file> <input_file>``.

    The input file is first parsed as FASTA. If that raises IOError or
    IndexError it is re-opened as an HDF5 file holding ``test_in`` and
    ``test_out`` datasets (``test_headers``/``target_labels`` are optional).
    Unless pre-computed predictions are supplied with ``-d``,
    ``basset_sat_predict.lua`` is run to produce ``<out_dir>/model_out.h5``.

    For every sequence and every target selected with ``-t``, a sequence
    logo (EPS + PNG) and a combined logo / loss-gain / heat map PDF are
    written to the output directory. Per-position loss and gain values for
    all targets are written to ``<out_dir>/table.txt``.

    Argument and input-parsing problems exit through ``parser.error``;
    failures of the external commands are reported through ``message``.
    """
    usage = "usage: %prog [options] <model_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g",
        dest="gain_height",
        default=False,
        action="store_true",
        help="Nucleotide heights determined by the max of loss and gain [Default: %default]",
    )
    parser.add_option(
        "-m", dest="min_limit", default=0.1, type="float", help="Minimum heatmap limit [Default: %default]"
    )
    parser.add_option(
        "-n",
        dest="center_nt",
        default=200,
        type="int",
        help="Center nt to mutate and plot in the heat map [Default: %default]",
    )
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-s", dest="sample", default=None, type="int", help="Sample sequences from the test set [Default:%default]"
    )
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage and exits with status 2
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)")
    model_file, input_file = args

    if not os.path.isdir(options.out_dir):
        # makedirs so that a nested output directory also works
        os.makedirs(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers; sequence text seen before any '>'
        # header raises IndexError, which sends us to the HDF5 branch
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == ">":
                    seq_headers.append(line[1:].rstrip())
                    seqs.append("")
                else:
                    seqs[-1] += line.rstrip()

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

            # read in target names from the table's first line
            with open(options.input_activity_file) as activity_in:
                target_labels = activity_in.readline().strip().split("\t")

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # headers and sequences are plain lists, so they cannot be
            # indexed with an index array; subset them element-wise and
            # keep them aligned with the sampled one-hot rows
            seq_headers = [seq_headers[i] for i in sample_i]
            seqs = [seqs[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, "w") as h5f:
            h5f.create_dataset("test_in", data=seqs_1hot)

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, "r") as hdf5_in:
                seqs_1hot = np.array(hdf5_in["test_in"])
                targets = np.array(hdf5_in["test_out"])
                try:
                    seq_headers = np.array(hdf5_in["test_headers"])
                    target_labels = np.array(hdf5_in["target_labels"])
                except KeyError:
                    # headers/labels are optional; both are dropped if
                    # either dataset is missing
                    seq_headers = None
                    target_labels = None

            # sample
            if options.sample:
                sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = "%s/model_in.h5" % options.out_dir
                with h5py.File(model_input_hdf5, "w") as h5f:
                    h5f.create_dataset("test_in", data=seqs_1hot)

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        # argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through unchanged
        torch_cmd = [
            "basset_sat_predict.lua",
            "-center_nt",
            str(options.center_nt),
            model_file,
            model_input_hdf5,
            options.model_hdf5_file,
        ]
        if subprocess.call(torch_cmd):
            message("Error running basset_sat_predict.lua", "error")

    #################################################################
    # load modification predictions
    #################################################################
    with h5py.File(options.model_hdf5_file, "r") as hdf5_in:
        seq_mod_preds = np.array(hdf5_in["seq_mod_preds"])

    # trim seqs to match seq_mod_preds length (axis 2 is indexed by position)
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) // 2
        seqs = [s[delta_start : delta_start + delta_len] for s in seqs]

    # decide which cells to plot
    if options.targets == "-1":
        plot_targets = range(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(",")]

    #################################################################
    # plot
    #################################################################
    rdbu = sns.color_palette("RdBu_r", 10)

    with open("%s/table.txt" % options.out_dir, "w") as table_out:
        for si in range(seq_mod_preds.shape[0]):
            try:
                header = seq_headers[si]
            except TypeError:
                # seq_headers is None when the HDF5 input carried no headers
                header = "seq%d" % si
            seq = seqs[si]

            # plot some descriptive heatmaps for each individual cell type
            for ci in plot_targets:
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                # compute matrices
                norm_matrix = seq_mod_preds_cell - real_pred_cell
                min_scores = seq_mod_preds_cell.min(axis=0)
                max_scores = seq_mod_preds_cell.max(axis=0)
                # row 0: largest decrease per position, row 1: largest increase
                minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

                # prepare figure
                sns.set(style="white", font_scale=0.5)
                # NOTE(review): axes_style() only returns style parameters
                # and its result is discarded here; set_style() may have
                # been intended - confirm before changing the plots.
                sns.axes_style({"axes.linewidth": 1})
                heat_cols = 400
                sad_start = 1
                sad_end = 323
                logo_start = 0
                logo_end = 324
                plt.figure(figsize=(20, 3))
                ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
                ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
                ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

                # print a WebLogo of the sequence
                vlim = max(options.min_limit, abs(minmax_matrix).max())
                if options.gain_height:
                    seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
                else:
                    seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
                logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci)
                seq_logo(seq, seq_heights, logo_eps)

                # add to figure
                logo_png = "%s.png" % logo_eps[:-4]
                logo_cmd = ["convert", "-density", "300", logo_eps, logo_png]
                if subprocess.call(logo_cmd):
                    message("Error running convert", "error")
                logo = Image.open(logo_png)
                ax_logo.imshow(logo)
                ax_logo.set_axis_off()

                # plot loss and gain SAD scores
                ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1)
                ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1)
                ax_sad.set_xlim(0, minmax_matrix.shape[1])
                ax_sad.legend()
                for axis in ["top", "bottom", "left", "right"]:
                    ax_sad.spines[axis].set_linewidth(0.5)

                # plot real-normalized scores
                vlim = max(options.min_limit, abs(norm_matrix).max())
                sns.heatmap(
                    norm_matrix, linewidths=0, cmap="RdBu_r", vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat
                )
                ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal")

                # save final figure
                plt.tight_layout()
                plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300)
                plt.close()

            #################################################################
            # print table of nt variability for each cell
            #################################################################
            for ci in range(seq_mod_preds.shape[3]):
                seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
                real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

                loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
                gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

                for pos in range(seq_mod_preds_cell.shape[1]):
                    # positions are reported relative to the untrimmed sequence
                    cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                    table_out.write("\t".join([str(c) for c in cols]) + "\n")
Exemplo n.º 4
0
def main():
    """Run Basset predictions through the ``basset_predict.lua`` Torch script.

    Command line: ``[options] <model_file> <input_file> <output_file>``.

    ``input_file`` is first loaded as FASTA with ``dna_io.load_sequences``.
    The resulting matrix is reshaped to ``(num_seqs, 4, 1, width // 4)`` and
    written as the ``test_in`` dataset of a temporary HDF5 file. If that
    attempt raises ``IOError`` or ``IndexError``, the input is instead passed
    to Torch unchanged, on the assumption that it already is an HDF5 file.

    Any temporary HDF5 file created here is removed once the Torch command
    has returned, including when the command itself fails to launch.
    """
    usage = 'usage: %prog [options] <model_file> <input_file> <output_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', '--batch', dest='batch', default=128, type='int', help='Batch size [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true', help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true', help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option('-n', '--norm', dest='norm', default=False, action='store_true', help='Normalize all targets to a level plane [Default: %default]')
    parser.add_option('-r', '--rc', dest='rc', default=False, action='store_true', help='Average forward and reverse complement [Default: %default]')
    parser.add_option('-s', '--scores', dest='scores', default=False, action='store_true', help='Print pre-sigmoid scores instead of probability predictions [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the message and exits the process
        parser.error('Must provide Basset model file, input sequences (as a FASTA file or test data in an HDF file, and output table file')
    model_file, input_file, out_file = args

    #################################################################
    # parse input file
    #################################################################
    temp_hdf5_path = None
    try:
        # input_file is FASTA

        # load sequences
        seqs_1hot = dna_io.load_sequences(input_file, permute=False)

        # reshape sequences for torch; floor division keeps the dimension an
        # int under both Python 2 and Python 3
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        model_input_fd, temp_hdf5_path = tempfile.mkstemp()
        # mkstemp hands back an already-open descriptor; h5py reopens the
        # file by path, so close the descriptor instead of leaking it
        os.close(model_input_fd)
        with h5py.File(temp_hdf5_path, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)
        model_input_hdf5 = temp_hdf5_path

    except (IOError, IndexError):
        # input_file is HDF5
        if temp_hdf5_path is not None:
            # the temp file was created but could not be filled; drop it
            os.remove(temp_hdf5_path)
            temp_hdf5_path = None
        model_input_hdf5 = input_file

    #################################################################
    # Torch predict modifications
    #################################################################
    opts = ['-batch %d' % options.batch]
    if options.cudnn:
        opts.append('-cudnn')
    elif options.cuda:
        opts.append('-cuda')
    if options.norm:
        opts.append('-norm')
    if options.rc:
        opts.append('-rc')
    if options.scores:
        opts.append('-scores')
    opts_str = ' '.join(opts)

    try:
        # NOTE(review): the file names are interpolated unquoted into a shell
        # command line; paths containing spaces or shell metacharacters will
        # be misinterpreted.
        torch_cmd = 'basset_predict.lua %s %s %s %s' % (opts_str, model_file, model_input_hdf5, out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)
    finally:
        if temp_hdf5_path is not None:
            os.remove(temp_hdf5_path)
Exemplo n.º 5
0
def main():
    """Search sequences for those whose predictions best match an activity profile.

    Command line: ``[options] <model_file> <profile_file> <input_file>``.

    Steps, in order:

    1. Read ``input_file`` as FASTA (optionally with the ``-a`` activity
       table) and write the one-hot sequences to ``<out_dir>/model_in.h5``.
       If that raises ``IOError``, ``IndexError`` or ``UnicodeDecodeError``,
       read it as HDF5 instead (datasets ``test_in``, ``test_out``,
       ``test_headers``).
    2. Unless ``-d`` names a pre-computed table, run ``basset_predict.lua``
       to produce ``<out_dir>/preds.txt``, then load the predictions.
    3. Load the profile with ``load_profile`` and, with ``-r``, normalize
       each target's predictions via ``znorm``.
    4. Write ``heat_clust.pdf``, ``dim_red.pdf``, ``table.txt`` and
       ``heat_rank.pdf`` to the output directory.
    5. For the best-matching sequences (at most ``-n``), write a FASTA file,
       run ``basset_sat.py`` and save a per-sequence heat map.

    When no experimental targets are available (FASTA input without ``-a``),
    the target-distance column of ``table.txt`` is ``nan`` and the
    per-sequence heat maps omit the "Experiment" column.
    """
    usage = 'usage: %prog [options] <model_file> <profile_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '--all',
        dest='all_data',
        default=False,
        action='store_true',
        help=
        'Search all training/valid/test sequences. By default we search only the test set. [Default: %default]'
    )
    parser.add_option('--cuda',
                      dest='cuda',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn',
                      dest='cudnn',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option(
        '-d',
        dest='model_out_file',
        default=None,
        help='Pre-computed model predictions output table [Default: %default]')
    parser.add_option(
        '-e',
        dest='norm_even',
        default=False,
        action='store_true',
        help=
        'Normalize the weights for the positive and negative datasets to be even [Default: %default]'
    )
    parser.add_option('-f',
                      dest='font_heat',
                      default=6,
                      type='int',
                      help='Heat map axis font size [Default: %default]')
    parser.add_option('-n',
                      dest='num_dissect',
                      default=10,
                      type='int',
                      help='Dissect the top n hits [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='profile',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-r',
        dest='norm_preds',
        default=False,
        action='store_true',
        help='Normalize predictions to have equal frequency [Default: %default]'
    )
    parser.add_option(
        '-z',
        dest='weight_zero',
        default=1.0,
        type='float',
        help=
        'Adjust the weights for the zero samples by this value [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the message and exits the process
        parser.error(
            'Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)'
        )
    model_file, profile_file, input_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == '>':
                    seq_headers.append(line[1:].rstrip())
                    seqs.append('')
                else:
                    # raises IndexError when sequence text precedes any
                    # header, which routes us to the HDF5 branch below
                    seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch; floor division keeps the last
        # dimension an int (true division would hand reshape a float)
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, 'r') as hdf5_in:
                seqs_1hot = np.array(hdf5_in['test_in'])
                targets = np.array(hdf5_in['test_out'])
                seq_headers = np.array(
                    [h.decode('UTF-8') for h in hdf5_in['test_headers']])

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    if options.model_out_file is None:
        options.model_out_file = '%s/preds.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -mc_n 10 -rc %s %s %s %s' % (
            gpgpu_str, model_file, model_input_hdf5, options.model_out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_seqs = seqs_preds.shape[0]
    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save('%s/pred_means' % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask],
                              weights=profile_weights[profile_mask])

        # normalize
        for ti in range(num_targets):
            ratio_ti = pred_means[ti] / aim_mean
            # 0.25 rather than 1 / 4: the latter is 0 under integer division
            if profile_mask[ti] and (ratio_ti < 0.25 or ratio_ti > 4):
                print(
                    'WARNING: target %d with mean %.4f differs 4-fold from the median %.3f'
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr)
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti],
                                      aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map of the (up to) 1500 highest-variance sequences
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_clust.pdf' % options.out_dir)
    plt.close()

    # dimension reduction: 2-component PCA over targets
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c='black', s=5)
    target_labels_prof_concise = [
        tl.split(':')[-1] for tl in target_labels[profile_mask]
    ]
    for label, x, y, activity in zip(target_labels_prof_concise, spp_dr[:, 0],
                                     spp_dr[:, 1],
                                     activity_profile[profile_mask]):
        plt.annotate(label,
                     xy=(x, y),
                     size=10,
                     color=sns.color_palette('deep')[int(activity)])
    plt.savefig('%s/dim_red.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances (weighted log loss against the profile)
    seqs_pdists = np.array([
        log_loss(activity_profile[profile_mask],
                 seqs_preds[si, profile_mask],
                 sample_weight=profile_weights[profile_mask])
        for si in range(num_seqs)
    ])

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances (weighted absolute difference); undefined
    # when FASTA input was given without an activity table
    if targets is None:
        seqs_tdists = np.full(num_seqs, np.nan)
    else:
        seqs_tdists = []
        for si in range(num_seqs):
            tdists = np.absolute(targets[si, profile_mask] -
                                 activity_profile[profile_mask])
            tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
            seqs_tdists.append(tdists_weight.sum())
        seqs_tdists = np.array(seqs_tdists)

    # print as table
    with open('%s/table.txt' % options.out_dir, 'w') as table_out:
        for si in seqs_sort_dist:
            cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(
                seqs_preds[si, profile_mask])
            print('\t'.join([str(c) for c in cols]), file=table_out)

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
                       col_cluster=False,
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_rank.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ','.join(
        [str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    # basset_sat.py takes long options with two dashes, so the Torch-style
    # '-cuda' / '-cudnn' flag gains one more
    sat_gpgpu_str = '-%s' % gpgpu_str if gpgpu_str != '' else ''

    # ordering of the profiled targets is the same for every hit
    profile_sort = np.argsort(activity_profile[profile_mask])

    # never ask for more hits than there are sequences
    num_dissect = min(options.num_dissect, len(seqs_sort_dist))

    for ni in range(num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = '%s/seq%d.fa' % (options.out_dir, ni)
        with open(fasta_file, 'w') as fasta_out:
            print('>%s\n%s' % (seq_headers[si], seqs[si]), file=fasta_out)

        # saturated mutagenesis
        cmd = 'basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s' % (
            sat_gpgpu_str, options.out_dir, ni, satmut_targets, model_file,
            fasta_file)
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        heat_rows = [activity_profile[profile_mask]]
        heat_cols = ['Desired']
        if targets is not None:
            heat_rows.append(targets[si, profile_mask])
            heat_cols.append('Experiment')
        heat_rows.append(seqs_preds_prof[si])
        heat_cols.append('Prediction')
        heat_mat = np.array(heat_rows)[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(np.transpose(heat_mat),
                         yticklabels=target_labels[profile_mask][profile_sort],
                         xticklabels=heat_cols)
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig('%s/heat%d.pdf' % (options.out_dir, ni))
        plt.close()
Exemplo n.º 6
0
def main():
    """Plot heat maps of the ``reprs<N>`` arrays a Basset model emits per sequence.

    Command line: ``[options] <model_file> <input_file>``.

    ``input_file`` is first read as FASTA (optionally with the ``-a``
    activity table) and written to ``<out_dir>/model_in.h5``. If that raises
    ``IOError`` or ``IndexError`` it is read as HDF5 instead (``test_in``,
    ``test_out`` and, when present, ``test_headers`` / ``target_labels``).
    With ``-s`` a random subset of the sequences is used.

    Unless ``-d`` names a pre-computed file, ``basset_net_predict.lua`` is
    run to create ``<out_dir>/model_out.h5``. Every dataset named
    ``reprs1``, ``reprs2``, ... in that file is then drawn as one heat map
    per sequence.

    ``print`` calls and the integer division are written in a form that
    parses and behaves the same under Python 2 and Python 3.
    """
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default:%default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits the process
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file')
    model_file, input_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == '>':
                    seq_headers.append(line[1:].rstrip())
                    seqs.append('')
                else:
                    # raises IndexError when sequence text precedes any
                    # header, which routes us to the HDF5 branch below
                    seqs[-1] += line.rstrip()

        # an array (not a list) so it can be indexed by sample_i below
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            with open(options.input_activity_file) as activity_in:
                target_labels = activity_in.readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = seq_headers[sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch; floor division keeps the last
        # dimension an int under both Python 2 and Python 3
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, 'r') as hdf5_in:
                seqs_1hot = np.array(hdf5_in['test_in'])
                targets = np.array(hdf5_in['test_out'])

                # headers and labels are optional in the file
                if 'test_headers' in hdf5_in:
                    seq_headers = np.array(hdf5_in['test_headers'])
                else:
                    # placeholder names so sampling and plotting still work
                    seq_headers = np.array(['seq%d' % si for si in range(seqs_1hot.shape[0])])

                if 'target_labels' in hdf5_in:
                    target_labels = np.array(hdf5_in['target_labels'])
                else:
                    target_labels = None

            # sample
            if options.sample:
                sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                with h5py.File(model_input_hdf5, 'w') as h5f:
                    h5f.create_dataset('test_in', data=seqs_1hot)

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (model_file, model_input_hdf5, options.model_hdf5_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    reprs = []
    with h5py.File(options.model_hdf5_file, 'r') as hdf5_in:
        # datasets are numbered from 1; stop at the first missing one
        layer_num = 1
        while 'reprs%d' % layer_num in hdf5_in:
            reprs.append(np.array(hdf5_in['reprs%d' % layer_num]))
            layer_num += 1

    #################################################################
    # plot
    #################################################################
    print(len(reprs))
    for li, layer_reprs in enumerate(reprs):
        for si in range(len(seq_headers)):
            plt.figure()

            # just write the sequence out above it
            # or maybe I'll ultimately want to write an
            # influence version. yea probably.

            print(layer_reprs[si].shape)
            sns.heatmap(layer_reprs[si], linewidths=0, xticklabels=False)
            # file names use the zero-based position in reprs
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir, header_filename(seq_headers[si]), li))
            plt.close()
Exemplo n.º 7
0
def main():
    """Search sequences for those whose predictions best match an activity profile.

    Command line: ``[options] <model_file> <profile_file> <input_file>``.

    Steps, in order:

    1. Read ``input_file`` as FASTA (optionally with the ``-a`` activity
       table) and write the one-hot sequences to ``<out_dir>/model_in.h5``.
       If that raises ``IOError``, ``IndexError`` or ``UnicodeDecodeError``,
       read it as HDF5 instead (datasets ``test_in``, ``test_out``,
       ``test_headers``).
    2. Unless ``-d`` names a pre-computed table, run ``basset_predict.lua``
       to produce ``<out_dir>/preds.txt``, then load the predictions.
    3. Load the profile with ``load_profile`` and, with ``-r``, normalize
       each target's predictions via ``znorm``.
    4. Write ``heat_clust.pdf``, ``dim_red.pdf``, ``table.txt`` and
       ``heat_rank.pdf`` to the output directory.
    5. For the best-matching sequences (at most ``-n``), write a FASTA file,
       run ``basset_sat.py`` and save a per-sequence heat map.

    When no experimental targets are available (FASTA input without ``-a``),
    the target-distance column of ``table.txt`` is ``nan`` and the
    per-sequence heat maps omit the "Experiment" column.
    """
    usage = "usage: %prog [options] <model_file> <profile_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "--all",
        dest="all_data",
        default=False,
        action="store_true",
        help="Search all training/valid/test sequences. By default we search only the test set. [Default: %default]",
    )
    parser.add_option(
        "--cuda", dest="cuda", default=False, action="store_true", help="Run on GPGPU [Default: %default]"
    )
    parser.add_option(
        "--cudnn", dest="cudnn", default=False, action="store_true", help="Run on GPGPU w/cuDNN [Default: %default]"
    )
    parser.add_option(
        "-d",
        dest="model_out_file",
        default=None,
        help="Pre-computed model predictions output table [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="norm_even",
        default=False,
        action="store_true",
        help="Normalize the weights for the positive and negative datasets to be even [Default: %default]",
    )
    parser.add_option("-f", dest="font_heat", default=6, type="int", help="Heat map axis font size [Default: %default]")
    parser.add_option(
        "-n", dest="num_dissect", default=10, type="int", help="Dissect the top n hits [Default: %default]"
    )
    parser.add_option("-o", dest="out_dir", default="profile", help="Output directory [Default: %default]")
    parser.add_option(
        "-r",
        dest="norm_preds",
        default=False,
        action="store_true",
        help="Normalize predictions to have equal frequency [Default: %default]",
    )
    parser.add_option(
        "-z",
        dest="weight_zero",
        default=1.0,
        type="float",
        help="Adjust the weights for the zero samples by this value [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the message and exits the process
        parser.error(
            "Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)"
        )
    model_file, profile_file, input_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == ">":
                    seq_headers.append(line[1:].rstrip())
                    seqs.append("")
                else:
                    # raises IndexError when sequence text precedes any
                    # header, which routes us to the HDF5 branch below
                    seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch; floor division keeps the last
        # dimension an int (true division would hand reshape a float)
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, "w") as h5f:
            h5f.create_dataset("test_in", data=seqs_1hot)

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, "r") as hdf5_in:
                seqs_1hot = np.array(hdf5_in["test_in"])
                targets = np.array(hdf5_in["test_out"])
                seq_headers = np.array([h.decode("UTF-8") for h in hdf5_in["test_headers"]])

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ""
    if options.cudnn:
        gpgpu_str = "-cudnn"
    elif options.cuda:
        gpgpu_str = "-cuda"

    if options.model_out_file is None:
        options.model_out_file = "%s/preds.txt" % options.out_dir

        torch_cmd = "basset_predict.lua -mc_n 10 -rc %s %s %s %s" % (
            gpgpu_str,
            model_file,
            model_input_hdf5,
            options.model_out_file,
        )
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_seqs = seqs_preds.shape[0]
    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero
    )

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save("%s/pred_means" % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask])

        # normalize
        for ti in range(num_targets):
            ratio_ti = pred_means[ti] / aim_mean
            # 0.25 rather than 1 / 4: the latter is 0 under integer division
            if profile_mask[ti] and (ratio_ti < 0.25 or ratio_ti > 4):
                print(
                    "WARNING: target %d with mean %.4f differs 4-fold from the median %.3f"
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr,
                )
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map of the (up to) 1500 highest-variance sequences
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_clust.pdf" % options.out_dir)
    plt.close()

    # dimension reduction: 2-component PCA over targets
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c="black", s=5)
    target_labels_prof_concise = [tl.split(":")[-1] for tl in target_labels[profile_mask]]
    for label, x, y, activity in zip(
        target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask]
    ):
        plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette("deep")[int(activity)])
    plt.savefig("%s/dim_red.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances (weighted log loss against the profile)
    seqs_pdists = np.array(
        [
            log_loss(
                activity_profile[profile_mask],
                seqs_preds[si, profile_mask],
                sample_weight=profile_weights[profile_mask],
            )
            for si in range(num_seqs)
        ]
    )

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances (weighted absolute difference); undefined
    # when FASTA input was given without an activity table
    if targets is None:
        seqs_tdists = np.full(num_seqs, np.nan)
    else:
        seqs_tdists = []
        for si in range(num_seqs):
            tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
            tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
            seqs_tdists.append(tdists_weight.sum())
        seqs_tdists = np.array(seqs_tdists)

    # print as table
    with open("%s/table.txt" % options.out_dir, "w") as table_out:
        for si in seqs_sort_dist:
            cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
            print("\t".join([str(c) for c in cols]), file=table_out)

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
        col_cluster=False,
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_rank.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ",".join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    # basset_sat.py takes long options with two dashes, so the Torch-style
    # "-cuda" / "-cudnn" flag gains one more
    sat_gpgpu_str = "-%s" % gpgpu_str if gpgpu_str != "" else ""

    # ordering of the profiled targets is the same for every hit
    profile_sort = np.argsort(activity_profile[profile_mask])

    # never ask for more hits than there are sequences
    num_dissect = min(options.num_dissect, len(seqs_sort_dist))

    for ni in range(num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = "%s/seq%d.fa" % (options.out_dir, ni)
        with open(fasta_file, "w") as fasta_out:
            print(">%s\n%s" % (seq_headers[si], seqs[si]), file=fasta_out)

        # saturated mutagenesis
        cmd = "basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s" % (
            sat_gpgpu_str,
            options.out_dir,
            ni,
            satmut_targets,
            model_file,
            fasta_file,
        )
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        heat_rows = [activity_profile[profile_mask]]
        heat_cols = ["Desired"]
        if targets is not None:
            heat_rows.append(targets[si, profile_mask])
            heat_cols.append("Experiment")
        heat_rows.append(seqs_preds_prof[si])
        heat_cols.append("Prediction")
        heat_mat = np.array(heat_rows)[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(
            np.transpose(heat_mat),
            yticklabels=target_labels[profile_mask][profile_sort],
            xticklabels=heat_cols,
        )
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig("%s/heat%d.pdf" % (options.out_dir, ni))
        plt.close()
Exemplo n.º 8
0
def main():
    """Run a trained Basset model over input sequences via Torch.

    Command line: ``[options] <model_file> <input_file> <output_file>``.

    The input is first tried as a FASTA file: its sequences are loaded with
    ``dna_io.load_sequences``, reshaped for Torch and written to a temporary
    HDF5 file under the ``test_in`` dataset. If loading raises ``IOError`` or
    ``IndexError`` the input is instead handed to Torch unchanged, as an
    HDF5 file. ``basset_predict.lua`` is then invoked with the selected
    options, and any temporary file is removed afterwards.
    """
    usage = 'usage: %prog [options] <model_file> <input_file> <output_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', '--batch', dest='batch', default=128, type='int', help='Batch size [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true', help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true', help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option('-n', '--norm', dest='norm', default=False, action='store_true', help='Normalize all targets to a level plane [Default: %default]')
    parser.add_option('-r', '--rc', dest='rc', default=False, action='store_true', help='Average forward and reverse complement [Default: %default]')
    parser.add_option('-s', '--scores', dest='scores', default=False, action='store_true', help='Print pre-sigmoid scores instead of probability predictions [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        # parser.error() prints the message and exits, so no else is needed
        parser.error('Must provide Basset model file, input sequences (as a FASTA file or test data in an HDF file, and output table file')
    model_file, input_file, out_file = args

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA: load sequences
        seqs_1hot = dna_io.load_sequences(input_file, permute=False)

        # reshape sequences for torch; floor division keeps the last
        # dimension an integer on Python 3 as well as Python 2
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))
    except (IOError, IndexError):
        # input_file is HDF5
        seqs_1hot = None

    temp_hdf5 = seqs_1hot is not None
    if temp_hdf5:
        model_input_fd, model_input_hdf5 = tempfile.mkstemp()
        # mkstemp hands back an open descriptor; h5py reopens the file by
        # path, so close the descriptor right away to avoid leaking it
        os.close(model_input_fd)
    else:
        model_input_hdf5 = input_file

    try:
        if temp_hdf5:
            # write as test data to a HDF5 file; done outside the FASTA
            # try-block so a write failure is reported instead of being
            # mistaken for "input is HDF5"
            with h5py.File(model_input_hdf5, 'w') as h5f:
                h5f.create_dataset('test_in', data=seqs_1hot)

        #################################################################
        # Torch predict modifications
        #################################################################
        torch_argv = ['basset_predict.lua', '-batch', '%d' % options.batch]
        if options.cudnn:
            torch_argv.append('-cudnn')
        elif options.cuda:
            torch_argv.append('-cuda')
        if options.norm:
            torch_argv.append('-norm')
        if options.rc:
            torch_argv.append('-rc')
        if options.scores:
            torch_argv.append('-scores')
        torch_argv += [model_file, model_input_hdf5, out_file]

        print(' '.join(torch_argv))
        # an argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact
        subprocess.call(torch_argv)
    finally:
        # remove the temporary input even when writing or predicting fails
        if temp_hdf5:
            os.remove(model_input_hdf5)
Exemplo n.º 9
0
def main():
    """Plot heat maps of a Basset model's per-layer representations.

    Command line: ``[options] <model_file> <input_file>``.

    The input is first parsed as FASTA (optionally with an activity table
    given by ``-a``); if that raises ``IOError`` or ``IndexError`` it is
    read as an HDF5 file with ``test_in`` / ``test_out`` datasets instead.
    Unless a pre-computed model output is supplied with ``-d``,
    ``basset_net_predict.lua`` is run to produce one. Every ``reprs<N>``
    dataset found in the model output (N = 1, 2, ...) is then drawn as one
    heat map PDF per sequence in the output directory.
    """
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-r',
                      dest='rng_seed',
                      default=1,
                      type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default:%default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help=
        'Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits, so no else is needed
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file'
        )
    model_file, input_file = args

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        with open(input_file) as fasta_in:
            for line in fasta_in:
                if line[0] == '>':
                    seq_headers.append(line[1:].rstrip())
                    seqs.append('')
                else:
                    # sequence text before any header raises IndexError,
                    # which sends non-FASTA input to the HDF5 branch below
                    seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names from the table's first line
            with open(options.input_activity_file) as activity_in:
                target_labels = activity_in.readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(range(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # headers and sequences are plain lists here, so they cannot be
            # indexed with an array; pick the sampled entries one by one
            seq_headers = [seq_headers[i] for i in sample_i]
            seqs = [seqs[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch; floor division keeps the last
        # dimension an integer on Python 3 as well as Python 2
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        with h5py.File(model_input_hdf5, 'w') as h5f:
            h5f.create_dataset('test_in', data=seqs_1hot)

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            with h5py.File(input_file, 'r') as hdf5_in:
                seqs_1hot = np.array(hdf5_in['test_in'])
                targets = np.array(hdf5_in['test_out'])

                # headers and labels are optional datasets
                try:
                    seq_headers = np.array(hdf5_in['test_headers'])
                except KeyError:
                    # placeholder names keep sampling and plotting working
                    # when the file carries no headers
                    seq_headers = np.array(
                        ['seq%d' % si for si in range(seqs_1hot.shape[0])])
                try:
                    target_labels = np.array(hdf5_in['target_labels'])
                except KeyError:
                    target_labels = None

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(range(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                with h5py.File(model_input_hdf5, 'w') as h5f:
                    h5f.create_dataset('test_in', data=seqs_1hot)

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_argv = ['basset_net_predict.lua', model_file, model_input_hdf5,
                      options.model_hdf5_file]
        print(' '.join(torch_argv))
        # an argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact
        subprocess.call(torch_argv)

    #################################################################
    # load modification predictions
    #################################################################
    reprs = []
    with h5py.File(options.model_hdf5_file, 'r') as hdf5_in:
        # datasets are named reprs1, reprs2, ...; stop at the first gap
        layer_num = 1
        while 'reprs%d' % layer_num in hdf5_in:
            reprs.append(np.array(hdf5_in['reprs%d' % layer_num]))
            layer_num += 1

    #################################################################
    # plot
    #################################################################
    print(len(reprs))
    for li, layer_reprs in enumerate(reprs):
        for si, header in enumerate(seq_headers):
            plt.figure()

            # just write the sequence out above it
            # or maybe I'll ultimately want to write an
            # influence version. yea probably.

            print(layer_reprs[si].shape)
            sns.heatmap(layer_reprs[si], linewidths=0, xticklabels=False)
            # file names use the zero-based position in reprs, not the
            # one-based dataset number
            plt.savefig('%s/%s_l%d.pdf' %
                        (options.out_dir, header_filename(header), li))
            plt.close()