示例#1
0
    def next_seqs(self):
        """ Construct array of augmented sequences for this stream chunk.

        Draws up to `self.stream_size` one-hot sequences from the
        `self.seqs_gen` generator (fewer if it is exhausted), then expands
        each with every shift in `self.shifts` and, if `self.rc` is set,
        also the reverse complement of each shifted sequence.

        Returns:
          float32 numpy array of augmented one-hot sequences; empty if the
          generator was already exhausted.
        """

        # extract next sequences from generator; break (rather than
        # repeatedly raising/catching StopIteration) once it is exhausted
        seqs_1hot = []
        for _ in range(self.stream_size):
            try:
                seqs_1hot.append(next(self.seqs_gen))
            except StopIteration:
                break

        # initialize ensemble
        seqs_1hot_ens = []

        # add shifts, each optionally followed by its reverse complement
        for seq_1hot in seqs_1hot:
            for shift in self.shifts:
                seq_1hot_aug = dna_io.hot1_augment(seq_1hot, shift=shift)
                seqs_1hot_ens.append(seq_1hot_aug)
                if self.rc:
                    seqs_1hot_ens.append(dna_io.hot1_rc(seq_1hot_aug))

        return np.array(seqs_1hot_ens, dtype='float32')
示例#2
0
def main():
    """Regress ISM nucleotide scores against PhyloP conservation.

    Reads per-nucleotide ISM scores from an HDF5 file, pairs each sequence
    with PhyloP conservation values (bigwig) and genomic DNA (FASTA), then
    fits a cross-validated random forest of scores+position -> PhyloP and
    writes R2 / Pearson statistics to the output directory.
    """
    usage = 'usage: %prog [options] <scores_file>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='n_components',
                      default=None,
                      type='int',
                      help='PCA n_components [Default: %default]')
    parser.add_option(
        '-e',
        dest='num_estimators',
        default=100,
        type='int',
        help='Number of random forest estimators [Default: %default]')
    parser.add_option('-g',
                      dest='genome',
                      default='ce11',
                      help='PhyloP and FASTA genome [Default: %default]')
    parser.add_option('-i',
                      dest='iterations',
                      default=1,
                      type='int',
                      help='Cross-validation iterations [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='regr_out')
    parser.add_option(
        '-p',
        dest='parallel_threads',
        default=1,
        type='int',
        help=
        'Parallel threads passed to scikit-learn n_jobs [Default: %default]')
    parser.add_option('-r', dest='random_seed', default=44, type='int')
    parser.add_option('--stat',
                      dest='sad_stat',
                      default='sum',
                      help='HDF5 key stat to consider. [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide ISM scores and PhyloP bigwig file.')
    else:
        scores_file = args[0]

    np.random.seed(options.random_seed)
    options.genome = options.genome.lower()

    # race-free, creates parent directories too (unlike bare mkdir)
    os.makedirs(options.out_dir, exist_ok=True)

    ################################################################
    # read ISM scores

    with h5py.File(scores_file, 'r') as h5o:
        score_chrs = [chrm.decode('UTF-8') for chrm in h5o['chr']]
        score_starts = h5o['start'][:]
        score_ends = h5o['end'][:]
        score_strands = [strand.decode('UTF-8') for strand in h5o['strand']]
        score_seqs = h5o['seqs'][:]
        nt_scores = h5o[options.sad_stat][:].astype('float16')
    num_seqs, mut_len, _, num_targets = nt_scores.shape

    # reference transform: select the score at each position's reference
    # nucleotide (score_seqs is presumably the one-hot reference mask —
    # TODO(review) confirm against the HDF5 writer)
    nt_scores_ref = np.reshape(nt_scores[score_seqs],
                               (num_seqs, mut_len, num_targets))

    # min/max transform: positive reference scores measured from the
    # positionwise minimum, negative ones from the maximum
    nt_scores_min = nt_scores.min(axis=-2)
    nt_scores_max = nt_scores.max(axis=-2)
    pos_mask = (nt_scores_ref > 0)
    nt_scores_refm = nt_scores_ref.copy()
    nt_scores_refm[pos_mask] -= nt_scores_min[pos_mask]
    nt_scores_refm[~pos_mask] -= nt_scores_max[~pos_mask]

    ################################################################
    # read phylop bigwig annotations

    # genome files are located via an environment variable named after
    # the genome, e.g. CE11
    genome_path = os.environ[options.genome.upper()]
    fasta_file = '%s/assembly/%s.fa' % (genome_path, options.genome)
    if options.genome == 'ce11':
        phylop_file = '%s/phylop/ce11.phyloP26way.bw' % genome_path
    else:
        print('Genome PhyloP not found', file=sys.stderr)
        sys.exit(1)

    seqs_phylop = []
    seqs_phylop_dna1 = []
    seqs_phylop_mask = np.ones(num_seqs, dtype='bool')

    fasta_open = pysam.FastaFile(fasta_file)
    phylop_open = pyBigWig.open(phylop_file, 'r')

    for si in range(num_seqs):
        # bigwig chromosomes are 'chr'-prefixed
        phylop_chr = score_chrs[si]
        if not phylop_chr.startswith('chr'):
            phylop_chr = 'chr%s' % phylop_chr

        # read values
        try:
            seq_phylop = phylop_open.values(phylop_chr,
                                            score_starts[si],
                                            score_ends[si],
                                            numpy=True).astype('float16')

            # read DNA
            seq_phylop_dna = fasta_open.fetch(score_chrs[si], score_starts[si],
                                              score_ends[si])
            seq_phylop_dna1 = dna_io.dna_1hot(seq_phylop_dna)

            # reverse complement minus-strand sequences so PhyloP and DNA
            # line up with the (stranded) ISM scores
            if score_strands[si] == '-':
                seq_phylop = seq_phylop[::-1]
                seq_phylop_dna1 = dna_io.hot1_rc(seq_phylop_dna1)

            # save
            seqs_phylop.append(seq_phylop)
            seqs_phylop_dna1.append(seq_phylop_dna1)

        except RuntimeError:
            # best-effort: drop sequences without PhyloP coverage
            print('Ignoring %s:%d-%d; phylop not found.' % \
                (phylop_chr, score_starts[si], score_ends[si]), file=sys.stderr)
            seqs_phylop_mask[si] = False

    # close annotation handles (previously leaked)
    fasta_open.close()
    phylop_open.close()

    # filter for valid sequences
    nt_scores = nt_scores[seqs_phylop_mask]
    nt_scores_ref = nt_scores_ref[seqs_phylop_mask]
    nt_scores_refm = nt_scores_refm[seqs_phylop_mask]
    score_seqs = score_seqs[seqs_phylop_mask]
    num_seqs = len(score_seqs)

    # transform PhyloP: zero missing values, clip outliers
    seqs_phylop = np.array(seqs_phylop, dtype='float32')
    seqs_phylop = np.nan_to_num(seqs_phylop)
    seqs_phylop = np.clip(seqs_phylop, -1.5, 5)

    # verify DNA: the FASTA sequence must match the scored sequence
    seqs_phylop_dna1 = np.array(seqs_phylop_dna1)
    for si in range(num_seqs):
        seq_diff = np.logical_xor(score_seqs[si], seqs_phylop_dna1[si])
        nts_diff = seq_diff.sum() // 2
        if nts_diff != 0:
            # intentional: drop into the debugger on mismatch
            pdb.set_trace()

    ################################################################
    # regression

    # add positions so the regressor can model positional effects
    seqs_pos = np.arange(mut_len)
    seqs_pos = np.tile(seqs_pos, num_seqs)
    seqs_pos = np.reshape(seqs_pos, (num_seqs, -1, 1))

    # NOTE(review): a flattened variant with optional PCA (using the -d
    # n_components option) was removed as dead code; n_components is
    # currently unused.

    # form feature matrix: per-position transformed scores + position
    X = np.concatenate([nt_scores_refm, seqs_pos], axis=-1)
    X = X.astype('float32')

    # regressor
    r2s, pcors = randfor_cv(X,
                            seqs_phylop,
                            iterations=options.iterations,
                            n_estimators=options.num_estimators,
                            random_state=options.random_seed,
                            n_jobs=options.parallel_threads)

    # save
    np.save('%s/r2.npy' % options.out_dir, r2s)
    np.save('%s/pcor.npy' % options.out_dir, pcors)

    # print stats: mean and standard error over CV iterations
    iterations = len(r2s)
    with open('%s/stats.txt' % options.out_dir, 'w') as stats_out:
        print('R2 %.4f (%.4f)' % (r2s.mean(), r2s.std() / np.sqrt(iterations)),
              file=stats_out)
        print('pR %.4f (%.4f)' %
              (pcors.mean(), pcors.std() / np.sqrt(iterations)),
              file=stats_out)