Example #1
  def get_1hots(self, genome_open):

    # extract reference
    if self.start < 0:
      ref_seq = 'N'*(-self.start) + genome_open.fetch(self.chr, 0, self.end).upper()
    else:
      ref_seq = genome_open.fetch(self.chr, self.start, self.end).upper()

    # extend to full length
    if len(ref_seq) < self.end - self.start:
      ref_seq += 'N'*(self.end-self.start-len(ref_seq))

    # verify reference alleles
    for snp in self.snps:
      ref_n = len(snp.ref_allele)
      ref_snp = ref_seq[snp.seq_pos:snp.seq_pos+ref_n]
      if snp.ref_allele != ref_snp:
        print('ERROR: %s does not match reference %s' % (snp, ref_snp), file=sys.stderr)
        exit(1)

    # 1 hot code reference sequence
    ref_1hot = dna_io.dna_1hot(ref_seq)
    seqs1_list = [ref_1hot]

    # make alternative 1 hot coded sequences
    #  (assuming SNP is 1-based indexed)
    for snp in self.snps:
      alt_1hot = make_alt_1hot(ref_1hot, snp.seq_pos, snp.ref_allele, snp.alt_alleles[0])
      seqs1_list.append(alt_1hot)

    return seqs1_list
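
The helper make_alt_1hot is referenced above but not shown. A minimal hypothetical sketch for the single-nucleotide case (real usage would also need to handle indels) might look like:

import numpy as np

def make_alt_1hot(ref_1hot, seq_pos, ref_allele, alt_allele):
    # hypothetical sketch: swap in a single-nucleotide alternative allele
    # at seq_pos on a copy of the reference 1-hot coding (SNVs only)
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    alt_1hot = np.copy(ref_1hot)
    alt_1hot[seq_pos, :] = 0
    alt_1hot[seq_pos, nt_index[alt_allele]] = 1
    return alt_1hot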
Example #2
    def run(self):
        while True:
            try:
                # unload predictions
                seq_dna, seq_preds, si = self.queue.get()
                print('Writing %d' % si, flush=True)

                # seq_preds is (1 + 3*mut_len) x (target_len) x (num_targets)
                seq_preds = np.array(seq_preds)
                num_preds = seq_preds.shape[0]
                num_targets = seq_preds.shape[-1]

                # reverse engineer mutagenesis position parameters
                mut_len = (num_preds - 1) // 3
                mut_mid = len(seq_dna) // 2
                mut_start = mut_mid - mut_len // 2
                mut_end = mut_start + mut_len

                # one hot code mutagenized DNA
                seq_dna_mut = seq_dna[mut_start:mut_end]
                seq_1hot_mut = dna_io.dna_1hot(seq_dna_mut)

                # initialize scores
                seq_scores = np.zeros((mut_len, 4, num_targets),
                                      dtype='float32')

                # sum across length
                seq_preds_sum = seq_preds.sum(axis=1, dtype='float32')

                # predictions index (starting at first mutagenesis)
                pi = 1

                # for each mutated position
                for mi in range(mut_len):
                    # for each nucleotide
                    for ni in range(4):
                        if seq_1hot_mut[mi, ni]:
                            # reference score
                            seq_scores[mi, ni, :] = seq_preds_sum[0, :]
                        else:
                            # mutation score
                            seq_scores[mi, ni, :] = seq_preds_sum[pi, :]
                            pi += 1

                # normalize positions
                seq_scores -= seq_scores.mean(axis=1, keepdims=True)

                # write to HDF5
                self.scores_h5['scores'][si, :, :, :] = seq_scores.astype(
                    'float16')
                self.scores_h5['seqs'][si, :, :] = seq_1hot_mut

            except Exception:
                # communicate error
                print('ERROR: Sequence %d failed' % si,
                      file=sys.stderr,
                      flush=True)

            # communicate finished task
            self.queue.task_done()
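
A quick sanity check of the prediction layout this loop assumes (index 0 is the reference, then three mutant predictions per mutagenized position):

mut_len = 8
num_preds = 1 + 3 * mut_len              # one reference + 3 mutants per position
assert (num_preds - 1) // 3 == mut_len   # the reverse engineering above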
Example #3
def fasta_pwm(motif_fasta_file):
    # read sequences
    seqs_dna = [
        line.rstrip() for line in open(motif_fasta_file) if line[0] != '>'
    ]
    seqs_1hot = np.array(
        [dna_io.dna_1hot(sd, n_uniform=True) for sd in seqs_dna])
    num_seqs = seqs_1hot.shape[0]

    # compute PWM
    pwm = seqs_1hot.sum(axis=0) + 1
    pwm = pwm / (seqs_1hot.shape[0] + 4)

    return pwm, num_seqs + 4
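
The +1 and +4 form an add-one (Laplace) pseudocount over the four nucleotides; a toy check:

import numpy as np

# two identical 'AC' sequences
seqs_1hot = np.array([[[1, 0, 0, 0], [0, 1, 0, 0]]] * 2, dtype=float)
pwm = (seqs_1hot.sum(axis=0) + 1) / (seqs_1hot.shape[0] + 4)
# position 0: P(A) = (2+1)/(2+4) = 0.5 and P(C) = P(G) = P(T) = 1/6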
Example #4
def end_align_fasta(seqlets_fasta, msa_fasta, shift=1, pwm_iter=1, epochs=2):
    """Align seqlets in a FASTA file, allowing only end gaps."""

    # read seqlet DNA
    seqs_1hot = []
    for line in open(seqlets_fasta):
        if line[0] != '>':
            seqlet_dna = line.rstrip()
            seqlet_1hot = dna_io.dna_1hot(seqlet_dna)
            seqs_1hot.append(seqlet_1hot)
    seqs_1hot = np.array(seqs_1hot)
    num_seqs, width, depth = seqs_1hot.shape

    # extend with blanks for shifts
    gaps = 2 * shift
    gap_col = np.full((num_seqs, shift, depth), np.nan)
    msa_1hot = np.concatenate([gap_col, seqs_1hot, gap_col], axis=1)

    for ei in range(epochs):
        for si in range(num_seqs):
            if si % pwm_iter == 0:
                # pwm = (msa_1hot[:,gaps:-gaps,:] + .1).mean(axis=0)
                pwm = msa_1hot[:, gaps:-gaps, :].sum(axis=0) + 0.1
                pwm /= num_seqs

            # extract sequence
            seq_1hot = seqs_1hot[si]

            # score gap positions
            gap_scores = []
            for gi in range(gaps + 1):
                g1 = gaps - gi
                g2 = g1 + pwm.shape[0]
                gap_1hot = seq_1hot[g1:g2]
                gscore = np.log(pwm[gap_1hot]).sum()
                gap_scores.append(gscore)

            # find best
            gi = np.argmax(gap_scores)
            gj = gi + width  # alignment spans the full seqlet width

            # set msa
            msa_1hot[si] = np.nan
            msa_1hot[si, gi:gj, :] = seq_1hot

    # write to FASTA
    write_msa(msa_1hot, msa_fasta)

    return msa_1hot
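
write_msa is not shown. A hypothetical sketch that decodes the padded alignment back to FASTA, rendering all-NaN gap columns as '-', could be:

import numpy as np

def write_msa(msa_1hot, msa_fasta):
    # hypothetical sketch of the FASTA writer assumed above
    nts = 'ACGT'
    with open(msa_fasta, 'w') as fa:
        for si, seq_1hot in enumerate(msa_1hot):
            seq = []
            for pos_1hot in seq_1hot:
                if np.isnan(pos_1hot).any():
                    seq.append('-')
                else:
                    seq.append(nts[int(np.argmax(pos_1hot))])
            print('>seq%d' % si, file=fa)
            print(''.join(seq), file=fa)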
Example #5
    def seqs_gen():
        for seq_dna in seqs_dna:
            # 1 hot code DNA
            seq_1hot = dna_io.dna_1hot(seq_dna)
            yield {'sequence': seq_1hot}

            # for mutation positions
            for mi in range(mut_start, mut_end):
                # for each nucleotide
                for ni in range(4):
                    # if non-reference
                    if seq_1hot[mi, ni] == 0:
                        # copy and modify
                        seq_mut_1hot = np.copy(seq_1hot)
                        seq_mut_1hot[mi, :] = 0
                        seq_mut_1hot[mi, ni] = 1
                        yield {'sequence': seq_mut_1hot}
Example #6
def satmut_gen(seqs_dna, mut_start, mut_end):
    """Construct generator for 1 hot encoded saturation
     mutagenesis DNA sequences."""

    for seq_dna in seqs_dna:
        # 1 hot code DNA
        seq_1hot = dna_io.dna_1hot(seq_dna)
        yield seq_1hot

        # for mutation positions
        for mi in range(mut_start, mut_end):
            # for each nucleotide
            for ni in range(4):
                # if non-reference
                if seq_1hot[mi, ni] == 0:
                    # copy and modify
                    seq_mut_1hot = np.copy(seq_1hot)
                    seq_mut_1hot[mi, :] = 0
                    seq_mut_1hot[mi, ni] = 1
                    yield seq_mut_1hot
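
A hedged usage sketch, assuming basenji's dna_io is importable: mutagenizing positions 1-4 of a 6-bp sequence yields one reference plus three alternatives per position.

seqs = list(satmut_gen(['ACGTAC'], mut_start=1, mut_end=5))
assert len(seqs) == 1 + 3 * 4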
Example #7
def segments_1hot(fasta_file, segments, seq_length, stride):
    """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into
     stride: distance to advance each sequence

    Returns:
     seqs_1hot: array of 1-hot coded sequences
     seqs_segments: list of (chrom,start,end) sequence segments
    """

    # open fasta
    fasta = pysam.Fastafile(fasta_file)

    # initialize 1-hot coding list
    seqs_1hot = []

    # segment corresponding to each sequence
    seqs_segments = []

    for chrom, seg_start, seg_end in segments:
        # read sequence
        seg_seq = fasta.fetch(chrom, seg_start, seg_end)

        # break up into batchable sequences (as above in bigwig_batch)
        bstart = 0
        bend = bstart + seq_length
        while bend < len(seg_seq):
            # append
            seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))

            seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

            # update
            bstart += stride
            bend += stride

    return np.array(seqs_1hot), seqs_segments
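
A hedged usage sketch (FASTA path and segment coordinates are hypothetical):

segments = [('chr1', 0, 5000)]
seqs_1hot, seqs_segments = segments_1hot('genome.fa', segments,
                                         seq_length=1024, stride=512)
# windows advance by the stride while the window end stays inside the
# segment: (chr1, 0, 1024), (chr1, 512, 1536), ...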
Example #8
def main():
    usage = "usage: %prog [options] <fasta_file> <gtf_file> <hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-g",
        dest="genome_file",
        default=None,
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=1024,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "-c",
        dest="center_t",
        default=0.333,
        type="float",
        help="Center proportion in which TSSs are required to be [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=1,
        type="int",
        help="Number parallel processes to load data [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="target_wigs_file",
        default=None,
        help="Store target values, extracted from this list of WIG files",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        type="int",
        default=1,
        help="Average pooling width [Default: %default]",
    )
    parser.add_option(
        "--w5",
        dest="w5",
        default=False,
        action="store_true",
        help="Coverage files are w5 rather than BigWig [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Must provide genes as GTF, genome FASTA, and output HDF5")
    else:
        fasta_file = args[0]
        gtf_file = args[1]
        hdf5_file = args[2]

    if options.target_wigs_file is not None:
        check_wigs(options.target_wigs_file)

    ################################################################
    # organize TSS's by chromosome

    # read transcripts
    transcripts = gff.read_genes(gtf_file, key_id="transcript_id")

    # read transcript --> gene mapping
    transcript_genes = gff.t2g(gtf_file, feature="exon")

    # make gene --> strand mapping
    gene_strand = {}
    for tx_id in transcripts:
        gene_strand[transcript_genes[tx_id]] = transcripts[tx_id].strand

    # cluster TSSs by gene
    gene_tss = cluster_tss(transcript_genes, transcripts, options.pool_width / 2)

    # hash TSS's by chromosome
    gene_chrom = {}
    for tx_id in transcripts:
        gene_id = transcript_genes[tx_id]
        gene_chrom[gene_id] = transcripts[tx_id].chrom

    chrom_tss = {}
    for gene_id in gene_tss:
        for tss_pos in gene_tss[gene_id]:
            chrom_tss.setdefault(gene_chrom[gene_id], []).append((tss_pos, gene_id))

    # sort TSS's by chromosome
    for chrom in chrom_tss:
        chrom_tss[chrom].sort()

    ################################################################
    # determine segments / map transcripts

    # open fasta (to verify chromosome presence)
    fasta = pysam.Fastafile(fasta_file)

    chrom_sizes = OrderedDict()
    for line in open(options.genome_file):
        a = line.split()
        if a[0] in fasta.references:
            chrom_sizes[a[0]] = int(a[1])
        elif a[0] in chrom_tss:
            print("FASTA missing chromosome - %s" % a[0], file=sys.stderr)
            del chrom_tss[a[0]]

    merge_distance = options.center_t * options.seq_length

    seq_coords = []
    tss_list = []

    # ordering by options.genome_file allows for easier
    #  bigwig output in downstream scripts.

    for chrom in chrom_sizes:
        ctss = chrom_tss.get(chrom, [])

        left_i = 0
        while left_i < len(ctss):
            # left TSS
            left_tss = ctss[left_i][0]

            # right TSS
            right_i = left_i
            while (
                right_i + 1 < len(ctss)
                and ctss[right_i + 1][0] - left_tss < merge_distance
            ):
                right_i += 1
            right_tss = ctss[right_i][0]

            # determine segment midpoint
            seg_mid = (left_tss + right_tss) // 2

            # extend
            seg_start = seg_mid - options.seq_length // 2
            seg_end = seg_start + options.seq_length

            # rescue
            if seg_start < 0 or seg_end >= chrom_sizes[chrom]:
                if chrom_sizes[chrom] == options.seq_length:
                    seg_start = 0
                    seg_end = options.seq_length
                elif chrom_sizes[chrom] > options.seq_length:
                    # also rescuable but not important right now
                    pass

            # save segment
            if seg_start >= 0 and seg_end <= chrom_sizes[chrom]:
                seq_coords.append((chrom, seg_start, seg_end))

                # annotate TSS to indexes
                seq_index = len(seq_coords) - 1
                for i in range(left_i, right_i + 1):
                    tss_pos, gene_id = ctss[i]
                    tss = gene.TSS(
                        "TSS%d" % len(tss_list),
                        gene_id,
                        chrom,
                        tss_pos,
                        seq_index,
                        True,
                        gene_strand[gene_id],
                    )
                    tss_list.append(tss)

            # update
            left_i = right_i + 1

    ################################################################
    # extract target values

    if options.target_wigs_file:
        t0 = time.time()

        # get wig files and labels
        target_wigs_df = pd.read_table(options.target_wigs_file, index_col=0)
        target_wigs = OrderedDict()
        target_labels = []
        for i in range(target_wigs_df.shape[0]):
            target_wig_series = target_wigs_df.iloc[i]
            target_wigs[target_wig_series.identifier] = target_wig_series.file
            target_labels.append(target_wig_series.description)

        # initialize multiprocessing pool
        pool = multiprocessing.Pool(options.processes)

        # bigwig_read parameters
        bwt_params = [
            (wig_file, tss_list, seq_coords, options.pool_width)
            for wig_file in target_wigs.values()
        ]

        # pull the target values in parallel
        if options.w5:
            tss_targets = pool.starmap(wig5_tss_targets, bwt_params)
        else:
            tss_targets = pool.starmap(bigwig_tss_targets, bwt_params)

        # convert to array
        tss_targets = np.transpose(np.array(tss_targets))

    ################################################################
    # extract sequences

    seqs_1hot = []

    for chrom, start, end in seq_coords:
        seq = fasta.fetch(chrom, start, end)
        seqs_1hot.append(dna_io.dna_1hot(seq))

    seqs_1hot = np.array(seqs_1hot)

    fasta.close()

    ################################################################
    # save to HDF5

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, "w")

    # store pooling
    hdf5_out.create_dataset("pool_width", data=options.pool_width, dtype="int")

    # store gene sequences
    hdf5_out.create_dataset("seqs_1hot", data=seqs_1hot, dtype="bool")

    # store gene sequence coordinates
    seq_chrom = np.array([sc[0] for sc in seq_coords], dtype="S")
    seq_start = np.array([sc[1] for sc in seq_coords])
    seq_end = np.array([sc[2] for sc in seq_coords])

    hdf5_out.create_dataset("seq_chrom", data=seq_chrom)
    hdf5_out.create_dataset("seq_start", data=seq_start)
    hdf5_out.create_dataset("seq_end", data=seq_end)

    # store TSSs
    tss_id = np.array([tss.identifier for tss in tss_list], dtype="S")
    tss_gene = np.array([tss.gene_id for tss in tss_list], dtype="S")
    tss_chrom = np.array([tss.chrom for tss in tss_list], dtype="S")
    tss_pos = np.array([tss.pos for tss in tss_list])
    tss_seq = np.array([tss.gene_seq for tss in tss_list])
    tss_strand = np.array([tss.strand for tss in tss_list], dtype="S")

    hdf5_out.create_dataset("tss_id", data=tss_id)
    hdf5_out.create_dataset("tss_gene", data=tss_gene)
    hdf5_out.create_dataset("tss_chrom", data=tss_chrom)
    hdf5_out.create_dataset("tss_pos", data=tss_pos)
    hdf5_out.create_dataset("tss_seq", data=tss_seq)
    hdf5_out.create_dataset("tss_strand", data=tss_strand)

    # store targets
    if options.target_wigs_file:
        # ids
        target_ids = np.array([tl for tl in target_wigs.keys()], dtype="S")
        hdf5_out.create_dataset("target_ids", data=target_ids)

        # labels
        target_labels = np.array(target_labels, dtype="S")
        hdf5_out.create_dataset("target_labels", data=target_labels)

        # values
        hdf5_out.create_dataset("tss_targets", data=tss_targets, dtype="float16")

    hdf5_out.close()
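
A hedged sketch of inspecting the resulting HDF5 (the file name is hypothetical):

import h5py

with h5py.File('genes.h5', 'r') as h5:
    print(h5['seqs_1hot'].shape)              # (num_seqs, seq_length, 4), bool
    print(h5['tss_id'][:5], h5['tss_pos'][:5])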
Example #9
 def seqs_gen():
     for seq_dna in model_seqs_dna:
         yield dna_io.dna_1hot(seq_dna)
Example #10
def main():
    usage = 'usage: %prog [options] <scores_file>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='n_components',
                      default=None,
                      type='int',
                      help='PCA n_components [Default: %default]')
    parser.add_option(
        '-e',
        dest='num_estimators',
        default=100,
        type='int',
        help='Number of random forest estimators [Default: %default]')
    parser.add_option('-g',
                      dest='genome',
                      default='ce11',
                      help='PhyloP and FASTA genome [Default: %default]')
    parser.add_option('-i',
                      dest='iterations',
                      default=1,
                      type='int',
                      help='Cross-validation iterations [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='regr_out')
    parser.add_option(
        '-p',
        dest='parallel_threads',
        default=1,
        type='int',
        help=
        'Parallel threads passed to scikit-learn n_jobs [Default: %default]')
    parser.add_option('-r', dest='random_seed', default=44, type='int')
    parser.add_option('--stat',
                      dest='sad_stat',
                      default='sum',
                      help='HDF5 key stat to consider. [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide ISM scores file.')
    else:
        scores_file = args[0]

    np.random.seed(options.random_seed)
    options.genome = options.genome.lower()

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    ################################################################
    # read ISM scores

    with h5py.File(scores_file, 'r') as h5o:
        score_chrs = [chrm.decode('UTF-8') for chrm in h5o['chr']]
        score_starts = h5o['start'][:]
        score_ends = h5o['end'][:]
        score_strands = [strand.decode('UTF-8') for strand in h5o['strand']]
        score_seqs = h5o['seqs'][:]
        nt_scores = h5o[options.sad_stat][:].astype('float16')
    num_seqs, mut_len, _, num_targets = nt_scores.shape

    # reference transform
    nt_scores_ref = np.reshape(nt_scores[score_seqs],
                               (num_seqs, mut_len, num_targets))

    # min/max transform
    nt_scores_min = nt_scores.min(axis=-2)
    nt_scores_max = nt_scores.max(axis=-2)
    pos_mask = (nt_scores_ref > 0)
    nt_scores_refm = nt_scores_ref.copy()
    nt_scores_refm[pos_mask] -= nt_scores_min[pos_mask]
    nt_scores_refm[~pos_mask] -= nt_scores_max[~pos_mask]

    ################################################################
    # read phylop bigwig annotations

    genome_path = os.environ[options.genome.upper()]
    fasta_file = '%s/assembly/%s.fa' % (genome_path, options.genome)
    if options.genome == 'ce11':
        phylop_file = '%s/phylop/ce11.phyloP26way.bw' % genome_path
    else:
        print('Genome PhyloP not found', file=sys.stderr)
        exit(1)

    seqs_phylop = []
    seqs_phylop_dna1 = []
    seqs_phylop_mask = np.ones(num_seqs, dtype='bool')

    fasta_open = pysam.FastaFile(fasta_file)
    phylop_open = pyBigWig.open(phylop_file, 'r')

    for si in range(num_seqs):
        phylop_chr = score_chrs[si]
        if not phylop_chr.startswith('chr'):
            phylop_chr = 'chr%s' % phylop_chr

        # read values
        try:
            seq_phylop = phylop_open.values(phylop_chr,
                                            score_starts[si],
                                            score_ends[si],
                                            numpy=True).astype('float16')

            # read DNA
            seq_phylop_dna = fasta_open.fetch(score_chrs[si], score_starts[si],
                                              score_ends[si])
            seq_phylop_dna1 = dna_io.dna_1hot(seq_phylop_dna)

            # reverse complement
            if score_strands[si] == '-':
                seq_phylop = seq_phylop[::-1]
                seq_phylop_dna1 = dna_io.hot1_rc(seq_phylop_dna1)

            # save
            seqs_phylop.append(seq_phylop)
            seqs_phylop_dna1.append(seq_phylop_dna1)

        except RuntimeError:
            print('Ignoring %s:%d-%d; phylop not found.' % \
                (phylop_chr, score_starts[si], score_ends[si]), file=sys.stderr)
            seqs_phylop_mask[si] = False

    # filter for valid sequences
    nt_scores = nt_scores[seqs_phylop_mask]
    nt_scores_ref = nt_scores_ref[seqs_phylop_mask]
    nt_scores_refm = nt_scores_refm[seqs_phylop_mask]
    score_seqs = score_seqs[seqs_phylop_mask]
    num_seqs = len(score_seqs)

    # transform PhyloP
    seqs_phylop = np.array(seqs_phylop, dtype='float32')
    seqs_phylop = np.nan_to_num(seqs_phylop)
    seqs_phylop = np.clip(seqs_phylop, -1.5, 5)

    # verify DNA
    seqs_phylop_dna1 = np.array(seqs_phylop_dna1)
    for si in range(num_seqs):
        seq_diff = np.logical_xor(score_seqs[si], seqs_phylop_dna1[si])
        nts_diff = seq_diff.sum() // 2
        if nts_diff != 0:
            pdb.set_trace()

    ################################################################
    # regression

    # add positions
    seqs_pos = np.arange(mut_len)
    seqs_pos = np.tile(seqs_pos, num_seqs)
    seqs_pos = np.reshape(seqs_pos, (num_seqs, -1, 1))

    # flatten everything
    # seqs_phylop_flat = seqs_phylop.flatten()
    # seqs_pos_flat = seqs_pos.flatten()
    # nt_scores_refm_flat = nt_scores_refm.reshape((-1,num_targets))
    # num_pos = nt_scores_refm_flat.shape[0]

    # form matrix
    # X_scores = nt_scores_refm_flat
    # if options.n_components is not None:
    #     options.n_components = min(options.n_components, num_targets)
    #     X_scores = PCA(options.n_components).fit_transform(nt_scores_refm_flat)
    # X_pos = seqs_pos.reshape(num_pos,1)
    # X = np.concatenate([X_scores,X_pos], axis=1)

    X = np.concatenate([nt_scores_refm, seqs_pos], axis=-1)
    X = X.astype('float32')

    # regressor
    r2s, pcors = randfor_cv(X,
                            seqs_phylop,
                            iterations=options.iterations,
                            n_estimators=options.num_estimators,
                            random_state=options.random_seed,
                            n_jobs=options.parallel_threads)

    # save
    np.save('%s/r2.npy' % options.out_dir, r2s)
    np.save('%s/pcor.npy' % options.out_dir, pcors)

    # print stats
    iterations = len(r2s)
    stats_out = open('%s/stats.txt' % options.out_dir, 'w')
    print('R2 %.4f (%.4f)' % (r2s.mean(), r2s.std() / np.sqrt(iterations)),
          file=stats_out)
    print('pR %.4f (%.4f)' % (pcors.mean(), pcors.std() / np.sqrt(iterations)),
          file=stats_out)
    stats_out.close()
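
randfor_cv is not shown; a hypothetical sketch consistent with the call above (the fold count and per-position flattening are assumptions):

import numpy as np
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

def randfor_cv(X, y, iterations=1, n_estimators=100, random_state=44,
               n_jobs=1, folds=4):
    # flatten (seqs, positions, features) and (seqs, positions) into
    # per-position samples
    X2 = X.reshape(-1, X.shape[-1])
    y2 = y.flatten()

    r2s, pcors = [], []
    for it in range(iterations):
        kfold = KFold(n_splits=folds, shuffle=True,
                      random_state=random_state + it)
        for train_i, test_i in kfold.split(X2):
            model = RandomForestRegressor(n_estimators=n_estimators,
                                          random_state=random_state,
                                          n_jobs=n_jobs)
            model.fit(X2[train_i], y2[train_i])
            preds = model.predict(X2[test_i])
            r2s.append(model.score(X2[test_i], y2[test_i]))
            pcors.append(pearsonr(y2[test_i], preds)[0])
    return np.array(r2s), np.array(pcors)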
Example #11
 def seqs_gen():
     for seq_dna in seqs_dna:
         # 1 hot code DNA
         seq_1hot = dna_io.dna_1hot(seq_dna)
         yield {'sequence': seq_1hot}
Example #12
    def run(self):
        while True:
            try:
                # unload predictions
                seq_dna, seq_pred_stats, si = self.queue.get()
                seq_preds_sum, seq_preds_center, seq_preds_scd = seq_pred_stats
                print('Writing %d' % si, flush=True)

                # seq_preds_sum is (1 + 3*mut_len) x (num_targets)
                num_preds, num_targets = seq_preds_sum.shape
                mut_len = self.mut_end - self.mut_start

                # one hot code mutagenized DNA
                seq_dna_mut = seq_dna[self.mut_start:self.mut_end]
                seq_1hot_mut = dna_io.dna_1hot(seq_dna_mut)

                # write to HDF5
                self.scores_h5['seqs'][si, :, :] = seq_1hot_mut

                for sad_stat in self.sad_stats:
                    # initialize scores
                    seq_scores = np.zeros((mut_len, 4, num_targets),
                                          dtype='float32')

                    # summary stat
                    if sad_stat == 'sum':
                        seq_preds_stat = seq_preds_sum
                    elif sad_stat == 'center':
                        seq_preds_stat = seq_preds_center
                    elif sad_stat == 'scd':
                        seq_preds_stat = seq_preds_scd
                    else:
                        print('Unrecognized summary statistic "%s"' %
                              sad_stat)
                        exit(1)

                    # predictions index (starting at first mutagenesis)
                    pi = 1

                    # for each mutated position
                    for mi in range(mut_len):
                        # for each nucleotide
                        for ni in range(4):
                            if seq_1hot_mut[mi, ni]:
                                # reference score
                                seq_scores[mi, ni, :] = seq_preds_stat[0, :]
                            else:
                                # mutation score
                                seq_scores[mi, ni, :] = seq_preds_stat[pi, :]
                                pi += 1

                    # normalize positions
                    if sad_stat != 'sqdiff':
                        seq_scores -= seq_scores.mean(axis=1, keepdims=True)

                    # write to HDF5
                    self.scores_h5[sad_stat][si, :, :, :] = seq_scores.astype(
                        'float16')

            except Exception:
                # communicate error
                print('ERROR: Sequence %d failed' % si,
                      file=sys.stderr,
                      flush=True)

            # communicate finished task
            self.queue.task_done()
Example #13
def main():
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2])))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['seqs_cov'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets), dtype='float16')

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        targets[:, :, ti] = seqs_cov_open['seqs_cov'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # set unmappable positions to null
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # example = tf.train.Example(features=tf.train.Features(feature={
            #     'sequence': _bytes_feature(seq_1hot.flatten().tostring()),
            #     'target': _float_feature(targets[si,:,:].flatten())}))
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'sequence':
                    _bytes_feature(seq_1hot.flatten().tostring()),
                    'target':
                    _bytes_feature(targets[si, :, :].flatten().tostring())
                }))

            writer.write(example.SerializeToString())

        fasta_open.close()
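
_bytes_feature (and _float_feature in the commented-out variant) are not defined here; the standard TFRecord idiom, which this code presumably relies on, is:

import tensorflow as tf

def _bytes_feature(values):
    # wrap raw bytes as a TFRecord Feature
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))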
Example #14
def main():
    usage = 'usage: %prog [options] <fasta> <tss_gff> <expr_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-c',
        dest='cluster_gene_distance',
        default=2000,
        type='int',
        help=
        'Cluster genes into the same split within this distance [Default: %default]'
    )
    parser.add_option(
        '-g',
        dest='gene_index',
        default='gene_name',
        help='Key to match TSS GFF to expression table [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=65536,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='genes_out')
    parser.add_option(
        '-n',
        dest='n_allowed_pct',
        default=0.25,
        type='float',
        help=
        'Proportion of sequence allowed to be Ns on one side [Default: %default]'
    )
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '-s',
        dest='sqrt',
        default=False,
        action='store_true',
        help='Square root the expression values [Default: %default]')
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.1,
        type='str',
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.1,
        type='str',
        help='Proportion of the data for validation [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide FASTA, TSS GFF, and expression files.')
    else:
        fasta_file = args[0]
        tss_gff_file = args[1]
        expr_file = args[2]

    if os.path.isdir(options.out_dir):
        print('Remove output directory %s.' % options.out_dir)
        exit(1)
    else:
        os.mkdir(options.out_dir)

    ################################################################
    # read genes and targets

    genes_raw_df = gff_df(tss_gff_file, options.gene_index)
    expr_raw_df = pd.read_csv(expr_file, index_col=0)
    if options.sqrt:
        expr_raw_df = np.sqrt(expr_raw_df)

    # filter for shared genes
    shared_genes = set(genes_raw_df.index) & set(expr_raw_df.index)
    shared_genes = sorted(shared_genes)
    print('Shared %d genes of %d described and %d quantified' %  \
      (len(shared_genes), genes_raw_df.shape[0], expr_raw_df.shape[0]))

    # align gene info and expression
    genes_df = genes_raw_df.loc[shared_genes]
    expr_df = expr_raw_df.loc[shared_genes]
    assert (genes_df.shape[0] == expr_df.shape[0])

    ################################################################
    # filter genes from chromosome ends

    gene_valid_mask = sufficient_sequence(fasta_file, genes_df,
                                          options.seq_length,
                                          options.n_allowed_pct)
    genes_df = genes_df.loc[gene_valid_mask]
    expr_df = expr_df.loc[gene_valid_mask]

    ################################################################
    # divide between train/valid/test

    # permute genes
    np.random.seed(44)
    permute_order = np.random.permutation(genes_df.shape[0])
    genes_df = genes_df.iloc[permute_order]
    expr_df = expr_df.iloc[permute_order]
    assert ((genes_df.index == expr_df.index).all())

    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        tvt_indexes = divide_genes_pct(genes_df, test_pct, valid_pct,
                                       options.cluster_gene_distance)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        tvt_indexes = divide_genes_chr(genes_df, test_chrs, valid_chrs)

    # write gene sets
    train_index, valid_index, test_index = tvt_indexes
    genes_df.iloc[train_index].to_csv('%s/genes_train.csv' % options.out_dir,
                                      sep='\t')
    genes_df.iloc[valid_index].to_csv('%s/genes_valid.csv' % options.out_dir,
                                      sep='\t')
    genes_df.iloc[test_index].to_csv('%s/genes_test.csv' % options.out_dir,
                                     sep='\t')

    # write targets
    targets_df = pd.DataFrame({
        'identifier': expr_df.columns,
        'description': expr_df.columns
    })
    targets_df.index.name = 'index'
    targets_df.to_csv('%s/targets.txt' % options.out_dir, sep='\t')

    ################################################################
    # write TFRecords

    tfr_dir = '%s/tfrecords' % options.out_dir
    os.mkdir(tfr_dir)

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    tvt_tuples = [('train', train_index), ('valid', valid_index),
                  ('test', test_index)]
    for set_label, set_index in tvt_tuples:
        genes_set_df = genes_df.iloc[set_index]
        expr_set_df = expr_df.iloc[set_index]

        num_set = genes_set_df.shape[0]
        num_set_tfrs = int(np.ceil(num_set / options.seqs_per_tfr))

        # gene sequence index
        si = 0

        for tfr_i in range(num_set_tfrs):
            tfr_file = '%s/%s-%d.tfr' % (tfr_dir, set_label, tfr_i)
            print(tfr_file)
            with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
                # TFR index
                ti = 0
                while ti < options.seqs_per_tfr and si < num_set:
                    gene = genes_set_df.iloc[si]
                    seq_chrm = gene.chr
                    mid_pos = (gene.start + gene.end) // 2
                    seq_start = mid_pos - options.seq_length // 2
                    seq_end = seq_start + options.seq_length

                    if seq_start < 0:
                        # fill left side first
                        n_requested = -seq_start
                        seq_dna = ''.join([
                            random.choice('ACGT') for i in range(n_requested)
                        ])
                        seq_dna += fasta_open.fetch(seq_chrm, 0, seq_end)
                    else:
                        seq_dna = fasta_open.fetch(seq_chrm, seq_start,
                                                   seq_end)

                    # fill out right side
                    if len(seq_dna) > 0:
                        n_requested = options.seq_length - len(seq_dna)
                        seq_dna += ''.join([
                            random.choice('ACGT') for i in range(n_requested)
                        ])

                    # verify length
                    assert (len(seq_dna) == options.seq_length)

                    # orient
                    if gene.strand == '-':
                        seq_dna = rc(seq_dna)

                    # one hot code
                    seq_1hot = dna_1hot(seq_dna)

                    # get targets
                    targets = expr_set_df.iloc[si].values
                    targets = targets.reshape((1, -1)).astype('float16')

                    # make example
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'sequence':
                            _bytes_feature(seq_1hot.flatten().tostring()),
                            'target':
                            _bytes_feature(targets.flatten().tostring())
                        }))

                    # write
                    writer.write(example.SerializeToString())

                    # advance indexes
                    ti += 1
                    si += 1

    fasta_open.close()

    ################################################################
    # stats

    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['train_seqs'] = len(train_index)
    stats_dict['valid_seqs'] = len(valid_index)
    stats_dict['test_seqs'] = len(test_index)
    stats_dict['seq_length'] = options.seq_length
    stats_dict['target_length'] = 1

    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
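
A hedged sketch of reading one of these TFRecords back (num_targets is hypothetical, shapes come from statistics.json, and dna_1hot is assumed to yield a bool array with one byte per element):

import tensorflow as tf

def parse_example(serialized, seq_length, num_targets):
    feats = {
        'sequence': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.string),
    }
    ex = tf.io.parse_single_example(serialized, feats)
    seq_1hot = tf.reshape(tf.io.decode_raw(ex['sequence'], tf.uint8),
                          (seq_length, 4))
    targets = tf.reshape(tf.io.decode_raw(ex['target'], tf.float16),
                         (1, num_targets))
    return seq_1hot, targets

dataset = tf.data.TFRecordDataset('genes_out/tfrecords/train-0.tfr',
                                  compression_type='ZLIB')
dataset = dataset.map(lambda s: parse_example(s, 65536, 10))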
Example #15
def main():
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-g',
                      dest='genome_index',
                      default=None,
                      type='int',
                      help='Genome index')
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te',
                      dest='target_extend',
                      default=None,
                      type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts',
        dest='target_start',
        default=0,
        type='int',
        help='Write targets into vector starting at index [Default: %default]')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    seqs_cov_files = []
    ti = 0
    if options.genome_index is None:
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    else:
        seqs_cov_file = '%s/%d-%d.h5' % (seqs_cov_dir, options.genome_index,
                                         ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        if options.genome_index is None:
            seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
        else:
            seqs_cov_file = '%s/%d-%d.h5' % (seqs_cov_dir,
                                             options.genome_index, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        exit(1)

    seq_pool_len_hic = h5py.File(seqs_cov_files[0], 'r')['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len_hic, num_targets_tfr),
                       dtype='float16')

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open['targets'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            if options.genome_index is None:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'genome':
                        _int_feature(0),
                        'sequence':
                        _bytes_feature(seq_1hot.flatten().tostring()),
                        'target':
                        _bytes_feature(targets[si, :, :].flatten().tostring())
                    }))
            else:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'genome':
                        _int_feature(options.genome_index),
                        'sequence':
                        _bytes_feature(seq_1hot.flatten().tostring()),
                        'target':
                        _bytes_feature(targets[si, :, :].flatten().tostring())
                    }))

            writer.write(example.SerializeToString())

        fasta_open.close()
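
_int_feature is not defined here either; the standard idiom it presumably follows:

import tensorflow as tf

def _int_feature(value):
    # wrap a single integer as a TFRecord Feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))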
Example #16
def parse_input(input_file, sample):
    """ Parse an input file that might be FASTA or HDF5. """

    try:
        # input_file is FASTA

        # read sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        # one hot code sequences
        seqs_1hot = []
        for seq in seqs:
            seqs_1hot.append(dna_io.dna_1hot(seq))
        seqs_1hot = np.array(seqs_1hot)

        # sample
        if sample:
            sample_i = np.array(
                random.sample(range(seqs_1hot.shape[0]), sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = seq_headers[sample_i]
            seqs = seqs[sample_i]

        # initialize targets variable
        targets = None

    except UnicodeDecodeError:
        # input_file is HDF5

        try:
            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            hdf5_in.close()

            # sample
            if sample:
                sample_i = np.array(
                    random.sample(range(seqs_1hot.shape[0]), sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]

            # convert to ACGT sequences
            seqs = dna_io.hot1_dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    return seqs, seqs_1hot, targets
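
The FASTA/HDF5 dispatch works because reading a binary HDF5 file as text raises UnicodeDecodeError. A hedged usage sketch (file name hypothetical):

seqs, seqs_1hot, targets = parse_input('test_seqs.fa', sample=0)
# FASTA input leaves targets as None; HDF5 input fills it from 'test_out'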
Example #17
def main():
    usage = (
        "usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>"
    )
    parser = OptionParser(usage)
    parser.add_option("-g",
                      dest="genome_index",
                      default=None,
                      type="int",
                      help="Genome index")
    parser.add_option(
        "-s",
        dest="start_i",
        default=0,
        type="int",
        help="Sequence start index [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="end_i",
        default=None,
        type="int",
        help="Sequence end index [Default: %default]",
    )
    parser.add_option(
        "--te",
        dest="target_extend",
        default=None,
        type="int",
        help="Extend targets vector [Default: %default]",
    )
    parser.add_option(
        "--ts",
        dest="target_start",
        default=0,
        type="int",
        help="Write targets into vector starting at index [Default: %default",
    )
    parser.add_option("-u",
                      dest="umap_npy",
                      help="Unmappable array numpy file")
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help=
        "Sequence distribution value to set unmappable positions to, eg 0.25.",
    )
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error("Must provide input arguments.")
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    seqs_cov_files = []
    ti = 0
    if options.genome_index is None:
        seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti)
    else:
        seqs_cov_file = "%s/%d-%d.h5" % (seqs_cov_dir, options.genome_index,
                                         ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        if options.genome_index is None:
            seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti)
        else:
            seqs_cov_file = "%s/%d-%d.h5" % (seqs_cov_dir,
                                             options.genome_index, ti)

    if len(seqs_cov_files) == 0:
        print(
            "Sequence coverage files not found, e.g. %s" % seqs_cov_file,
            file=sys.stderr,
        )
        exit(1)

    seq_pool_len = h5py.File(seqs_cov_files[0], "r")["seqs_cov"].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert options.target_extend >= num_targets_tfr
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype="float16")

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], "r")
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open["seqs_cov"][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # set unmappable positions to null
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            if options.genome_index is None:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        "genome":
                        _int_feature(0),
                        "sequence":
                        _bytes_feature(seq_1hot.flatten().tostring()),
                        "target":
                        _bytes_feature(targets[si, :, :].flatten().tostring()),
                    }))
            else:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        "genome":
                        _int_feature(options.genome_index),
                        "sequence":
                        _bytes_feature(seq_1hot.flatten().tostring()),
                        "target":
                        _bytes_feature(targets[si, :, :].flatten().tostring()),
                    }))

            writer.write(example.SerializeToString())

        fasta_open.close()
Example #18
def main():
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te',
                      dest='target_extend',
                      default=None,
                      type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts',
        dest='target_start',
        default=0,
        type='int',
        help='Write targets into vector starting at index [Default: %default]')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        exit(1)

    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype='float16')

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open['targets'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_clip < 1:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_clip],
                                            axis=0)[0]

            # set unmappable positions to null
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    elif options.umap_npy is not None and options.umap_tfr:
        unmap_mask = np.load(options.umap_npy)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)
            # seq_1hot = dna_1hot_index(seq_dna) # more efficient, but fighting inertia

            # hash to bytes
            features_dict = {
                'sequence': feature_bytes(seq_1hot),
                'target': feature_bytes(targets[si, :, :])
            }

            # add unmappability
            if options.umap_tfr:
                features_dict['umap'] = feature_bytes(unmap_mask[msi, :])

            # write example
            example = tf.train.Example(features=tf.train.Features(
                feature=features_dict))
            writer.write(example.SerializeToString())

        fasta_open.close()
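
feature_bytes is not shown; a hypothetical sketch consistent with its use above (serialize an array to raw bytes and wrap it as a TFRecord Feature):

import numpy as np
import tensorflow as tf

def feature_bytes(values):
    # hypothetical: flatten the array to raw bytes and wrap as a Feature
    values = np.asarray(values).flatten().tobytes()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))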