Example #1
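# Assumed imports for this snippet (inferred from usage; the internal module
# paths and the TensorFlow 1.x API reflect the code's vintage and may differ
# in the source repo). Helpers referenced below (limit_segments, segments_1hot,
# batch_end, bigwig_batch, w5_batch, annotate_na, latent_transform,
# fourier_transform) are defined elsewhere in the same script.
from collections import OrderedDict
from optparse import OptionParser
import multiprocessing
import random
import sys

import h5py
import joblib
import numpy as np
import tensorflow as tf

from basenji import autoencoder, dna_io, genome
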
def main():
    usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip target values to this minimum [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='fourier_dim',
                      default=None,
                      type='int',
                      help='Fourier transform dimension [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--log2',
        dest='log10to2',
        default=False,
        action='store_true',
        help='Transform values from log10 to log2 [Default: %default]')
    parser.add_option('-m',
                      dest='params_file',
                      help='Dimension reduction hyper-parameters file')
    parser.add_option(
        '--mult_cov',
        dest='cov_multiplier',
        default=1,
        type='float',
        help=
        'Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]'
    )
    parser.add_option(
        '-n',
        dest='na_t',
        default=0.25,
        type='float',
        help=
        'Remove sequences with an NA% greater than this threshold [Default: %default]'
    )
    parser.add_option(
        '--no_full',
        dest='no_full',
        default=False,
        action='store_true',
        help='Do not save full test sequence targets [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_bed_file',
        help='Output the train/valid/test sequences as a BED file')
    parser.add_option(
        '-p',
        dest='processes',
        default=1,
        type='int',
        help='Number of parallel processes to load data [Default: %default]')
    parser.add_option('-s',
                      dest='stride',
                      default=None,
                      type='int',
                      help='Stride to advance segments [Default: seq_length]')
    parser.add_option('--scent',
                      dest='scent_file',
                      help='Dimension reduction model file')
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        type='str',
        default='0.05',
        help='Proportion of the data for testing, or a test chromosome [Default: %default]')
    parser.add_option('-u',
                      dest='unmap_bed',
                      help='Unmappable segments to set to NA')
    parser.add_option('-w',
                      dest='pool_width',
                      type='int',
                      default=128,
                      help='Average pooling width [Default: %default]')
    parser.add_option(
        '--w5',
        dest='w5',
        default=False,
        action='store_true',
        help='Coverage files are w5 rather than BigWig [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        type='str',
        default='0.05',
        help='Proportion of the data for validation, or a validation chromosome [Default: %default]')
    parser.add_option('-z',
                      dest='compression',
                      help='h5py compression [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide genome FASTA file, sample Wig/BigWig labels and paths, '
            'and output HDF5 file')
    else:
        fasta_file = args[0]
        sample_wigs_file = args[1]
        hdf5_file = args[2]

    random.seed(1)

    if options.stride is None:
        options.stride = options.seq_length

    ################################################################
    # assess bigwigs
    ################################################################
    # get wig files and labels
    target_wigs = OrderedDict()
    target_strands = []
    target_labels = []
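    # expected samples-file format (inferred from the parsing below):
    # tab-delimited <target_id> <wig_path> [strand (+/-/*)] [label]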
    for line in open(sample_wigs_file, encoding='UTF-8'):
        a = line.rstrip().split('\t')
        target_wigs[a[0]] = a[1]
        if len(a) > 2:
            target_strands.append(a[2])
        else:
            target_strands.append('*')
        if len(a) > 3:
            target_labels.append(a[3])
        else:
            target_labels.append('')

    if options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width:
        print(
            "Fourier transform to %d dims won't compress %d length sequences with %d pooling"
            % (options.fourier_dim, options.seq_length, options.pool_width),
            file=sys.stderr)
        exit(1)
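    # (the pooled profile has L = seq_length / pool_width values, and an rFFT
    # keeps L//2 + 1 complex coefficients; storing fourier_dim of them as real
    # and imaginary parts only compresses when 2*fourier_dim < L, e.g.
    # 131072 / 128 = 1024 pooled values require fourier_dim < 512)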

    ################################################################
    # prepare genomic segments
    ################################################################
    chrom_segments = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_segments = genome.split_contigs(chrom_segments,
                                              options.gaps_file)

    # ditch the chromosomes
    segments = []
    for chrom in chrom_segments:
        segments += [(chrom, seg_start, seg_end)
                     for seg_start, seg_end in chrom_segments[chrom]]

    # standardize order
    segments.sort()

    # filter for large enough
    segments = [
        cse for cse in segments if cse[2] - cse[1] >= options.seq_length
    ]

    # down-sample
    if options.sample_pct < 1.0:
        segments = random.sample(segments,
                                 int(options.sample_pct * len(segments)))

    # limit to a BED file
    if options.limit_bed is not None:
        segments = limit_segments(segments, options.limit_bed)

    ################################################################
    # one hot code sequences
    ################################################################
    seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                             options.seq_length,
                                             options.stride)
    print('%d sequences one hot coded' % seqs_1hot.shape[0])

    ################################################################
    # load model
    ################################################################
    if options.params_file:
        job = dna_io.read_job_params(options.params_file)
        job['num_targets'] = len(target_wigs)
        job['batch_size'] = 1024
        job['model'] = job.get('model', 'autoencoder')

        if job['model'] == 'autoencoder':
            model = autoencoder.AE(job)
            saver = tf.train.Saver()
        else:
            model = joblib.load(options.scent_file)

    ################################################################
    # bigwig read and process
    ################################################################
    print('Reading and pre-processing bigwigs for %d segments' % len(segments),
          flush=True)

    targets_real = []
    targets_imag = []

    include_indexes = []
    include_marker = 0

    targets_test = []
    test_indexes = []
    test_marker = 0

    ssi = 0

    # initialize multiprocessing pool
    pool = multiprocessing.Pool(options.processes)

    with tf.Session() as sess:
        if options.scent_file and job['model'] == 'autoencoder':
            saver.restore(sess, options.scent_file)

        # batch segment processing
        bstart = 0
        while bstart < len(segments):
            # progress update per batch
            print('Tiling from %s:%d-%d' % segments[bstart], flush=True)

            # determine batch end
            bend = batch_end(segments, bstart, 400000)

            # bigwig_read parameters
            bwr_params = [(wig_file, segments[bstart:bend], options.seq_length,
                           options.pool_width, options.stride,
                           options.log10to2, options.cov_multiplier)
                          for wig_file in target_wigs.values()]

            # pull the target values in parallel
            if options.w5:
                wig_targets = pool.starmap(w5_batch, bwr_params)
            else:
                wig_targets = pool.starmap(bigwig_batch, bwr_params)

            # transpose to S x L x T (making a copy?)
            targets_wig = np.transpose(np.array(wig_targets), axes=(1, 2, 0))

            # clip
            if options.clip is not None:
                targets_wig = targets_wig.clip(options.clip)

            # sample indexes from this batch
            if options.test_pct_or_chr.startswith('chr'):
                test_bindexes = [
                    twi for twi in range(targets_wig.shape[0])
                    if seqs_segments[ssi + twi][0] == options.test_pct_or_chr
                ]
            else:
                test_pct = float(options.test_pct_or_chr)
                test_bindexes = [
                    twi for twi in range(targets_wig.shape[0])
                    if random.random() < test_pct
                ]

            # capture test indexes
            test_indexes += [test_marker + tbi for tbi in test_bindexes]

            # update test marker
            test_marker += targets_wig.shape[0]

            # save the full test targets
            if not options.no_full:
                targets_test.append(targets_wig[test_bindexes])

            # map to latent space
            if options.scent_file is None:
                targets_latent = targets_wig
            else:
                targets_latent = latent_transform(sess, model, job,
                                                  targets_wig)

            # compress across length
            if options.fourier_dim is None:
                targets_rfour = targets_latent
                targets_ifour = None
            else:
                targets_rfour, targets_ifour = fourier_transform(
                    targets_latent, options.fourier_dim)

            # save
            targets_real.append(targets_rfour)
            targets_imag.append(targets_ifour)

            # update seqs_segments index
            ssi += targets_wig.shape[0]

            # update batch
            bstart = bend

    pool.close()

    # stack arrays
    targets_real = np.vstack(targets_real)
    if options.fourier_dim is not None:
        targets_imag = np.vstack(targets_imag)
    if not options.no_full:
        targets_test = np.vstack(targets_test)

    print('%d target sequences' % targets_real.shape[0])

    ################################################################
    # correct for unmappable regions
    ################################################################
    if options.unmap_bed is not None:
        seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                              options.seq_length, options.pool_width)

        # determine mappable sequences and update test indexes
        map_indexes = []
        test_indexes_set = set(test_indexes)
        print('test_indexes', len(test_indexes))
        test_indexes_na = []
        new_i = 0

        for old_i in range(seqs_na.shape[0]):
            # mappable
            if seqs_na[old_i, :].mean(dtype='float64') < options.na_t:
                map_indexes.append(old_i)

                if old_i in test_indexes_set:
                    test_indexes_na.append(new_i)

                new_i += 1

            # unmappable
            else:
                # forget it
                pass

        # update data structures
        targets_real = targets_real[map_indexes]
        if options.fourier_dim is not None:
            targets_imag = targets_imag[map_indexes]

        seqs_1hot = seqs_1hot[map_indexes]
        seqs_segments = [seqs_segments[mi] for mi in map_indexes]
        seqs_na = seqs_na[map_indexes]

        test_indexes = test_indexes_na
        print('test_indexes', len(test_indexes))

    ################################################################
    # write to train, valid, test HDF5
    ################################################################

    if options.valid_pct_or_chr.startswith('chr'):
        # sample valid chromosome
        valid_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.valid_pct_or_chr
        ]

    else:
        # sample valid indexes (we already have test)
        valid_pct = float(options.valid_pct_or_chr)
        valid_n = int(valid_pct * targets_real.shape[0])
        nontest_indexes = set(range(targets_real.shape[0])) - set(test_indexes)
        # sample from a sorted list; random.sample on a set is unsupported in
        # newer Python, and sorting keeps the draw reproducible
        valid_indexes = random.sample(sorted(nontest_indexes), valid_n)

    # remainder is training
    train_indexes = list(
        set(range(len(seqs_segments))) - set(valid_indexes) -
        set(test_indexes))

    # shuffle within each set (sort first so the shuffles are reproducible
    # given the seed)
    train_indexes = sorted(train_indexes)
    valid_indexes = sorted(valid_indexes)
    test_indexes = sorted(test_indexes)
    random.shuffle(train_indexes)
    random.shuffle(valid_indexes)
    random.shuffle(test_indexes)

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, 'w')

    # store pooling
    hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int')

    # store targets
    target_ids = np.array(list(target_wigs.keys()), dtype='S')
    hdf5_out.create_dataset('target_ids', data=target_ids)

    target_labels = np.array(target_labels, dtype='S')
    hdf5_out.create_dataset('target_labels', data=target_labels)

    target_strands = np.array(target_strands, dtype='S')
    hdf5_out.create_dataset('target_strands', data=target_strands)

    # HDF5 train
    hdf5_out.create_dataset('train_in',
                            data=seqs_1hot[train_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('train_out',
                            data=targets_real[train_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('train_out_imag',
                                data=targets_imag[train_indexes],
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('train_na',
                                data=seqs_na[train_indexes],
                                dtype='bool',
                                compression=options.compression)

    # HDF5 valid
    hdf5_out.create_dataset('valid_in',
                            data=seqs_1hot[valid_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('valid_out',
                            data=targets_real[valid_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('valid_out_imag',
                                data=targets_imag[valid_indexes],
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('valid_na',
                                data=seqs_na[valid_indexes],
                                dtype='bool',
                                compression=options.compression)

    # HDF5 test
    hdf5_out.create_dataset('test_in',
                            data=seqs_1hot[test_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('test_out',
                            data=targets_real[test_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('test_out_imag',
                                data=targets_imag[test_indexes],
                                dtype='float16',
                                compression=options.compression)
    if not options.no_full:
        hdf5_out.create_dataset('test_out_full',
                                data=targets_test,
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('test_na',
                                data=seqs_na[test_indexes],
                                dtype='bool',
                                compression=options.compression)

    hdf5_out.close()

    # output BED file
    if options.out_bed_file:
        out_bed_out = open(options.out_bed_file, 'w')
        for si in train_indexes:
            print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out)
        for si in valid_indexes:
            print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out)
        for si in test_indexes:
            print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out)
        out_bed_out.close()
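

# A minimal sketch (an assumption, not the source implementation) of the
# fourier_transform helper used above: keep the first fourier_dim rFFT
# coefficients along the length axis and return their real and imaginary
# parts, which the script stores as separate target arrays.
def fourier_transform(targets_latent, fourier_dim):
    # targets_latent is S x L x T; rfft along the length axis yields
    # S x (L//2 + 1) x T complex coefficients, truncated to fourier_dim
    fourier = np.fft.rfft(targets_latent, axis=1)[:, :fourier_dim, :]
    return fourier.real, fourier.imag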
Example #2
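# Assumed imports and record types for this snippet (inferred from usage;
# internal module paths may differ in the source repo). Helpers referenced
# below (curate_peaks, limit_contigs, break_large_contigs,
# rejoin_large_contigs, divide_contigs_folds, divide_contigs_pct,
# divide_contigs_chr, contig_sequences, annotate_unmap, write_seqs_bed) are
# defined elsewhere in the same script.
import collections
import json
import os
import random
import shutil
import sys
from optparse import OptionParser

import numpy as np
import pandas as pd

from basenji import genome
import slurm
import util

Contig = collections.namedtuple('Contig', ['chr', 'start', 'end'])
ModelSeq = collections.namedtuple('ModelSeq', ['chr', 'start', 'end', 'label'])
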
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=786432,
        type='int',
        help='Break contigs above this length in half [Default: %default]')
    parser.add_option('-c',
                      '--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='folds',
                      default=None,
                      type='int',
                      help='Generate cross fold split [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-i',
                      dest='interp_nan',
                      default=False,
                      action='store_true',
                      help='Interpolate NaNs [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of parallel processes [Default: %default]')
    parser.add_option(
        '--peaks',
        dest='peaks_only',
        default=False,
        action='store_true',
        help='Create contigs only from peaks [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '--restart',
        dest='restart',
        default=False,
        action='store_true',
        help='Skip already read HDF5 coverage values. [Default: %default]')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--snap',
        dest='snap',
        default=1,
        type='int',
        help='Snap sequences to a multiple of the given value [Default: %default]'
    )
    parser.add_option('--st',
                      '--split_test',
                      dest='split_test',
                      default=False,
                      action='store_true',
                      help='Exit after split. [Default: %default]')
    parser.add_option(
        '--stride',
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default='0.05',
        type='str',
        help='Proportion of the data for testing, or comma-separated test chromosomes [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.5,
        type='float',
        help=
        'Remove sequences with an unmappable bin fraction above this threshold [Default: %default]'
    )
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, e.g. 0.25 [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default='0.05',
        type='str',
        help='Proportion of the data for validation, or comma-separated validation chromosomes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide FASTA and sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print('stride_train %g' % options.stride_train, end='')
        options.stride_train = options.stride_train * options.seq_length
        print(' converted to %f' % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        if options.folds is None:
            print('stride_test %g' % options.stride_test, end='')
            options.stride_test = options.stride_test * options.seq_length
            print(' converted to %f' % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))
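    # e.g., with the defaults, stride_train=1.0 and seq_length=131072 give a
    # non-overlapping 131072 bp stride, while --stride 0.5 would advance
    # train sequences by 65536 bp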

    # check snap
    if options.snap is not None:
        if np.mod(options.seq_length, options.snap) != 0:
            raise ValueError('seq_length must be a multiple of snap')
        if np.mod(options.stride_train, options.snap) != 0:
            raise ValueError('stride_train must be a multiple of snap')
        if np.mod(options.stride_test, options.snap) != 0:
            raise ValueError('stride_test must be a multiple of snap')

    # setup output directory
    if os.path.isdir(options.out_dir) and not options.restart:
        print('Remove output directory %s or use --restart option.' %
              options.out_dir)
        exit(1)
    elif not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

    ################################################################
    # define genomic contigs
    ################################################################
    if not options.restart:
        chrom_contigs = genome.load_chromosomes(fasta_file)

        # remove gaps
        if options.gaps_file:
            chrom_contigs = genome.split_contigs(chrom_contigs,
                                                 options.gaps_file)

        # ditch the chromosomes for contigs
        contigs = []
        for chrom in chrom_contigs:
            contigs += [
                Contig(chrom, ctg_start, ctg_end)
                for ctg_start, ctg_end in chrom_contigs[chrom]
            ]

        # limit to a BED file
        if options.limit_bed is not None:
            contigs = limit_contigs(contigs, options.limit_bed)

        # limit to peaks
        if options.peaks_only:
            peaks_bed = curate_peaks(targets_df, options.out_dir,
                                     options.pool_width, options.crop_bp)
            contigs = limit_contigs(contigs, peaks_bed)

        # filter for large enough
        contigs = [
            ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
        ]

        # break up large contigs
        if options.break_t is not None:
            contigs = break_large_contigs(contigs, options.break_t)

        # print contigs to BED file
        # ctg_bed_file = '%s/contigs.bed' % options.out_dir
        # write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    # label folds
    if options.folds is not None:
        fold_labels = ['fold%d' % fi for fi in range(options.folds)]
        num_folds = options.folds
    else:
        fold_labels = ['train', 'valid', 'test']
        num_folds = 3

    if not options.restart:
        if options.folds is not None:
            # divide by fold pct
            fold_contigs = divide_contigs_folds(contigs, options.folds)

        else:
            try:
                # convert to float pct
                valid_pct = float(options.valid_pct_or_chr)
                test_pct = float(options.test_pct_or_chr)
                assert (0 <= valid_pct <= 1)
                assert (0 <= test_pct <= 1)

                # divide by pct
                fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

            except (ValueError, AssertionError):
                # divide by chr
                valid_chrs = options.valid_pct_or_chr.split(',')
                test_chrs = options.test_pct_or_chr.split(',')
                fold_contigs = divide_contigs_chr(contigs, test_chrs,
                                                  valid_chrs)

        # rejoin broken contigs within set
        for fi in range(len(fold_contigs)):
            fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

        # write labeled contigs to BED file
        ctg_bed_file = '%s/contigs.bed' % options.out_dir
        ctg_bed_out = open(ctg_bed_file, 'w')
        for fi in range(len(fold_contigs)):
            for ctg in fold_contigs[fi]:
                line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end,
                                           fold_labels[fi])
                print(line, file=ctg_bed_out)
        ctg_bed_out.close()

    if options.split_test:
        exit()

    ################################################################
    # define model sequences
    ################################################################
    if not options.restart:

        fold_mseqs = []
        for fi in range(num_folds):
            if fold_labels[fi] in ['valid', 'test']:
                stride_fold = options.stride_test
            else:
                stride_fold = options.stride_train

            # stride sequences across contig
            fold_mseqs_fi = contig_sequences(fold_contigs[fi],
                                             options.seq_length, stride_fold,
                                             options.snap, fold_labels[fi])
            fold_mseqs.append(fold_mseqs_fi)

            # shuffle
            random.shuffle(fold_mseqs[fi])

            # down-sample
            if options.sample_pct < 1.0:
                fold_mseqs[fi] = random.sample(
                    fold_mseqs[fi],
                    int(options.sample_pct * len(fold_mseqs[fi])))

        # merge into one list
        mseqs = [ms for fm in fold_mseqs for ms in fm]

    ################################################################
    # mappability
    ################################################################
    if not options.restart:
        if options.umap_bed is not None:
            if shutil.which('bedtools') is None:
                print('Install Bedtools to annotate unmappable sites',
                      file=sys.stderr)
                exit(1)

            # annotate unmappable positions
            mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                         options.seq_length,
                                         options.pool_width, options.crop_bp)

            # filter unmappable
            mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                              options.umap_t)
            mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
            mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

            # write to file
            unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
            np.save(unmap_npy, mseqs_unmap)

        # write sequences to BED
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        write_seqs_bed(seqs_bed_file, mseqs, True)

    else:
        # read from directory
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        mseqs = []
        fold_mseqs = []
        for fi in range(num_folds):
            fold_mseqs.append([])
        for line in open(seqs_bed_file):
            a = line.split()
            msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
            mseqs.append(msg)
            if a[3] == 'train':
                fi = 0
            elif a[3] == 'valid':
                fi = 1
            elif a[3] == 'test':
                fi = 2
            else:
                fi = int(a[3].replace('fold', ''))
            fold_mseqs[fi].append(msg)

    ################################################################
    # read sequence coverage values
    ################################################################
    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        clipsoft_ti = None
        if 'clip_soft' in targets_df.columns:
            clipsoft_ti = targets_df['clip_soft'].iloc[ti]

        scale_ti = 1
        if 'scale' in targets_df.columns:
            scale_ti = targets_df['scale'].iloc[ti]

        if options.restart and os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = 'basenji_data_read.py'
            cmd += ' --crop %d' % options.crop_bp
            cmd += ' -w %d' % options.pool_width
            cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
            if clip_ti is not None:
                cmd += ' -c %f' % clip_ti
            if clipsoft_ti is not None:
                cmd += ' --clip_soft %f' % clipsoft_ti
            cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            if options.interp_nan:
                cmd += ' -i'
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                # breaks on some OSes
                # cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for fold_set in fold_labels:
        fold_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == fold_set
        ]
        fold_set_start = fold_set_indexes[0]
        fold_set_end = fold_set_indexes[-1] + 1
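        # contiguous index ranges are safe here: mseqs concatenates fold_mseqs
        # in fold order (and the restart branch reads sequences back in the
        # same per-fold order from the BED file)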

        tfr_i = 0
        tfr_start = fold_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

        while tfr_start < fold_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

            cmd = 'basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end
            cmd += ' --umap_clip %f' % options.umap_clip
            if options.umap_tfr:
                cmd += ' --umap_tfr'
            if options.umap_bed is not None:
                cmd += ' -u %s' % unmap_npy

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                # breaks on some OSes
                # cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (fold_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # stats
    ################################################################
    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['seq_length'] = options.seq_length
    stats_dict['pool_width'] = options.pool_width
    stats_dict['crop_bp'] = options.crop_bp

    target_length = options.seq_length - 2 * options.crop_bp
    target_length = target_length // options.pool_width
    stats_dict['target_length'] = target_length
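    # e.g., with the defaults (seq_length=131072, crop_bp=0, pool_width=128),
    # target_length = 131072 // 128 = 1024 bins per sequence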

    for fi in range(num_folds):
        stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
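

# A minimal sketch (an assumption, not the source implementation) of the
# write_seqs_bed helper used above: one BED line per sequence, with the
# train/valid/test or fold label as an optional fourth column.
def write_seqs_bed(bed_file, seqs, labels=False):
    bed_out = open(bed_file, 'w')
    for seq in seqs:
        line = '%s\t%d\t%d' % (seq.chr, seq.start, seq.end)
        if labels:
            line += '\t%s' % seq.label
        print(line, file=bed_out)
    bed_out.close()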
Example #3
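# Assumed imports match Example #2 (optparse, os, random, shutil, sys, numpy,
# pandas, basenji.genome, slurm, util) along with the Contig and ModelSeq
# namedtuples; helper functions are again defined elsewhere in the script.
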
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=786432,
        type='int',
        help='Break contigs above this length in half [Default: %default]')
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of parallel processes [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the excess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default='0.05',
        type='str',
        help='Proportion of the data for testing, or a test chromosome [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.3,
        type='float',
        help=
        'Remove sequences with more than this unmappable bin % [Default: %default]'
    )
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Set unmappable regions to this percentile in the sequences\' distribution of values'
    )
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default='0.05',
        type='str',
        help='Proportion of the data for validation, or a validation chromosome [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide FASTA and sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.stride_train <= 0 or options.stride_train > 1:
        parser.error('Train stride %f must be in (0, 1]' %
                     options.stride_train)

    if options.stride_test <= 0 or options.stride_test > 1:
        parser.error('Test stride %f must be in (0, 1]' % options.stride_test)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [
        ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
    ]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chr = options.valid_pct_or_chr
        test_chr = options.test_pct_or_chr
        contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(train_contigs,
                                   options.seq_length,
                                   options.stride_train,
                                   label='train')
    valid_mseqs = contig_sequences(valid_contigs,
                                   options.seq_length,
                                   options.stride_test,
                                   label='valid')
    test_mseqs = contig_sequences(test_contigs,
                                  options.seq_length,
                                  options.stride_test,
                                  label='test')

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if options.umap_bed is not None:
        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                          options.umap_t)
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # down-sample
    if options.sample_pct < 1.0:
        mseqs = random.sample(mseqs, int(options.sample_pct * len(mseqs)))

    # write sequences to BED
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_table(targets_file, index_col=0)

    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        scale_ti = 1
        if 'scale' in targets_df.columns:
            scale_ti = targets_df['scale'].iloc[ti]

        if os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = 'basenji_data_read.py'
            cmd += ' -w %d' % options.pool_width
            cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
            if clip_ti is not None:
                cmd += ' -c %f' % clip_ti
            if options.soft_clip:
                cmd += ' --soft'
            cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard,tbdisk',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ['train', 'valid', 'test']:
        tvt_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
        ]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start < tvt_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

            cmd = 'basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end
            if options.umap_bed is not None:
                cmd += ' -u %s' % unmap_npy
            if options.umap_set is not None:
                cmd += ' --umap_set %f' % options.umap_set

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (tvt_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard,tbdisk',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)
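

# A minimal sketch (an assumption) of contig_sequences as called in this
# example: tile seq_length windows across each contig, advancing by the
# fractional stride times seq_length (this script keeps strides as
# proportions rather than pre-converting them to base pairs).
def contig_sequences(contigs, seq_length, stride, label=None):
    mseqs = []
    for ctg in contigs:
        seq_start = ctg.start
        seq_end = seq_start + seq_length
        while seq_end <= ctg.end:
            mseqs.append(ModelSeq(ctg.chr, seq_start, seq_end, label))
            seq_start += int(stride * seq_length)
            seq_end = seq_start + seq_length
    return mseqs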
Example #4
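# Assumed imports mirror Example #2, with a genome-aware contig record such as
# Contig = collections.namedtuple('Contig', ['genome', 'chr', 'start', 'end'])
# to match the Contig(gi, chrom, ctg_start, ctg_end) construction below.
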
def main():
  usage = 'usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='align_net',
      help='Alignment .net file')
  parser.add_option('-b', dest='blacklist_beds',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t',
      default=None, type='int',
      help='Break contigs above this length in half [Default: %default]')
  parser.add_option('-c','--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gap_files',
      help='Comma-separated list of assembly gaps BED files [Default: %default]')
  parser.add_option('-i', dest='interp_nan',
      default=False, action='store_true',
      help='Interpolate NaNs [Default: %default]') 
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-n', dest='net_fill_min',
      default=100000, type='int',
      help='Alignment net fill size minimum [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart',
      default=False, action='store_true',
      help='Skip already read HDF5 coverage values. [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap',
      default=None, type='int',
      help='Snap sequences to a multiple of the given value [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('--soft', dest='soft_clip',
      default=False, action='store_true',
      help='Soft clip values, applying sqrt to the excess above the threshold [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.1, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_beds',
      help='Comma-separated genome unmappable segments to set to NA')
  parser.add_option('--umap_t', dest='umap_t',
      default=0.5, type='float',
      help='Remove sequences with an unmappable bin fraction above this threshold [Default: %default]')
  parser.add_option('--umap_clip', dest='umap_clip',
      default=None, type='float',
      help='Clip unmappable regions to this percentile in the sequences\' distribution of values')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.1, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage label and path files for two genomes.')
  else:
    fasta_files = args[0].split(',')
    targets_file = args[1]

  # there is still some source of stochasticity
  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %g' % options.stride_train, end='')
    options.stride_train = options.stride_train*options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    print('stride_test %g' % options.stride_test, end='')
    options.stride_test = options.stride_test*options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0: 
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0: 
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.gap_files is not None:
    options.gap_files = options.gap_files.split(',')

  if options.blacklist_beds is not None:
    options.blacklist_beds = options.blacklist_beds.split(',')

  # read targets
  targets_df = pd.read_table(targets_file, index_col=0)

  # verify genomes
  num_genomes = len(fasta_files)
  assert(len(set(targets_df.genome)) == num_genomes)
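
  # the targets file must carry a 'genome' column assigning each target to one
  # of the FASTA files (e.g., 0 for the first genome, 1 for the second)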

  ################################################################
  # define genomic contigs
  ################################################################
  genome_chr_contigs = []
  for gi in range(num_genomes):
    genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

    # remove gaps
    if options.gap_files is not None and options.gap_files[gi]:
      genome_chr_contigs[gi] = genome.split_contigs(genome_chr_contigs[gi],
                                                    options.gap_files[gi])

  # ditch the chromosomes
  contigs = []
  for gi in range(num_genomes):
    for chrom in genome_chr_contigs[gi]:
      contigs += [Contig(gi, chrom, ctg_start, ctg_end)
                  for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]]

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  for gi in range(num_genomes):
    contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
    ctg_bed_file = '%s/contigs%d.bed' % (options.out_dir, gi)
    write_seqs_bed(ctg_bed_file, contigs_i)

  ################################################################
  # divide between train/valid/test
  ################################################################

  # connect contigs across genomes by alignment
  contig_components = connect_contigs(contigs, options.align_net, options.net_fill_min, options.out_dir)

  # divide contig connected components between train/valid/test
  contig_sets = divide_contig_components(contig_components, options.test_pct, options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  # rejoin broken contigs within set
  train_contigs = rejoin_large_contigs(train_contigs)
  valid_contigs = rejoin_large_contigs(valid_contigs)
  test_contigs = rejoin_large_contigs(test_contigs)

  # quantify leakage across sets
  quantify_leakage(options.align_net, train_contigs, valid_contigs, test_contigs, options.out_dir)

  ################################################################
  # define model sequences
  ################################################################

  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length,
                                 options.stride_train, options.snap, 'train')
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length,
                                 options.stride_test, options.snap, 'valid')
  test_mseqs = contig_sequences(test_contigs, options.seq_length,
                                options.stride_test, options.snap, 'test')

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # down-sample
  if options.sample_pct < 1.0:
    train_mseqs = random.sample(train_mseqs, int(options.sample_pct*len(train_mseqs)))
    valid_mseqs = random.sample(valid_mseqs, int(options.sample_pct*len(valid_mseqs)))
    test_mseqs = random.sample(test_mseqs, int(options.sample_pct*len(test_mseqs)))

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs

  ################################################################
  # separate sequences by genome
  ################################################################
  mseqs_genome = []
  for gi in range(num_genomes):
    mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
    mseqs_genome.append(mseqs_gi)

  ################################################################
  # mappability
  ################################################################

  if options.umap_beds is not None:
    options.umap_beds = options.umap_beds.split(',')
  else:
    # avoid indexing None below when no unmappable BEDs are given
    options.umap_beds = [None] * num_genomes
  unmap_npys = [None] * num_genomes

  for gi in range(num_genomes):
    if options.umap_beds[gi] is not None:
      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs_genome[gi], options.umap_beds[gi],
                                   options.seq_length, options.pool_width)

      # filter unmappable
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs_genome[gi] = [mseqs_genome[gi][si] for si in range(len(mseqs_genome[gi])) if mseqs_map_mask[si]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

      # write to file
      unmap_npys[gi] = '%s/mseqs%d_unmap.npy' % (options.out_dir, gi)
      np.save(unmap_npys[gi], mseqs_unmap)

  seqs_bed_files = []
  for gi in range(num_genomes):
    # write sequences to BED
    seqs_bed_files.append('%s/sequences%d.bed' % (options.out_dir, gi))
    write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)

  ################################################################
  # read sequence coverage values
  ################################################################
  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for gi in range(num_genomes):
    read_jobs += make_read_jobs(seqs_bed_files[gi], targets_df,
                                gi, seqs_cov_dir, options)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################

  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  # set genome target index starts
  sum_targets = 0
  genome_targets_start = []
  for gi in range(num_genomes):
    genome_targets_start.append(sum_targets)
    targets_df_gi = targets_df[targets_df.genome == gi]
    sum_targets += targets_df_gi.shape[0]

  write_jobs = []
  for gi in range(num_genomes):
    write_jobs += make_write_jobs(mseqs_genome[gi], fasta_files[gi], seqs_bed_files[gi],
                                  seqs_cov_dir, tfr_dir, gi, unmap_npys[gi],
                                  genome_targets_start[gi], sum_targets, options)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  # stats_dict['num_targets'] = targets_df.shape[0]
  # stats_dict['train_seqs'] = len(train_mseqs)
  # stats_dict['valid_seqs'] = len(valid_mseqs)
  # stats_dict['test_seqs'] = len(test_mseqs)
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp

  # target vector length: crop both ends of the sequence, then bin by pool_width
  target_length = options.seq_length - 2*options.crop_bp
  target_length = target_length // options.pool_width
  stats_dict['target_length'] = target_length

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
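Note: write_seqs_bed is defined outside these examples. A minimal sketch of the behavior the call sites above assume (the chrom/start/end and label attribute names are inferred from the Contig constructor calls, not taken from the original implementation):

def write_seqs_bed(bed_file, seqs, labels=False):
    # Sketch only: write contig/sequence records as BED lines; when
    # labels=True, append each record's train/valid/test label as a
    # 4th column, matching how the call sites above use it.
    with open(bed_file, 'w') as bed_out:
        for seq in seqs:
            line = '%s\t%d\t%d' % (seq.chrom, seq.start, seq.end)
            if labels:
                line += '\t%s' % seq.label
            print(line, file=bed_out)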
Example #5
0
def main():
    usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip target values to have minimum [Default: %default]')
    parser.add_option('--cluster_dir',
                      dest='cluster_dir',
                      default='basenji_hdf5')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='fourier_dim',
                      default=None,
                      type='int',
                      help='Fourier transform dimension [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=1024,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--log2',
        dest='log10to2',
        default=False,
        action='store_true',
        help='Transform values from log10 to log2 [Default: %default]')
    parser.add_option(
        '--mult_cov',
        dest='cov_multiplier',
        default=1,
        type='float',
        help=
        'Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]'
    )
    parser.add_option(
        '-n',
        dest='na_t',
        default=0.25,
        type='float',
        help=
        'Remove sequences with an NA% greater than this threshold [Default: %default]'
    )
    parser.add_option(
        '-o',
        dest='out_bed_file',
        help='Output the train/valid/test sequences as a BED file')
    parser.add_option(
        '-p',
        dest='processes',
        default=1,
        type='int',
        help='Number parallel processes to load data [Default: %default]')
    parser.add_option('-s',
                      dest='stride',
                      type='int',
                      help='Stride to advance segments [Default: seq_length]')
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        type='str',
        default='0.05',
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-u',
                      dest='unmap_bed',
                      help='Unmappable segments to set to NA')
    parser.add_option('-w',
                      dest='pool_width',
                      type='int',
                      default=1,
                      help='Average pooling width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        type='str',
        default='0.05',
        help='Proportion of the data for validation [Default: %default]')
    parser.add_option('-z',
                      dest='compression',
                      help='h5py compression [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide genome FASTA file, sample Wig/BigWig labels and paths, '
            'and model output file')
    else:
        fasta_file = args[0]
        sample_wigs_file = args[1]
        hdf5_file = args[2]

    random.seed(1)

    if options.stride is None:
        options.stride = options.seq_length

    ################################################################
    # assess bigwigs
    ################################################################
    # get wig files and labels
    target_wigs = OrderedDict()
    target_strands = []
    target_labels = []
    for line in open(sample_wigs_file, encoding='UTF-8'):
        a = line.rstrip().split('\t')

        if a[0] in target_wigs:
            print('WARNING: duplicate target id %s' % a[0], file=sys.stderr)

        target_wigs[a[0]] = a[1]
        target_strands.append(a[2])
        if len(a) > 3:
            target_labels.append(a[3])
        else:
            target_labels.append('')

    # 2*fourier_dim coefficients (real + imaginary) must stay below the
    # pooled sequence length, or the Fourier transform would not compress
    if options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width:
        print(
            "Fourier transform to %d dims won't compress %d length sequences with %d pooling"
            % (options.fourier_dim, options.seq_length, options.pool_width),
            file=sys.stderr)
        exit(1)

    ################################################################
    # prepare genomic segments
    ################################################################
    chrom_segments = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_segments = genome.split_contigs(chrom_segments,
                                              options.gaps_file)

    # ditch the chromosomes
    segments = []
    for chrom in chrom_segments:
        segments += [(chrom, seg_start, seg_end)
                     for seg_start, seg_end in chrom_segments[chrom]]

    # standardize order
    segments.sort()

    # filter for large enough
    segments = [
        cse for cse in segments if cse[2] - cse[1] >= options.seq_length
    ]

    # down-sample
    if options.sample_pct < 1.0:
        segments = random.sample(segments,
                                 int(options.sample_pct * len(segments)))

    # limit to a BED file
    if options.limit_bed is not None:
        segments = limit_segments(segments, options.limit_bed)

    if not os.path.isdir(options.cluster_dir):
        os.mkdir(options.cluster_dir)

    # print segments to BED file
    seg_bed_file = '%s/segments.bed' % options.cluster_dir
    seg_bed_out = open(seg_bed_file, 'w')
    for chrom, seg_start, seg_end in segments:
        print('%s\t%d\t%d' % (chrom, seg_start, seg_end), file=seg_bed_out)
    seg_bed_out.close()

    ################################################################
    # bigwig read and process
    ################################################################
    print('Reading and pre-processing bigwigs for %d segments' % len(segments),
          flush=True)

    targets_real = []
    targets_imag = []

    # generate numpy arrays on cluster
    jobs = []
    for target_label in target_wigs.keys():
        wig_file = target_wigs[target_label]
        npy_file = '%s/%s' % (options.cluster_dir, target_label)
        if not os.path.isfile(npy_file) and not os.path.isfile(
                '%s.npy' % npy_file):
            print(npy_file)

            if os.path.splitext(wig_file)[1] == '.h5':
                script = 'seqs_hdf5.py'
            else:
                script = 'bigwig_hdf5.py'

            cmd = 'echo $HOSTNAME; %s -l %d -s %d -w %d %s %s %s' % (
                script, options.seq_length, options.stride, options.pool_width,
                wig_file, seg_bed_file, npy_file)
            name = 'hdf5_%s' % target_label
            outf = '%s/%s.out' % (options.cluster_dir, target_label)
            errf = '%s/%s.err' % (options.cluster_dir, target_label)
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue='standard,tbdisk',
                          mem=15000,
                          time='12:0:0')
            jobs.append(j)

    slurm.multi_run(jobs)

    # load into targets_real, targets_imag
    for target_label in target_wigs.keys():
        npy_file = '%s/%s.npy' % (options.cluster_dir, target_label)
        wig_targets = np.load(npy_file)
        targets_real.append(wig_targets)

    # transpose from TxSxL to SxLxT
    targets_real = np.transpose(np.array(targets_real), axes=(1, 2, 0))

    print('%d target sequences' % targets_real.shape[0])

    ################################################################
    # one hot code sequences
    ################################################################
    seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                             options.seq_length,
                                             options.stride)
    print('%d sequences one hot coded' % seqs_1hot.shape[0])

    ################################################################
    # correct for unmappable regions
    ################################################################
    if options.unmap_bed is not None:
        seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                              options.seq_length, options.pool_width)

        # determine mappable sequences
        map_indexes = []

        for i in range(seqs_na.shape[0]):
            # mappable
            if seqs_na[i, :].mean(dtype='float64') < options.na_t:
                map_indexes.append(i)

            # unmappable
            else:
                # forget it
                pass

        # update data structures
        targets_real = targets_real[map_indexes]
        if options.fourier_dim is not None:
            targets_imag = targets_imag[map_indexes]

        seqs_1hot = seqs_1hot[map_indexes]
        seqs_segments = [seqs_segments[mi] for mi in map_indexes]
        seqs_na = seqs_na[map_indexes]

    ################################################################
    # write to train, valid, test HDF5
    ################################################################

    # choose test indexes
    if options.test_pct_or_chr.startswith('chr'):
        test_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.test_pct_or_chr
        ]
    else:
        test_pct = float(options.test_pct_or_chr)
        test_indexes = [
            twi for twi in range(len(seqs_segments))
            if random.random() < test_pct
        ]

    # choose valid indexes
    if options.valid_pct_or_chr.startswith('chr'):
        # valid_indexes = np.array([seq_seg[0] == options.valid_pct_or_chr for seq_seg in seqs_segments])
        valid_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.valid_pct_or_chr
        ]
    else:
        valid_pct = float(options.valid_pct_or_chr)
        valid_n = int(valid_pct * len(seqs_segments))
        nontest_indexes = set(range(len(seqs_segments))) - set(test_indexes)
        # random.sample requires a sequence; sort the set for determinism
        valid_indexes = random.sample(sorted(nontest_indexes), valid_n)

    # remainder is training
    train_indexes = list(
        set(range(len(seqs_segments))) - set(valid_indexes) -
        set(test_indexes))

    # training may require shuffling
    random.shuffle(train_indexes)
    random.shuffle(valid_indexes)
    random.shuffle(test_indexes)

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, 'w')

    # store pooling
    hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int')

    # store targets
    target_ids = np.array(list(target_wigs.keys()), dtype='S')
    hdf5_out.create_dataset('target_ids', data=target_ids)

    target_labels = np.array(target_labels, dtype='S')
    hdf5_out.create_dataset('target_labels', data=target_labels)

    target_strands = np.array(target_strands, dtype='S')
    hdf5_out.create_dataset('target_strands', data=target_strands)

    # HDF5 train
    hdf5_out.create_dataset('train_in',
                            data=seqs_1hot[train_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('train_out',
                            data=targets_real[train_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('train_out_imag',
                                data=targets_imag[train_indexes],
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('train_na',
                                data=seqs_na[train_indexes],
                                dtype='bool',
                                compression=options.compression)

    # HDF5 valid
    hdf5_out.create_dataset('valid_in',
                            data=seqs_1hot[valid_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('valid_out',
                            data=targets_real[valid_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('valid_out_imag',
                                data=targets_imag[valid_indexes],
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('valid_na',
                                data=seqs_na[valid_indexes],
                                dtype='bool',
                                compression=options.compression)

    # HDF5 test
    hdf5_out.create_dataset('test_in',
                            data=seqs_1hot[test_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('test_out',
                            data=targets_real[test_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('test_out_imag',
                                data=targets_imag[test_indexes],
                                dtype='float16',
                                compression=options.compression)
    if options.unmap_bed is not None:
        hdf5_out.create_dataset('test_na',
                                data=seqs_na[test_indexes],
                                dtype='bool',
                                compression=options.compression)

    hdf5_out.close()

    # output BED file
    if options.out_bed_file:
        out_bed_out = open(options.out_bed_file, 'w')
        for si in train_indexes:
            print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out)
        for si in valid_indexes:
            print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out)
        for si in test_indexes:
            print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out)
        out_bed_out.close()
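Note: the percentage-based split above can be exercised on its own. A small sketch with hypothetical segments, mirroring the index bookkeeping above (validation indexes are drawn only from the non-test remainder):

import random

random.seed(1)
# hypothetical (chrom, start, end) segments
seqs_segments = [('chr%d' % (i % 3 + 1), i * 1000, (i + 1) * 1000)
                 for i in range(20)]

test_pct, valid_pct = 0.05, 0.05
test_indexes = [si for si in range(len(seqs_segments))
                if random.random() < test_pct]
nontest_indexes = set(range(len(seqs_segments))) - set(test_indexes)
valid_indexes = random.sample(sorted(nontest_indexes),
                              int(valid_pct * len(seqs_segments)))
train_indexes = list(set(range(len(seqs_segments)))
                     - set(valid_indexes) - set(test_indexes))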
Example #6
0
def main():
    usage = "usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>"
    parser = OptionParser(usage)
    parser.add_option("-a", dest="align_net", help="Alignment .net file")
    parser.add_option(
        "-b",
        dest="blacklist_beds",
        help="Set blacklist nucleotides to a baseline value.",
    )
    parser.add_option(
        "--break",
        dest="break_t",
        default=None,
        type="int",
        help="Break in half contigs above length [Default: %default]",
    )
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option(
        "-d",
        dest="sample_pct",
        default=1.0,
        type="float",
        help="Down-sample the segments",
    )
    parser.add_option(
        "-f",
        dest="fill_min",
        default=100000,
        type="int",
        help="Alignment net fill size minimum [Default: %default]",
    )
    parser.add_option(
        "-g",
        dest="gap_files",
        help="Comma-separated list of assembly gaps BED files [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=131072,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "--local",
        dest="run_local",
        default=False,
        action="store_true",
        help="Run jobs locally as opposed to on SLURM [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="data_out",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number parallel processes [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="seqs_per_tfr",
        default=256,
        type="int",
        help="Sequences per TFRecord file [Default: %default]",
    )
    parser.add_option(
        "--seed",
        dest="seed",
        default=44,
        type="int",
        help="Random seed [Default: %default]",
    )
    parser.add_option(
        "--stride_train",
        dest="stride_train",
        default=1.0,
        type="float",
        help="Stride to advance train sequences [Default: %default]",
    )
    parser.add_option(
        "--stride_test",
        dest="stride_test",
        default=1.0,
        type="float",
        help="Stride to advance valid and test sequences [Default: %default]",
    )
    parser.add_option(
        "--soft",
        dest="soft_clip",
        default=False,
        action="store_true",
        help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="test_pct",
        default=0.1,
        type="float",
        help="Proportion of the data for testing [Default: %default]",
    )
    parser.add_option(
        "-u",
        dest="umap_beds",
        help="Comma-separated genome unmappable segments to set to NA",
    )
    parser.add_option(
        "--umap_t",
        dest="umap_t",
        default=0.5,
        type="float",
        help="Remove sequences with more than this unmappable bin % [Default: %default]",
    )
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help="Set unmappable regions to this percentile in the sequences' distribution of values",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        default=128,
        type="int",
        help="Sum pool width [Default: %default]",
    )
    parser.add_option(
        "-v",
        dest="valid_pct",
        default=0.1,
        type="float",
        help="Proportion of the data for validation [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            "Must provide FASTA files and a sample coverage labels/paths file for the two genomes."
        )
    else:
        fasta_files = args[0].split(",")
        targets_file = args[1]

    # there is still some source of stochasticity
    random.seed(options.seed)
    np.random.seed(options.seed)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print("stride_train %.f" % options.stride_train, end="")
        options.stride_train = options.stride_train * options.seq_length
        print(" converted to %f" % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        print("stride_test %.f" % options.stride_test, end="")
        options.stride_test = options.stride_test * options.seq_length
        print(" converted to %f" % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.gap_files is not None:
        options.gap_files = options.gap_files.split(",")
    else:
        # avoid indexing None below when gaps are removed per genome
        options.gap_files = [None] * len(fasta_files)

    if options.blacklist_beds is not None:
        options.blacklist_beds = options.blacklist_beds.split(",")

    # read targets
    targets_df = pd.read_table(targets_file, index_col=0)

    # verify genomes
    num_genomes = len(fasta_files)
    assert len(set(targets_df.genome)) == num_genomes

    ################################################################
    # define genomic contigs
    ################################################################
    genome_chr_contigs = []
    for gi in range(num_genomes):
        genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

        # remove gaps
        if options.gap_files[gi]:
            genome_chr_contigs[gi] = genome.split_contigs(
                genome_chr_contigs[gi], options.gap_files[gi]
            )

    # ditch the chromosomes
    contigs = []
    for gi in range(num_genomes):
        for chrom in genome_chr_contigs[gi]:
            contigs += [
                Contig(gi, chrom, ctg_start, ctg_end)
                for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]
            ]

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    for gi in range(num_genomes):
        contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
        ctg_bed_file = "%s/contigs%d.bed" % (options.out_dir, gi)
        write_seqs_bed(ctg_bed_file, contigs_i)

    ################################################################
    # divide between train/valid/test
    ################################################################

    # connect contigs across genomes by alignment
    contig_components = connect_contigs(
        contigs, options.align_net, options.fill_min, options.out_dir
    )

    # divide contig connected components between train/valid/test
    contig_sets = divide_contig_components(
        contig_components, options.test_pct, options.valid_pct
    )
    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################

    # stride sequences across contig
    train_mseqs = contig_sequences(
        train_contigs, options.seq_length, options.stride_train, label="train"
    )
    valid_mseqs = contig_sequences(
        valid_contigs, options.seq_length, options.stride_test, label="valid"
    )
    test_mseqs = contig_sequences(
        test_contigs, options.seq_length, options.stride_test, label="test"
    )

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(
            train_mseqs, int(options.sample_pct * len(train_mseqs))
        )
        valid_mseqs = random.sample(
            valid_mseqs, int(options.sample_pct * len(valid_mseqs))
        )
        test_mseqs = random.sample(
            test_mseqs, int(options.sample_pct * len(test_mseqs))
        )

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # separate sequences by genome
    ################################################################
    mseqs_genome = []
    for gi in range(num_genomes):
        mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
        mseqs_genome.append(mseqs_gi)

    ################################################################
    # mappability
    ################################################################

    options.umap_beds = options.umap_beds.split(",")
    unmap_npys = [None, None]

    for gi in range(num_genomes):
        if options.umap_beds[gi] is not None:
            # annotate unmappable positions
            mseqs_unmap = annotate_unmap(
                mseqs_genome[gi],
                options.umap_beds[gi],
                options.seq_length,
                options.pool_width,
            )

            # filter unmappable
            mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t
            mseqs_genome[gi] = [
                mseqs_genome[gi][si]
                for si in range(len(mseqs_genome[gi]))
                if mseqs_map_mask[si]
            ]
            mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

            # write to file
            unmap_npys[gi] = "%s/mseqs%d_unmap.npy" % (options.out_dir, gi)
            np.save(unmap_npys[gi], mseqs_unmap)

    seqs_bed_files = []
    for gi in range(num_genomes):
        # write sequences to BED
        seqs_bed_files.append("%s/sequences%d.bed" % (options.out_dir, gi))
        write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)

    ################################################################
    # read sequence coverage values
    ################################################################
    seqs_cov_dir = "%s/seqs_cov" % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []
    for gi in range(num_genomes):
        read_jobs += make_read_jobs(
            seqs_bed_files[gi], targets_df, gi, seqs_cov_dir, options
        )

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )

    ################################################################
    # write TF Records
    ################################################################

    tfr_dir = "%s/tfrecords" % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    # set genome target index starts
    sum_targets = 0
    genome_targets_start = []
    for gi in range(num_genomes):
        genome_targets_start.append(sum_targets)
        targets_df_gi = targets_df[targets_df.genome == gi]
        sum_targets += targets_df_gi.shape[0]

    write_jobs = []
    for gi in range(num_genomes):
        write_jobs += make_write_jobs(
            mseqs_genome[gi],
            fasta_files[gi],
            seqs_bed_files[gi],
            seqs_cov_dir,
            tfr_dir,
            gi,
            unmap_npys[gi],
            genome_targets_start[gi],
            sum_targets,
            options,
        )

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )
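Note: the fractional-stride convention used above (values <= 1.0 are treated as a proportion of seq_length, larger values as base pairs) can be checked in isolation:

import numpy as np

seq_length = 131072
for stride in [1.0, 0.5, 192]:
    # values <= 1 are proportions of seq_length; larger values are bp
    stride_bp = stride * seq_length if stride <= 1 else stride
    print('%s -> %d bp' % (stride, int(np.round(stride_bp))))
# 1.0 -> 131072 bp, 0.5 -> 65536 bp, 192 -> 192 bp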
Example #7
0
def main():
  usage = 'usage: %prog [options] <align_net> <fasta0_file,fasta1_file>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='genome_labels',
      default=None, help='Genome labels in output')
  parser.add_option('--break', dest='break_t',
      default=None, type='int',
      help='Break in half contigs above length [Default: %default]')
  parser.add_option('-c','--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-f', dest='folds',
      default=None, type='int',
      help='Generate cross fold split [Default: %default]')
  parser.add_option('-g', dest='gap_files',
      help='Comma-separated list of assembly gaps BED files [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--nf', dest='net_fill_min',
      default=100000, type='int',
      help='Alignment net fill size minimum [Default: %default]')
  parser.add_option('--no', dest='net_olap_min',
      default=1024, type='int',
      help='Alignment net and contig overlap minimum [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='align_out',
      help='Output directory [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap',
      default=1, type='int',
      help='Snap sequences to multiple of the given value [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.1, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_beds',
      help='Comma-separated genome unmappable segments to set to NA')
  parser.add_option('--umap_t', dest='umap_t',
      default=0.5, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.1, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide alignment and FASTA files.')
  else:
    align_net_file = args[0]
    fasta_files = args[1].split(',')

  # there is still some source of stochasticity
  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f'%options.stride_train, end='')
    options.stride_train = options.stride_train*options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    print('stride_test %.f'%options.stride_test, end='')
    options.stride_test = options.stride_test*options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0: 
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0: 
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  # count genomes
  num_genomes = len(fasta_files)

  # parse gap files
  if options.gap_files is not None:
    options.gap_files = options.gap_files.split(',')
    assert(len(options.gap_files) == num_genomes)
  else:
    # avoid indexing None below when gaps are removed per genome
    options.gap_files = [None] * num_genomes

  # parse unmappable files
  if options.umap_beds is not None:
    options.umap_beds = options.umap_beds.split(',')
    assert(len(options.umap_beds) == num_genomes)
  else:
    # avoid indexing None below when no unmappable BEDs are given
    options.umap_beds = [None] * num_genomes

  # label genomes
  if options.genome_labels is None:
    options.genome_labels = ['genome%d' % (gi+1) for gi in range(num_genomes)]
  else:
    options.genome_labels = options.genome_labels.split(',')
    assert(len(options.genome_labels) == num_genomes)

  # create output directories
  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)
  genome_out_dirs = []
  for gi in range(num_genomes):
    gout_dir = '%s/%s' % (options.out_dir, options.genome_labels[gi])
    if not os.path.isdir(gout_dir):
      os.mkdir(gout_dir)
    genome_out_dirs.append(gout_dir)

  ################################################################
  # define genomic contigs
  ################################################################
  genome_chr_contigs = []
  for gi in range(num_genomes):
    genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

    # remove gaps
    if options.gap_files[gi]:
      genome_chr_contigs[gi] = genome.split_contigs(genome_chr_contigs[gi],
                                                    options.gap_files[gi])

  # ditch the chromosomes
  contigs = []
  for gi in range(num_genomes):
    for chrom in genome_chr_contigs[gi]:
      contigs += [Contig(gi, chrom, ctg_start, ctg_end)
                  for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]]

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  for gi in range(num_genomes):
    contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
    ctg_bed_file = '%s/contigs.bed' % genome_out_dirs[gi]
    write_seqs_bed(ctg_bed_file, contigs_i)

  ################################################################
  # divide between train/valid/test
  ################################################################

  # connect contigs across genomes by alignment
  contig_components = connect_contigs(contigs, align_net_file, options.net_fill_min,
                                      options.net_olap_min, options.out_dir, genome_out_dirs)

  if options.folds is not None:
    # divide by fold
    fold_contigs = divide_components_folds(contig_components, options.folds)

  else:
    # divide by train/valid/test pct
    fold_contigs = divide_components_pct(contig_components, options.test_pct,
                                         options.valid_pct)

  # rejoin broken contigs within set
  for fi in range(len(fold_contigs)):
    fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

  # label folds
  if options.folds is not None:
    fold_labels = ['fold%d' % fi for fi in range(options.folds)]
    num_folds = options.folds
  else:
    fold_labels = ['train', 'valid', 'test']
    num_folds = 3

  if options.folds is None:
    # quantify leakage across sets
    quantify_leakage(align_net_file, fold_contigs[0], fold_contigs[1],
                     fold_contigs[2], options.out_dir)

  ################################################################
  # define model sequences
  ################################################################

  fold_mseqs = []
  for fi in range(num_folds):
    if fold_labels[fi] in ['valid','test']:
      stride_fold = options.stride_test
    else:
      stride_fold = options.stride_train

    # stride sequences across contig
    fold_mseqs_fi = contig_sequences(fold_contigs[fi], options.seq_length,
                                     stride_fold, options.snap, fold_labels[fi])
    fold_mseqs.append(fold_mseqs_fi)

    # shuffle
    random.shuffle(fold_mseqs[fi])

    # down-sample
    if options.sample_pct < 1.0:
      fold_mseqs[fi] = random.sample(fold_mseqs[fi], int(options.sample_pct*len(fold_mseqs[fi])))

  # merge into one list
  mseqs = [ms for fm in fold_mseqs for ms in fm]

  # separate by genome
  mseqs_genome = []
  for gi in range(num_genomes):
    mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
    mseqs_genome.append(mseqs_gi)

  ################################################################
  # filter for sufficient mappability
  ################################################################
  for gi in range(num_genomes):
    if options.umap_beds[gi] is not None:
      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs_genome[gi], options.umap_beds[gi], options.seq_length,
                                   options.pool_width, options.crop_bp)

      # filter unmappable
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs_genome[gi] = [mseqs_genome[gi][si] for si in range(len(mseqs_genome[gi])) if mseqs_map_mask[si]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

      # write to file
      unmap_npy_file = '%s/mseqs_unmap.npy' % genome_out_dirs[gi]
      np.save(unmap_npy_file, mseqs_unmap)

  seqs_bed_files = []
  for gi in range(num_genomes):
    # write sequences to BED
    seqs_bed_files.append('%s/sequences.bed' % genome_out_dirs[gi])
    write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)
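Note: contig_sequences is also external. A sketch of the striding it is assumed to perform, including the --snap alignment; the Contig attribute names follow the constructor calls above, and this is illustrative rather than the original:

def contig_sequences_sketch(contigs, seq_length, stride, snap=1, label=None):
    # Tile fixed-length sequences across each contig, snapping start
    # coordinates up to the next multiple of `snap` (main() already
    # checks that seq_length and the strides are multiples of snap).
    mseqs = []
    for ctg in contigs:
        seq_start = ((ctg.start + snap - 1) // snap) * snap
        while seq_start + seq_length <= ctg.end:
            mseqs.append((ctg.genome, ctg.chrom, seq_start,
                          seq_start + seq_length, label))
            seq_start += stride
    return mseqs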
Example #8
0
def main():
    usage = "usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-b",
        dest="limit_bed",
        help="Limit to segments that overlap regions in a BED file",
    )
    parser.add_option(
        "-c",
        dest="clip",
        default=None,
        type="float",
        help="Clip target values to have minimum [Default: %default]",
    )
    parser.add_option(
        "-d",
        dest="sample_pct",
        default=1.0,
        type="float",
        help="Down-sample the segments",
    )
    parser.add_option(
        "-f",
        dest="fourier_dim",
        default=None,
        type="int",
        help="Fourier transform dimension [Default: %default]",
    )
    parser.add_option("-g",
                      dest="gaps_file",
                      help="Genome assembly gaps BED [Default: %default]")
    parser.add_option(
        "-l",
        dest="seq_length",
        default=131072,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "--log2",
        dest="log10to2",
        default=False,
        action="store_true",
        help="Transform values from log10 to log2 [Default: %default]",
    )
    parser.add_option("-m",
                      dest="params_file",
                      help="Dimension reduction hyper-parameters file")
    parser.add_option(
        "--mult_cov",
        dest="cov_multiplier",
        default=1,
        type="float",
        help=
        "Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]",
    )
    parser.add_option(
        "-n",
        dest="na_t",
        default=0.25,
        type="float",
        help=
        "Remove sequences with an NA% greater than this threshold [Default: %default]",
    )
    parser.add_option(
        "--no_full",
        dest="no_full",
        default=False,
        action="store_true",
        help="Do not save full test sequence targets [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_bed_file",
        help="Output the train/valid/test sequences as a BED file",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=1,
        type="int",
        help="Number parallel processes to load data [Default: %default]",
    )
    parser.add_option(
        "-s",
        dest="stride",
        default=None,
        type="int",
        help="Stride to advance segments [Default: seq_length]",
    )
    parser.add_option("--scent",
                      dest="scent_file",
                      help="Dimension reduction model file")
    parser.add_option(
        "-t",
        dest="test_pct_or_chr",
        type="str",
        default="0.05",
        help="Proportion of the data for testing [Default: %default]",
    )
    parser.add_option("-u",
                      dest="unmap_bed",
                      help="Unmappable segments to set to NA")
    parser.add_option(
        "-w",
        dest="pool_width",
        type="int",
        default=128,
        help="Average pooling width [Default: %default]",
    )
    parser.add_option(
        "--w5",
        dest="w5",
        default=False,
        action="store_true",
        help="Coverage files are w5 rather than BigWig [Default: %default]",
    )
    parser.add_option(
        "-v",
        dest="valid_pct_or_chr",
        type="str",
        default="0.05",
        help="Proportion of the data for validation [Default: %default]",
    )
    parser.add_option("-z",
                      dest="compression",
                      help="h5py compression [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            "Must provide genome FASTA file, sample Wig/BigWig labels and paths, "
            "and model output file")
    else:
        fasta_file = args[0]
        sample_wigs_file = args[1]
        hdf5_file = args[2]

    random.seed(1)

    if options.stride is None:
        options.stride = options.seq_length

    ################################################################
    # assess bigwigs
    ################################################################
    # get wig files and labels
    target_wigs = OrderedDict()
    target_strands = []
    target_labels = []
    for line in open(sample_wigs_file, encoding="UTF-8"):
        a = line.rstrip().split("\t")
        target_wigs[a[0]] = a[1]
        if len(a) > 2:
            target_strands.append(a[2])
        else:
            target_strands.append("*")
        if len(a) > 3:
            target_labels.append(a[3])
        else:
            target_labels.append("")

    if (options.fourier_dim is not None and 2 * options.fourier_dim >=
            options.seq_length / options.pool_width):
        print(
            "Fourier transform to %d dims won't compress %d length sequences with %d pooling"
            % (options.fourier_dim, options.seq_length, options.pool_width),
            file=sys.stderr,
        )
        exit(1)

    ################################################################
    # prepare genomic segments
    ################################################################
    chrom_segments = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_segments = genome.split_contigs(chrom_segments,
                                              options.gaps_file)

    # ditch the chromosomes
    segments = []
    for chrom in chrom_segments:
        segments += [(chrom, seg_start, seg_end)
                     for seg_start, seg_end in chrom_segments[chrom]]

    # standardize order
    segments.sort()

    # filter for large enough
    segments = [
        cse for cse in segments if cse[2] - cse[1] >= options.seq_length
    ]

    # down-sample
    if options.sample_pct < 1.0:
        segments = random.sample(segments,
                                 int(options.sample_pct * len(segments)))

    # limit to a BED file
    if options.limit_bed is not None:
        segments = limit_segments(segments, options.limit_bed)

    ################################################################
    # one hot code sequences
    ################################################################
    seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                             options.seq_length,
                                             options.stride)
    print("%d sequences one hot coded" % seqs_1hot.shape[0])

    ################################################################
    # load model
    ################################################################
    if options.params_file:
        job = dna_io.read_job_params(options.params_file)
        job["num_targets"] = len(target_wigs)
        job["batch_size"] = 1024
        job["model"] = job.get("model", "autoencoder")

        if job["model"] == "autoencoder":
            model = autoencoder.AE(job)
            saver = tf.train.Saver()
        else:
            model = joblib.load(options.scent_file)

    ################################################################
    # bigwig read and process
    ################################################################
    print("Reading and pre-processing bigwigs for %d segments" % len(segments),
          flush=True)

    targets_real = []
    targets_imag = []

    include_indexes = []
    include_marker = 0

    targets_test = []
    test_indexes = []
    test_marker = 0

    update_i = 0
    ssi = 0

    # initialize multiprocessing pool
    pool = multiprocessing.Pool(options.processes)

    with tf.Session() as sess:
        # job is only defined when a params file was given
        if options.params_file and options.scent_file and job["model"] == "autoencoder":
            saver.restore(sess, options.scent_file)

        # batch segment processing
        bstart = 0
        while bstart < len(segments):
            if update_i % 1 == 0:
                print("Tiling from %s:%d-%d" % segments[bstart], flush=True)

            # determine batch end
            bend = batch_end(segments, bstart, 400000)

            # bigwig_read parameters
            bwr_params = [(
                wig_file,
                segments[bstart:bend],
                options.seq_length,
                options.pool_width,
                options.stride,
                options.log10to2,
                options.cov_multiplier,
            ) for wig_file in target_wigs.values()]

            # pull the target values in parallel
            if options.w5:
                wig_targets = pool.starmap(w5_batch, bwr_params)
            else:
                wig_targets = pool.starmap(bigwig_batch, bwr_params)

            # transpose to S x L x T (making a copy?)
            targets_wig = np.transpose(np.array(wig_targets), axes=(1, 2, 0))

            # clip
            if options.clip is not None:
                targets_wig = targets_wig.clip(options.clip)

            # sample indexes from this batch
            if options.test_pct_or_chr.startswith("chr"):
                test_bindexes = [
                    twi for twi in range(targets_wig.shape[0])
                    if seqs_segments[ssi + twi][0] == options.test_pct_or_chr
                ]
            else:
                test_pct = float(options.test_pct_or_chr)
                test_bindexes = [
                    twi for twi in range(targets_wig.shape[0])
                    if random.random() < test_pct
                ]

            # capture test indexes
            test_indexes += [test_marker + tbi for tbi in test_bindexes]

            # update test marker
            test_marker += targets_wig.shape[0]

            # save the full test targets
            if not options.no_full:
                targets_test.append(targets_wig[test_bindexes])

            # map to latent space
            if options.scent_file is None:
                targets_latent = targets_wig
            else:
                targets_latent = latent_transform(sess, model, job,
                                                  targets_wig)

            # compress across length
            if options.fourier_dim is None:
                targets_rfour = targets_latent
                targets_ifour = None
            else:
                targets_rfour, targets_ifour = fourier_transform(
                    targets_latent, options.fourier_dim)

            # save
            targets_real.append(targets_rfour)
            targets_imag.append(targets_ifour)

            # update seqs_segments index
            ssi += targets_wig.shape[0]

            # update batch
            bstart = bend
            update_i += 1

    pool.close()

    # stack arrays
    targets_real = np.vstack(targets_real)
    if options.fourier_dim is not None:
        targets_imag = np.vstack(targets_imag)
    if not options.no_full:
        targets_test = np.vstack(targets_test)

    print("%d target sequences" % targets_real.shape[0])

    ################################################################
    # correct for unmappable regions
    ################################################################
    if options.unmap_bed is not None:
        seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                              options.seq_length, options.pool_width)

        # determine mappable sequences and update test indexes
        map_indexes = []
        test_indexes_set = set(test_indexes)
        print("test_indexes", len(test_indexes))
        test_indexes_na = []
        new_i = 0

        for old_i in range(seqs_na.shape[0]):
            # mappable
            if seqs_na[old_i, :].mean(dtype="float64") < options.na_t:
                map_indexes.append(old_i)

                if old_i in test_indexes_set:
                    test_indexes_na.append(new_i)

                new_i += 1

            # unmappable
            else:
                # forget it
                pass

        # update data structures
        targets_real = targets_real[map_indexes]
        if options.fourier_dim is not None:
            targets_imag = targets_imag[map_indexes]

        seqs_1hot = seqs_1hot[map_indexes]
        seqs_segments = [seqs_segments[mi] for mi in map_indexes]
        seqs_na = seqs_na[map_indexes]

        test_indexes = test_indexes_na
        print("test_indexes", len(test_indexes))

    ################################################################
    # write to train, valid, test HDF5
    ################################################################

    if options.valid_pct_or_chr.startswith("chr"):
        # sample valid chromosome
        valid_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.valid_pct_or_chr
        ]

    else:
        # sample valid indexes (we already have test)
        valid_pct = float(options.valid_pct_or_chr)
        valid_n = int(valid_pct * targets_real.shape[0])
        nontest_indexes = set(range(targets_real.shape[0])) - set(test_indexes)
        # random.sample requires a sequence; sort the set for determinism
        valid_indexes = random.sample(sorted(nontest_indexes), valid_n)

    # remainder is training
    train_indexes = list(
        set(range(len(seqs_segments))) - set(valid_indexes) -
        set(test_indexes))

    # training may require shuffling; shuffle the lists in place
    # (random.shuffle(sorted(x)) would only shuffle a discarded copy)
    train_indexes = sorted(train_indexes)
    valid_indexes = sorted(valid_indexes)
    test_indexes = sorted(test_indexes)
    random.shuffle(train_indexes)
    random.shuffle(valid_indexes)
    random.shuffle(test_indexes)

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, "w")

    # store pooling
    hdf5_out.create_dataset("pool_width", data=options.pool_width, dtype="int")

    # store targets
    target_ids = np.array(list(target_wigs.keys()), dtype="S")
    hdf5_out.create_dataset("target_ids", data=target_ids)

    target_labels = np.array(target_labels, dtype="S")
    hdf5_out.create_dataset("target_labels", data=target_labels)

    target_strands = np.array(target_strands, dtype="S")
    hdf5_out.create_dataset("target_strands", data=target_strands)

    # HDF5 train
    hdf5_out.create_dataset(
        "train_in",
        data=seqs_1hot[train_indexes],
        dtype="bool",
        compression=options.compression,
    )
    hdf5_out.create_dataset(
        "train_out",
        data=targets_real[train_indexes],
        dtype="float16",
        compression=options.compression,
    )
    if options.fourier_dim is not None:
        hdf5_out.create_dataset(
            "train_out_imag",
            data=targets_imag[train_indexes],
            dtype="float16",
            compression=options.compression,
        )
    if options.unmap_bed is not None:
        hdf5_out.create_dataset(
            "train_na",
            data=seqs_na[train_indexes],
            dtype="bool",
            compression=options.compression,
        )

    # HDF5 valid
    hdf5_out.create_dataset(
        "valid_in",
        data=seqs_1hot[valid_indexes],
        dtype="bool",
        compression=options.compression,
    )
    hdf5_out.create_dataset(
        "valid_out",
        data=targets_real[valid_indexes],
        dtype="float16",
        compression=options.compression,
    )
    if options.fourier_dim is not None:
        hdf5_out.create_dataset(
            "valid_out_imag",
            data=targets_imag[valid_indexes],
            dtype="float16",
            compression=options.compression,
        )
    if options.unmap_bed is not None:
        hdf5_out.create_dataset(
            "valid_na",
            data=seqs_na[valid_indexes],
            dtype="bool",
            compression=options.compression,
        )

    # HDF5 test
    hdf5_out.create_dataset(
        "test_in",
        data=seqs_1hot[test_indexes],
        dtype="bool",
        compression=options.compression,
    )
    hdf5_out.create_dataset(
        "test_out",
        data=targets_real[test_indexes],
        dtype="float16",
        compression=options.compression,
    )
    if options.fourier_dim is not None:
        hdf5_out.create_dataset(
            "test_out_imag",
            data=targets_imag[test_indexes],
            dtype="float16",
            compression=options.compression,
        )
    if not options.no_full:
        hdf5_out.create_dataset(
            "test_out_full",
            data=targets_test,
            dtype="float16",
            compression=options.compression,
        )
    if options.unmap_bed is not None:
        hdf5_out.create_dataset(
            "test_na",
            data=seqs_na[test_indexes],
            dtype="bool",
            compression=options.compression,
        )

    hdf5_out.close()

    # output BED file
    if options.out_bed_file:
        out_bed_out = open(options.out_bed_file, "w")
        for si in train_indexes:
            print("%s\t%d\t%d\ttrain" % seqs_segments[si], file=out_bed_out)
        for si in valid_indexes:
            print("%s\t%d\t%d\tvalid" % seqs_segments[si], file=out_bed_out)
        for si in test_indexes:
            print("%s\t%d\t%d\ttest" % seqs_segments[si], file=out_bed_out)
        out_bed_out.close()
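
A minimal sketch (the output path is hypothetical; the dataset names match those written above) of inspecting the resulting HDF5 file:

import h5py

with h5py.File("data.h5", "r") as h5:
    print("pool_width:", h5["pool_width"][()])
    for key in ("train_in", "train_out", "valid_in", "valid_out", "test_in", "test_out"):
        print(key, h5[key].shape, h5[key].dtype)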
Example #9
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=8388608,
        type='int',
        help='Break in half contigs above length [Default: %default]')
    parser.add_option('--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option(
        '-d',
        dest='diagonal_offset',
        default=2,
        type='int',
        help='Positions on the diagonal to ignore [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option(
        '-k',
        dest='kernel_stddev',
        default=0,
        type='int',
        help='Gaussian kernel stddev to smooth values [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number parallel processes [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=128,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '--restart',
        dest='restart',
        default=False,
        action='store_true',
        help='Skip already read HDF5 coverage values. [Default: %default]')
    parser.add_option('--sample',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the excess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_midpoints',
        dest='umap_midpoints',
        help='Regions with midpoints to exclude in BED format. Used for 4C/HiC.'
    )
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.3,
        type='float',
        help=
        'Remove sequences with more than this unmappable bin % [Default: %default]'
    )
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Set unmappable regions to this percentile in the sequences\' distribution of values'
    )
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for validation [Default: %default]')
    parser.add_option(
        '--snap',
        dest='snap',
        default=None,
        type='int',
        help=
        'Snap strides to a multiple of this value in bp; if set, seq_length and the strides must be multiples of snap'
    )
    parser.add_option('--as_obsexp',
                      dest='as_obsexp',
                      action="store_true",
                      default=False,
                      help='save targets as obsexp profiles')
    parser.add_option('--global_obsexp',
                      dest='global_obsexp',
                      action="store_true",
                      default=False,
                      help='use pre-calculated by-chromosome obs/exp')
    parser.add_option('--no_log',
                      dest='no_log',
                      action="store_true",
                      default=False,
                      help='do not take log for obs/exp')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide FASTA file and targets file of sample labels and coverage paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print('stride_train %.2f' % options.stride_train, end='')
        options.stride_train = options.stride_train * options.seq_length
        print(' converted to %.0f bp' % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        print('stride_test %.2f' % options.stride_test, end='')
        options.stride_test = options.stride_test * options.seq_length
        print(' converted to %.0f bp' % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))
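    # worked example (hypothetical values): stride_train=0.5 with the default
    # seq_length=131072 becomes int(np.round(0.5 * 131072)) = 65536 bp, i.e.
    # consecutive training sequences overlap by half their length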

    if options.snap is not None:
        if np.mod(options.seq_length, options.snap) != 0:
            raise ValueError('seq_length must be a multiple of snap')
        if np.mod(options.stride_train, options.snap) != 0:
            raise ValueError('stride_train must be a multiple of snap')
        if np.mod(options.stride_test, options.snap) != 0:
            raise ValueError('stride_test must be a multiple of snap')
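        # e.g. with a hypothetical snap=2048 and the defaults above, all three
        # checks pass: 131072 and both converted strides are multiples of 2048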

    if os.path.isdir(options.out_dir) and not options.restart:
        print('Remove output directory %s or use --restart option.' %
              options.out_dir)
        exit(1)
    elif not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # dump options
    with open('%s/options.json' % options.out_dir, 'w') as options_json_out:
        json.dump(options.__dict__, options_json_out, sort_keys=True, indent=4)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [
        ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
    ]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        contig_sets = divide_contigs_chr(contigs, test_chrs, valid_chrs)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(train_contigs,
                                   options.seq_length,
                                   options.stride_train,
                                   options.snap,
                                   label='train')
    valid_mseqs = contig_sequences(valid_contigs,
                                   options.seq_length,
                                   options.stride_test,
                                   options.snap,
                                   label='valid')
    test_mseqs = contig_sequences(test_contigs,
                                  options.seq_length,
                                  options.stride_test,
                                  options.snap,
                                  label='test')

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(train_mseqs,
                                    int(options.sample_pct * len(train_mseqs)))
        valid_mseqs = random.sample(valid_mseqs,
                                    int(options.sample_pct * len(valid_mseqs)))
        test_mseqs = random.sample(test_mseqs,
                                   int(options.sample_pct * len(test_mseqs)))

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if (options.umap_bed is not None) or (options.umap_midpoints is not None):
        if shutil.which('bedtools') is None:
            print('Install Bedtools to annotate unmappable sites',
                  file=sys.stderr)
            exit(1)

    if options.umap_bed is not None:
        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                          options.umap_t)
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]
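        # toy illustration (hypothetical numbers): with umap_t=0.3, a sequence
        # whose bins are 25% unmappable passes the mask, while one at 35% is
        # dropped along with its row in mseqs_unmap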

        # write to file
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    if options.umap_midpoints is not None:
        # annotate unmappable midpoints for 4C/HiC
        mseqs_unmap = annotate_unmap(mseqs, options.umap_midpoints,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        # midpoint bin index, i.e. (seq_length / pool_width) // 2
        seqmid = mseqs_unmap.shape[1] // 2
        mseqs_map_mask = (np.sum(mseqs_unmap[:, seqmid - 1:seqmid + 1],
                                 axis=1) == 0)

        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = '%s/mseqs_unmap_midpoints.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    print('writing sequences to BED')
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        # scale_ti = 1
        # if 'scale' in targets_df.columns:
        #   scale_ti = targets_df['scale'].iloc[ti]

        if options.restart and os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = 'akita_data_read.py'
            cmd += ' --crop %d' % options.crop_bp
            cmd += ' -k %d' % options.kernel_stddev
            cmd += ' -w %d' % options.pool_width
            if clip_ti is not None:
                cmd += ' --clip %f' % clip_ti
            if options.soft_clip:
                cmd += ' --soft'
            # cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            if options.as_obsexp:
                cmd += ' --as_obsexp'
                if options.global_obsexp:
                    cmd += ' --global_obsexp'
                if options.no_log:
                    cmd += ' --no_log'
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)
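    # an assembled read command might look like (hypothetical paths):
    # akita_data_read.py --crop 0 -k 0 -w 128 --clip 2.000000 cov_0.h5 \
    #     data_out/sequences.bed data_out/seqs_cov/0.h5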

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ['train', 'valid', 'test']:
        tvt_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
        ]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start < tvt_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

            cmd = 'basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end

            # do not use
            # if options.umap_bed is not None:
            #   cmd += ' -u %s' % unmap_npy
            # if options.umap_set is not None:
            #   cmd += ' --umap_set %f' % options.umap_set

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (tvt_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # stats
    ################################################################
    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['train_seqs'] = len(train_mseqs)
    stats_dict['valid_seqs'] = len(valid_mseqs)
    stats_dict['test_seqs'] = len(test_mseqs)
    stats_dict['seq_length'] = options.seq_length
    stats_dict['pool_width'] = options.pool_width
    stats_dict['crop_bp'] = options.crop_bp
    stats_dict['diagonal_offset'] = options.diagonal_offset

    target1_length = options.seq_length - 2 * options.crop_bp
    target1_length = target1_length // options.pool_width
    target1_length = target1_length - options.diagonal_offset
    target_length = target1_length * (target1_length + 1) // 2
    stats_dict['target_length'] = target_length
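    # worked example with the defaults above: (131072 - 2*0) // 128 = 1024 bins,
    # 1024 - 2 = 1022 after the diagonal offset, and the flattened upper
    # triangle holds 1022 * 1023 // 2 = 522753 values per target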

    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
Example #10
def main():
    usage = "usage: %prog [options] <fasta_file> <targets_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-b",
        dest="blacklist_bed",
        help="Set blacklist nucleotides to a baseline value.",
    )
    parser.add_option(
        "--break",
        dest="break_t",
        default=786432,
        type="int",
        help="Break in half contigs above length [Default: %default]",
    )
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option(
        "-d",
        dest="sample_pct",
        default=1.0,
        type="float",
        help="Down-sample the segments",
    )
    parser.add_option(
        "-g", dest="gaps_file", help="Genome assembly gaps BED [Default: %default]"
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=131072,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "--limit",
        dest="limit_bed",
        help="Limit to segments that overlap regions in a BED file",
    )
    parser.add_option(
        "--local",
        dest="run_local",
        default=False,
        action="store_true",
        help="Run jobs locally as opposed to on SLURM [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="data_out",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number parallel processes [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="seqs_per_tfr",
        default=256,
        type="int",
        help="Sequences per TFRecord file [Default: %default]",
    )
    parser.add_option(
        "--seed",
        dest="seed",
        default=44,
        type="int",
        help="Random seed [Default: %default]",
    )
    parser.add_option(
        "--stride_train",
        dest="stride_train",
        default=1.0,
        type="float",
        help="Stride to advance train sequences [Default: seq_length]",
    )
    parser.add_option(
        "--stride_test",
        dest="stride_test",
        default=1.0,
        type="float",
        help="Stride to advance valid and test sequences [Default: seq_length]",
    )
    parser.add_option(
        "--soft",
        dest="soft_clip",
        default=False,
        action="store_true",
        help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="test_pct_or_chr",
        default=0.05,
        type="str",
        help="Proportion of the data for testing [Default: %default]",
    )
    parser.add_option("-u", dest="umap_bed", help="Unmappable regions in BED format")
    parser.add_option(
        "--umap_t",
        dest="umap_t",
        default=0.3,
        type="float",
        help="Remove sequences with more than this unmappable bin % [Default: %default]",
    )
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help="Set unmappable regions to this percentile in the sequences' distribution of values",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        default=128,
        type="int",
        help="Sum pool width [Default: %default]",
    )
    parser.add_option(
        "-v",
        dest="valid_pct_or_chr",
        default=0.05,
        type="str",
        help="Proportion of the data for validation [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide FASTA and sample coverage labels and paths.")
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.stride_train <= 0 or options.stride_train > 1:
        parser.error("Train stride %f must be in (0, 1]" % options.stride_train)

    if options.stride_test <= 0 or options.stride_test > 1:
        parser.error("Test stride %f must be in (0, 1]" % options.stride_test)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = "%s/contigs.bed" % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert 0 <= valid_pct <= 1
        assert 0 <= test_pct <= 1

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chr = options.valid_pct_or_chr
        test_chr = options.test_pct_or_chr
        contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(
        train_contigs, options.seq_length, options.stride_train, label="train"
    )
    valid_mseqs = contig_sequences(
        valid_contigs, options.seq_length, options.stride_test, label="valid"
    )
    test_mseqs = contig_sequences(
        test_contigs, options.seq_length, options.stride_test, label="test"
    )

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(
            train_mseqs, int(options.sample_pct * len(train_mseqs))
        )
        valid_mseqs = random.sample(
            valid_mseqs, int(options.sample_pct * len(valid_mseqs))
        )
        test_mseqs = random.sample(
            test_mseqs, int(options.sample_pct * len(test_mseqs))
        )

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if options.umap_bed is not None:
        if shutil.which("bedtools") is None:
            print("Install Bedtools to annotate unmappable sites", file=sys.stderr)
            exit(1)

        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(
            mseqs, options.umap_bed, options.seq_length, options.pool_width
        )

        # filter unmappable
        mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = "%s/mseqs_unmap.npy" % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    seqs_bed_file = "%s/sequences.bed" % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep="\t")

    seqs_cov_dir = "%s/seqs_cov" % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df["file"].iloc[ti]
        seqs_cov_stem = "%s/%d" % (seqs_cov_dir, ti)
        seqs_cov_file = "%s.h5" % seqs_cov_stem

        clip_ti = None
        if "clip" in targets_df.columns:
            clip_ti = targets_df["clip"].iloc[ti]

        scale_ti = 1
        if "scale" in targets_df.columns:
            scale_ti = targets_df["scale"].iloc[ti]

        if os.path.isfile(seqs_cov_file):
            print("Skipping existing %s" % seqs_cov_file, file=sys.stderr)
        else:
            cmd = "basenji_data_read.py"
            cmd += " -w %d" % options.pool_width
            cmd += " -u %s" % targets_df["sum_stat"].iloc[ti]
            if clip_ti is not None:
                cmd += " -c %f" % clip_ti
            if options.soft_clip:
                cmd += " --soft"
            cmd += " -s %f" % scale_ti
            if options.blacklist_bed:
                cmd += " -b %s" % options.blacklist_bed
            cmd += " %s" % genome_cov_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_file

            if options.run_local:
                cmd += " &> %s.err" % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="read_t%d" % ti,
                    out_file="%s.out" % seqs_cov_stem,
                    err_file="%s.err" % seqs_cov_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, "%s/targets.txt" % options.out_dir)

    # initialize TF Records dir
    tfr_dir = "%s/tfrecords" % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ["train", "valid", "test"]:
        tvt_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == tvt_set]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start < tvt_set_end:
            tfr_stem = "%s/%s-%d" % (tfr_dir, tvt_set, tfr_i)

            cmd = "basenji_data_write.py"
            cmd += " -s %d" % tfr_start
            cmd += " -e %d" % tfr_end
            if options.umap_bed is not None:
                cmd += " -u %s" % unmap_npy
            if options.umap_set is not None:
                cmd += " --umap_set %f" % options.umap_set

            cmd += " %s" % fasta_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_dir
            cmd += " %s.tfr" % tfr_stem

            if options.run_local:
                cmd += " &> %s.err" % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="write_%s-%d" % (tvt_set, tfr_i),
                    out_file="%s.out" % tfr_stem,
                    err_file="%s.err" % tfr_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )
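
A minimal sketch (hypothetical paths; assuming TensorFlow is available and the shards were written without compression, otherwise pass compression_type to TFRecordDataset) of counting the records in the shards written above:

import glob
import tensorflow as tf

for tfr_file in sorted(glob.glob("data_out/tfrecords/train-*.tfr")):
    n_records = sum(1 for _ in tf.data.TFRecordDataset(tfr_file))
    print(tfr_file, n_records)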