Example #1
                    '''Take start and stop coordinates as specified: full BED coordinates'''
                    # correct bed coordinates
                    patch_start = start
                    patch_new_end = end

                # CONTINUE COMBINED ============================================
                # check start and end of range
                # set start_diff if the sequence to query runs over the chromosome end --> ready to pad
                start_diff = 0
                seq_start = patch_start
                if patch_start < 0:
                    start_diff = abs(patch_start)
                    seq_start = 0  # cover the border cases for sequence retrieval

                # extract reference sequence -------------------------------------------
                with pysam.Fastafile(FLAGS.genome) as fa:
                    seq = fa.fetch(reference=chrom,
                                   start=seq_start,
                                   end=patch_new_end)

                # pad if specified and the window runs past the chromosome start
                if start_diff > 0:
                    if FLAGS.padd_ends in ['left', 'both']:
                        # pad with N's
                        print('padding with N\'s leftwards')
                        seq = 'N' * start_diff + seq
                    else:
                        print(
                            '%s:%s-%s is smaller than bp_context and no padding specified ... skipping'
                            % (chrom, start, end))
                        continue
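The fragment above is cut out of a larger loop, so it cannot run on its own. A minimal, self-contained sketch of the same idea -- fetch a fixed-width window and left-pad with N's when it runs past the chromosome start -- is shown below; the file name and coordinates are placeholders, not part of the original code.

import pysam

def fetch_padded(fasta_path, chrom, start, end):
    """Fetch [start, end) from a FASTA file, left-padding with N's for negative starts."""
    with pysam.Fastafile(fasta_path) as fa:
        seq_start = max(start, 0)                    # clamp negative starts to 0
        seq = fa.fetch(reference=chrom, start=seq_start, end=end)
    if start < 0:                                    # window ran off the chromosome start
        seq = 'N' * (-start) + seq                   # left-pad so the window keeps its width
    return seq

# seq = fetch_padded('genome.fa', 'chr1', -50, 950)  # 1000 bp window, padded to full width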
Example #2
    def __init__(self, seqFn):
        import pysam

        self.genome = pysam.Fastafile(seqFn)
        print "seqClass: input 0-based coordinate -- [start, end)"
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-c',
        dest='center_pct',
        default=0.25,
        type='float',
        help='Require clustered SNPs lie in center region [Default: %default]')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/assembly/hg19.fa' % os.environ['HG19'],
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option('--h5',
                      dest='out_h5',
                      default=False,
                      action='store_true',
                      help='Output stats to sad.h5 [Default: %default]')
    parser.add_option('--local',
                      dest='local',
                      default=1024,
                      type='int',
                      help='Local SAD score [Default: %default]')
    parser.add_option('-n',
                      dest='norm_file',
                      default=None,
                      help='Normalize SAD scores')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sad',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option('--pseudo',
                      dest='log_pseudo',
                      default=1,
                      type='float',
                      help='Log2 pseudocount [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '--stats',
        dest='sad_stats',
        default='SAD',
        help='Comma-separated list of stats to save. [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        default=None,
        type='str',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '-u',
        dest='penultimate',
        default=False,
        action='store_true',
        help='Compute SED in the penultimate layer [Default: %default]')
    parser.add_option('-z',
                      dest='out_zarr',
                      default=False,
                      action='store_true',
                      help='Output stats to sad.zarr [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        vcf_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, 'rb')
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

    else:
        parser.error(
            'Must provide parameters and model files and QTL VCF file')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.track_indexes is None:
        options.track_indexes = []
    else:
        options.track_indexes = [
            int(ti) for ti in options.track_indexes.split(',')
        ]
        if not os.path.isdir('%s/tracks' % options.out_dir):
            os.mkdir('%s/tracks' % options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(',')]
    options.sad_stats = options.sad_stats.split(',')

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file,
                                 require=['seq_length', 'num_targets'])

    if options.targets_file is None:
        target_ids = ['t%d' % ti for ti in range(job['num_targets'])]
        target_labels = [''] * len(target_ids)
        target_subset = None

    else:
        targets_df = pd.read_table(options.targets_file)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job['num_targets']:
            target_subset = None

    #################################################################
    # load SNPs

    # read sorted SNPs from VCF
    snps = bvcf.vcf_snps(vcf_file,
                         require_sorted=True,
                         validate_ref_fasta=options.genome_fasta,
                         flip_ref=True)

    # filter for worker SNPs
    if options.processes is not None:
        worker_bounds = np.linspace(0,
                                    len(snps),
                                    options.processes + 1,
                                    dtype='int')
        snps = snps[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_snps = len(snps)

    # cluster SNPs by position
    snp_clusters = cluster_snps(snps, job['seq_length'], options.center_pct)

    # delimit sequence boundaries
    [sc.delimit(job['seq_length']) for sc in snp_clusters]

    # open genome FASTA
    genome_open = pysam.Fastafile(options.genome_fasta)

    # make SNP sequence generator
    def snp_gen():
        for sc in snp_clusters:
            snp_1hot_list = sc.get_1hots(genome_open)
            for snp_1hot in snp_1hot_list:
                yield {'sequence': snp_1hot}

    snp_types = {'sequence': tf.float32}
    snp_shapes = {
        'sequence':
        tf.TensorShape([tf.Dimension(job['seq_length']),
                        tf.Dimension(4)])
    }

    dataset = tf.data.Dataset().from_generator(snp_gen,
                                               output_types=snp_types,
                                               output_shapes=snp_shapes)
    dataset = dataset.batch(job['batch_size'])
    dataset = dataset.prefetch(2 * job['batch_size'])
    # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0'))

    iterator = dataset.make_one_shot_iterator()
    data_ops = iterator.get_next()

    #################################################################
    # setup model

    # build model
    t0 = time.time()
    model = seqnn.SeqNN()
    model.build_sad(job,
                    data_ops,
                    ensemble_rc=options.rc,
                    ensemble_shifts=options.shifts,
                    embed_penultimate=options.penultimate,
                    target_subset=target_subset)
    print('Model building time %f' % (time.time() - t0), flush=True)

    if options.penultimate:
        # labels become inappropriate
        target_ids = [''] * model.hp.cnn_filters[-1]
        target_labels = target_ids

    # read target normalization factors
    target_norms = np.ones(len(target_labels))
    if options.norm_file is not None:
        ti = 0
        for line in open(options.norm_file):
            target_norms[ti] = float(line.strip())
            ti += 1

    num_targets = len(target_ids)

    #################################################################
    # setup output

    sad_out = initialize_output_h5(options.out_dir, options.sad_stats, snps,
                                   target_ids, target_labels)

    snp_threads = []

    snp_queue = Queue()
    for i in range(1):
        sw = SNPWorker(snp_queue, sad_out)
        sw.start()
        snp_threads.append(sw)

    #################################################################
    # predict SNP scores, write output

    # initialize saver
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # coordinator
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord)

        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        # SNP index
        si = 0

        for snp_cluster in snp_clusters:
            ref_preds = preds_stream[pi]
            pi += 1

            for snp in snp_cluster.snps:
                # print(snp, flush=True)

                alt_preds = preds_stream[pi]
                pi += 1

                # queue SNP
                snp_queue.put((ref_preds, alt_preds, si))

                # update SNP index
                si += 1

    # finish queue
    print('Waiting for threads to finish.', flush=True)
    snp_queue.join()

    # close genome
    genome_open.close()

    ###################################################
    # compute SAD distributions across variants

    # define percentiles
    d_fine = 0.001
    d_coarse = 0.01
    percentiles_neg = np.arange(d_fine, 0.1, d_fine)
    percentiles_base = np.arange(0.1, 0.9, d_coarse)
    percentiles_pos = np.arange(0.9, 1, d_fine)

    percentiles = np.concatenate(
        [percentiles_neg, percentiles_base, percentiles_pos])
    sad_out.create_dataset('percentiles', data=percentiles)
    pct_len = len(percentiles)

    for sad_stat in options.sad_stats:
        sad_stat_pct = '%s_pct' % sad_stat

        # compute
        sad_pct = np.percentile(sad_out[sad_stat], 100 * percentiles, axis=0).T
        sad_pct = sad_pct.astype('float16')

        # save
        sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype='float16')

    sad_out.close()
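The one-hot encoding of each SNP sequence happens inside sc.get_1hots(), which is not shown here. For orientation, an encoder producing the (seq_length, 4) float arrays expected by the dataset above typically looks like the sketch below; the helper name and base ordering are assumptions, not taken from the original code.

import numpy as np

def dna_1hot(seq):
    """Encode a DNA string as a (len(seq), 4) one-hot array; ambiguous bases stay all-zero."""
    lookup = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    onehot = np.zeros((len(seq), 4), dtype='float32')
    for i, base in enumerate(seq.upper()):
        j = lookup.get(base)
        if j is not None:
            onehot[i, j] = 1.0
    return onehot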
Example #4
def filter_candidates(
    (candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp,
     good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af,
     del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)):
    thread_logger = logging.getLogger("{} ({})".format(
        filter_candidates.__name__,
        multiprocessing.current_process().name))
    try:
        thread_logger.info(
            "---------------------Filter Candidates---------------------")

        records = {}
        with open(candidates_vcf) as v_f:
            for line in v_f:
                if line[0] == "#":
                    continue
                if len(line.strip().split()) != 10:
                    raise RuntimeError(
                        "Bad VCF line (<10 fields): {}".format(line))
                chrom, pos, _, ref, alt, _, _, info_, _, info = line.strip(
                ).split()
                pos = int(pos)
                loc = "{}.{}".format(chrom, pos)
                dp, ro, ao = map(int, info.split(":")[1:4])
                info_dict = dict(
                    map(lambda x: x.split("="), filter(None,
                                                       info_.split(";"))))
                mq_ = safe_read_info_dict(info_dict, "MQ", int, -100)
                bq_ = safe_read_info_dict(info_dict, "BQ", int, -100)
                nm_ = safe_read_info_dict(info_dict, "NM", int, -100)
                as_ = safe_read_info_dict(info_dict, "AS", int, -100)
                xs_ = safe_read_info_dict(info_dict, "XS", int, -100)
                pr_ = safe_read_info_dict(info_dict, "PR", int, -100)
                cl_ = safe_read_info_dict(info_dict, "CL", int, -100)
                st_ = safe_read_info_dict(info_dict, "ST", str, "-100,-100")
                ls_ = safe_read_info_dict(info_dict, "LS", int, -100)
                rs_ = safe_read_info_dict(info_dict, "RS", int, -100)

                if ao < min(ro, min_ao):
                    continue

                if loc not in records:
                    records[loc] = []
                if ref == "N" or "\t".join(line.split()[0:5]) \
                        not in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]):
                    records[loc].append([
                        chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_,
                        rs_, nm_, as_, xs_, pr_, cl_, line
                    ])
                elif "\t".join(line.split()[0:5]) \
                        in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]):
                    for i, x in enumerate(records[loc]):
                        if "\t".join(line.split()[0:5]) == "\t".join(x[-1].split()[0:5]) \
                                and ao / float(ro + 0.0001) > x[6] / float(x[5] + 0.0001):
                            records[loc][i] = [
                                chrom, pos, ref, alt, dp, ro, ao, mq_, bq_,
                                st_, ls_, rs_, nm_, as_, xs_, pr_, cl_, line
                            ]
                            break
        fasta_file = pysam.Fastafile(reference)
        good_records = []
        dels = []
        for loc, rs in sorted(records.iteritems(), key=lambda x: x[1][0:2]) + \
                [["", [["", 0, "", "", 0, 0, 0, ""]]]]:
            ins = filter(lambda x: x[2] == "N", rs)
            if len(ins) > 1:
                # emit ins
                afs = map(lambda x: x[6] / float(x[5] + x[6]), ins)
                max_af = max(afs)
                ins = filter(
                    lambda x: x[6] / float(x[5] + x[6]) >= (max_af * merge_r),
                    ins)
                chrom, pos, ref = ins[0][0:3]
                dp = max(map(lambda x: x[4], ins))
                ro = max(map(lambda x: x[5], ins))
                ao = max(map(lambda x: x[6], ins))
                mq_ = max(map(lambda x: x[7], ins))
                bq_ = max(map(lambda x: x[8], ins))
                st_ = "{},{}".format(
                    max(map(lambda x: int(x[9].split(",")[0]), ins)),
                    max(map(lambda x: int(x[9].split(",")[1]), ins)))
                ls_ = max(map(lambda x: x[10], ins))
                rs_ = max(map(lambda x: x[11], ins))
                nm_ = max(map(lambda x: x[12], ins))
                as_ = max(map(lambda x: x[13], ins))
                xs_ = max(map(lambda x: x[14], ins))
                pr_ = max(map(lambda x: x[15], ins))
                cl_ = max(map(lambda x: x[16], ins))
                alt = "".join(map(lambda x: x[3], ins))
                if (max_af >= ins_merge_min_af) or (ao >= good_ao):
                    ins = [[
                        chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_,
                        rs_, nm_, as_, xs_, pr_, cl_
                    ]]
                else:
                    ins = []
            elif len(ins) == 1:
                # emit 1-base ins
                dp, ro, ao = ins[0][4:7]
                if (ao / float(ro + ao) <
                    (ins_min_af) and ao < good_ao) or dp <= 5:
                    ins = []
                else:
                    ins = [ins[0][:-1]]
            good_records.extend(ins)
            if dels and (ins
                         or filter(lambda x: x[3] != "N" and x[2] != "N", rs)):
                # emit del
                if len(dels) == 1:
                    ro = dels[0][5]
                    ao = dels[0][6]
                    chrom, pos, ref = dels[0][0:3]
                    if ao / float(ro + ao) >= ((del_min_af)) or ao >= good_ao:
                        good_records.extend(dels)

                else:
                    afs = map(lambda x: x[6] / float(x[5] + x[6]), dels)
                    max_af = max(afs)
                    merge_r_thr = merge_r * max_af
                    dels = filter(
                        lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr,
                        dels)
                    chrom, pos = dels[0][0:2]
                    dp = max(map(lambda x: x[4], dels))
                    ro = max(map(lambda x: x[5], dels))
                    ao = max(map(lambda x: x[6], dels))
                    mq_ = max(map(lambda x: x[7], dels))
                    bq_ = max(map(lambda x: x[8], dels))
                    st_ = "{},{}".format(
                        max(map(lambda x: int(x[9].split(",")[0]), dels)),
                        max(map(lambda x: int(x[9].split(",")[1]), dels)))
                    ls_ = max(map(lambda x: x[10], dels))
                    rs_ = max(map(lambda x: x[11], dels))
                    nm_ = max(map(lambda x: x[12], dels))
                    as_ = max(map(lambda x: x[13], dels))
                    xs_ = max(map(lambda x: x[14], dels))
                    pr_ = max(map(lambda x: x[15], dels))
                    cl_ = max(map(lambda x: x[16], dels))
                    ref = "".join(map(lambda x: x[2], dels))
                    alt = "N"
                    good_records.append([
                        chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_,
                        rs_, nm_, as_, xs_, pr_, cl_
                    ])
                dels = []
            if not loc:
                continue

            for record in rs:
                dp = record[4]
                if dp <= min_dp:
                    continue
                ro, ao = record[5:7]
                if record[2] != "N" and record[3] != "N" and record[
                        2] != record[3]:
                    bq = record[8]
                    if (ao / float(ro + ao) >=
                        (snp_min_af) or ao >= snp_min_ao) and bq >= snp_min_bq:
                        # emit SNP
                        good_records.append(record[:-1])
                elif record[2] != "N" and record[3] == "N":
                    if ao / float(ro + ao) >= (
                            del_merge_min_af) or ao >= good_ao:
                        chrom, pos = record[0:2]
                        if dels and pos - dels[-1][1] != 1:
                            # emit del
                            if len(dels) == 1:
                                ro = dels[0][5]
                                ao = dels[0][6]
                                chrom, pos, ref = dels[0][0:3]
                                pos = int(pos)
                                if ao / float(ro + ao) >= ((del_min_af)):
                                    good_records.extend(dels)
                            else:
                                afs = map(lambda x: x[6] / float(x[5] + x[6]),
                                          dels)
                                max_af = max(afs)
                                merge_r_thr = merge_r * max_af
                                dels = filter(
                                    lambda x: x[6] / float(x[5] + x[6]) >=
                                    merge_r_thr, dels)
                                chrom, pos = dels[0][0:2]
                                dp = max(map(lambda x: x[4], dels))
                                ro = max(map(lambda x: x[5], dels))
                                ao = max(map(lambda x: x[6], dels))
                                mq_ = max(map(lambda x: x[7], dels))
                                bq_ = max(map(lambda x: x[8], dels))
                                st_ = "{},{}".format(
                                    max(
                                        map(lambda x: int(x[9].split(",")[0]),
                                            dels)),
                                    max(
                                        map(lambda x: int(x[9].split(",")[1]),
                                            dels)))
                                ls_ = max(map(lambda x: x[10], dels))
                                rs_ = max(map(lambda x: x[11], dels))
                                nm_ = max(map(lambda x: x[12], dels))
                                as_ = max(map(lambda x: x[13], dels))
                                xs_ = max(map(lambda x: x[14], dels))
                                pr_ = max(map(lambda x: x[15], dels))
                                cl_ = max(map(lambda x: x[16], dels))
                                ref = "".join(map(lambda x: x[2], dels))
                                alt = "N"
                                good_records.append([
                                    chrom, pos, ref, alt, dp, ro, ao, mq_, bq_,
                                    st_, ls_, rs_, nm_, as_, xs_, pr_, cl_
                                ])
                            dels = []
                        # accumulate dels
                        dels.append(record[:-1])

        final_records = []
        dels = []
        for i, record in enumerate(good_records):
            chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ = record
            ref = ref.upper()
            alt = alt.upper()
            info_str = ""
            if st_ != "-100,-100":
                info_str += ";ST={}".format(st_)
            if ls_ != -100:
                info_str += ";LS={}".format(ls_)
            if rs_ != -100:
                info_str += ";RS={}".format(rs_)
            if nm_ != -100:
                info_str += ";NM={}".format(nm_)
            if as_ != -100:
                info_str += ";AS={}".format(as_)
            if xs_ != -100:
                info_str += ";XS={}".format(xs_)
            if pr_ != -100:
                info_str += ";PR={}".format(pr_)
            if cl_ != -100:
                info_str += ";CL={}".format(cl_)
            if mq_ != -100:
                info_str += ";MQ={}".format(mq_)
            if bq_ != -100:
                info_str += ";BQ={}".format(bq_)

            af = np.round(ao / float(ao + ro), 4)
            info_str += ";AF={}".format(af)
            if ref != "N" and alt != "N":
                line = "\t".join([
                    chrom,
                    str(pos), ".", ref, alt, "100", ".",
                    "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                    "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
                ])
                final_records.append([chrom, pos, ref, alt, line])
            elif alt == "N":
                ref = fasta_file.fetch(chrom, pos - 2,
                                       pos + len(ref) - 1).upper()
                alt = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
                line = "\t".join([
                    chrom,
                    str(pos - 1), ".", ref, alt, "100", ".",
                    "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                    "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
                ])
                final_records.append([chrom, pos - 1, ref, alt, line])
            elif ref == "N":
                ref = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
                alt = ref + alt
                line = "\t".join([
                    chrom,
                    str(pos - 1), ".", ref, alt, "100", ".",
                    "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                    "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
                ])
                final_records.append([chrom, pos - 1, ref, alt, line])
        final_records = sorted(final_records, key=lambda x: x[0:2])
        if dbsnp:
            filtered_bed = pybedtools.BedTool(
                map(
                    lambda x: pybedtools.Interval(x[1][0], int(x[1][1]),
                                                  int(x[1][1]) + 1, x[1][2], x[
                                                      1][3], str(x[0])),
                    enumerate(final_records))).sort()
            dbsnp = pybedtools.BedTool(dbsnp).each(
                lambda x: pybedtools.Interval(x[0], int(x[1]),
                                              int(x[1]) + 1, x[3], x[4])).sort(
                                              )
            non_in_dbsnp_1 = filtered_bed.window(dbsnp, w=0, v=True)
            non_in_dbsnp_2 = filtered_bed.window(
                dbsnp, w=0).filter(lambda x: x[1] != x[7] or x[3] != x[9] or x[
                    4] != x[10]).sort()
            non_in_dbsnp_ids = []
            for x in non_in_dbsnp_1:
                non_in_dbsnp_ids.append(int(x[5]))
            for x in non_in_dbsnp_2:
                non_in_dbsnp_ids.append(int(x[5]))
            final_records = map(
                lambda x: x[1],
                filter(lambda x: x[0] in non_in_dbsnp_ids,
                       enumerate(final_records)))
        with open(filtered_candidates_vcf, "w") as o_f:
            o_f.write("##fileformat=VCFv4.2\n")
            o_f.write(
                "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
            )
            for record in final_records:
                o_f.write(record[-1] + "\n")
        return filtered_candidates_vcf

    except Exception as ex:
        thread_logger.error(traceback.format_exc())
        thread_logger.error(ex)
        return None
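Note that this example is Python 2 code: the tuple-unpacking parameter list in the def line is a syntax error in Python 3, and iteritems() plus the list-returning map()/filter() calls would also need changes. The multiprocessing pattern it serves -- a worker that receives all of its arguments as one tuple -- is usually written under Python 3 as in the sketch below; the names and file paths are illustrative only.

import multiprocessing

def worker(args):
    # unpack the argument tuple explicitly instead of in the signature
    candidates_vcf, filtered_vcf, reference = args
    # ... filtering logic would go here ...
    return filtered_vcf

if __name__ == '__main__':
    jobs = [('c1.vcf', 'f1.vcf', 'ref.fa'), ('c2.vcf', 'f2.vcf', 'ref.fa')]
    with multiprocessing.Pool(2) as pool:
        results = pool.map(worker, jobs)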
Example #5
def crossmap_maf_file(mapping,
                      infile,
                      outfile,
                      liftoverfile,
                      refgenome,
                      ref_name,
                      cstyle='a'):
    '''
	Convert genome coordinates in MAF (Mutation Annotation Format) files.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in MAF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	ref_name : str
		The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style of the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    # index refgenome file if it hasn't been done yet
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('#'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            continue
        elif line.startswith('Hugo_Symbol'):
            print(
                "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                % ("CrossMap", __version__,
                   datetime.date.today().strftime("%B%d,%Y"), liftoverfile,
                   refgenome),
                file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")
        else:

            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])
            #strand = fields[7]

            a = map_coordinates(mapping,
                                chrom,
                                start,
                                end,
                                '+',
                                chrom_style=cstyle)

            if a is None:
                print(line, file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr

                # update start coordinate
                fields[5] = target_start + 1

                # update end
                fields[6] = target_end

                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start,
                                                target_end).upper()
                except:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)

            else:
                print(line, file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
Example #6
def crossmap_vcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
	Convert genome coordinates in VCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.
	'''

    # index refgenome file if it hasn't been done yet
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #deal with meta-information lines.
        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('##fileformat'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##INFO'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FILTER'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FORMAT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##ALT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##SAMPLE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##PEDIGREE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)

        #meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True

        #update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(
                list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id.replace('chr', ''), target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              ('chr' + chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

            print(
                "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                % __version__,
                file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" %
                  datetime.date.today().strftime("%B%d,%Y"),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'): continue
            fields = str.split(line, maxsplit=7)
            total += 1

            chrom = fields[0]
            start = int(fields[1]) - 1  # 0 based
            end = start + len(fields[3])

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                # update chrom
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr

                # update start coordinate
                fields[1] = target_start + 1

                # update ref allele
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()
                except:
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = re.sub('END\=\d+', 'END=' + str(target_end),
                                   fields[7])

                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                if fields[3] != fields[4]:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    fail += 1
            else:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()
    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])
Example #7
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    '''if ins is a sequence, it will be inserted at start; otherwise, delete from start to end'''

    if args.seed is not None: random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    bammate = pysam.AlignmentFile(
        args.bamFileName, 'rb')  # use for mates to avoid iterator problems
    reffile = pysam.Fastafile(args.refFasta)
    vcffile = pysam.VariantFile(args.germline,
                                'r') if args.germline is not None else None
    tmpbams = []

    is_insertion = ins is not None
    is_deletion = ins is None

    snvfrac = float(args.snvfrac)

    mutstr = get_mutstr(chrom, start, end, ins, reffile)

    del_ln = 0
    if is_deletion:
        del_ln = end - start

    mutpos = start
    mutpos_list = [start]

    # optional CNV file
    cnv = None
    if (args.cnvfile):
        cnv = pysam.Tabixfile(args.cnvfile, 'r')

    log = open(
        'addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
        os.path.basename(args.outBamFile) + "." + "_".join(
            (chrom, str(start), str(end))) + ".log", 'w')

    tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(
        uuid4()) + ".bam"
    logger.info("%s creating tmp bam: %s" % (mutid, tmpoutbamname))
    outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'wb', template=bamfile)

    mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
        args,
        log,
        bamfile,
        bammate,
        chrom,
        mutpos,
        mutpos + del_ln + 1,
        mutpos_list,
        avoid=avoid,
        mutid_list=[mutid],
        is_insertion=is_insertion,
        is_deletion=is_deletion,
        ins_seq=ins,
        reffile=reffile,
        indel_start=start,
        indel_end=end,
        vcffile=vcffile)

    if mutfail:
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    # pick reads to change
    readlist = []
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            readlist.append(extqname)

    logger.info("%s len(readlist): %d" % (mutid, len(readlist)))
    readlist.sort()
    random.shuffle(readlist)

    if len(readlist) < int(args.mindepth):
        logger.warning("%s skipped, too few reads in region: %d" %
                       (mutid, len(readlist)))
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    if vaf is None:
        vaf = float(args.mutfrac
                    )  # default minor allele freq if not otherwise specified

    if cnv:  # cnv file is present
        if chrom in cnv.contigs:
            for cnregion in cnv.fetch(chrom, start, end):
                cn = float(
                    cnregion.strip().split()[3])  # expect chrom,start,end,CN
                logger.info(mutid + "\t" +
                            ' '.join(("copy number in snp region:", chrom,
                                      str(start), str(end), "=", str(cn))))
                if float(cn) > 0.0:
                    vaf = vaf / float(cn)
                else:
                    vaf = 0.0
                logger.info("%s adjusted VAF: %f" % (mutid, vaf))
    else:
        logger.info("%s selected VAF: %f" % (mutid, vaf))

    lastread = int(len(readlist) * vaf)

    # pick at least args.minmutreads if possible
    if lastread < int(args.minmutreads):
        if len(readlist) > int(args.minmutreads):
            lastread = int(args.minmutreads)
            logger.warning("%s forced %d reads" % (mutid, lastread))
        else:
            logger.warning(
                "%s dropped site with fewer reads than --minmutreads" % mutid)
            os.remove(tmpoutbamname)
            return None

    readtrack = dd(list)

    for readname in readlist:
        orig_name, readpos, pairend = readname.split(',')
        readtrack[orig_name].append('%s,%s' % (readpos, pairend))

    usedreads = 0
    newreadlist = []

    for orig_name in readtrack:
        for read_instance in readtrack[orig_name]:
            newreadlist.append(orig_name + ',' + read_instance)
            usedreads += 1

        if usedreads >= lastread:
            break

    readlist = newreadlist

    logger.info("%s picked: %d reads" % (mutid, len(readlist)))

    wrote = 0
    nmut = 0
    mut_out = {}
    # change reads from .bam to mutated sequences
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            if not args.nomut and extqname in readlist:
                qual = read.qual  # changing seq resets qual (see pysam API docs)
                read.seq = mutreads[extqname]  # make mutation
                read.qual = qual
                nmut += 1
        if not hasSNP or args.force:
            wrote += 1
            mut_out[extqname] = read

    muts_written = {}

    for extqname in mut_out:
        if extqname not in muts_written:
            outbam_muts.write(mut_out[extqname])
            muts_written[extqname] = True

            if mutmates[extqname] is not None:
                # is mate also in mutated list?
                mate_read = mutmates[extqname]

                pairname = 'F'  # read is first in pair
                if mate_read.is_read2:
                    pairname = 'S'  # read is second in pair
                if not mate_read.is_paired:
                    pairname = 'U'  # read is unpaired

                mateqname = ','.join(
                    (mate_read.qname, str(mate_read.pos), pairname))

                if mateqname in mut_out:
                    # yes: output mutated mate
                    outbam_muts.write(mut_out[mateqname])
                    muts_written[mateqname] = True

                else:
                    # no: output original mate
                    outbam_muts.write(mate_read)

    logger.info("%s wrote: %d, mutated: %d" % (mutid, wrote, nmut))

    if not hasSNP or args.force:
        outbam_muts.close()
        aligners.remap_bam(args.aligner,
                           tmpoutbamname,
                           args.refFasta,
                           alignopts,
                           threads=int(args.alignerthreads),
                           mutid=mutid,
                           paired=(not args.single),
                           insane=args.insane)

        outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'rb')
        coverwindow = 1
        incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                    mutpos + del_ln + coverwindow)
        outcover = countReadCoverage(outbam_muts, chrom, mutpos - coverwindow,
                                     mutpos + del_ln + coverwindow)

        avgincover = float(sum(incover)) / float(len(incover))
        avgoutcover = float(sum(outcover)) / float(len(outcover))
        spikein_frac = 0.0
        if wrote > 0:
            spikein_frac = float(nmut) / float(wrote)

        # qc cutoff for final snv depth
        if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover >=
                float(args.coverdiff)) or args.force:
            tmpbams.append(tmpoutbamname)
            indelstr = ''
            if is_insertion:
                indelstr = ':'.join(('INS', chrom, str(start), ins))
            else:
                indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

            snvstr = chrom + ":" + str(start) + "-" + str(
                end) + " (VAF=" + str(vaf) + ")"
            log.write("\t".join(("indel", indelstr, str(mutpos), mutstr,
                                 str(avgincover), str(avgoutcover),
                                 str(spikein_frac), str(maxfrac))) + "\n")
        else:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            if os.path.exists(tmpoutbamname + '.bai'):
                os.remove(tmpoutbamname + '.bai')

            logger.warning("%s dropped for outcover/incover < %s" %
                           (mutid, str(args.coverdiff)))
            return None

    outbam_muts.close()
    bamfile.close()
    bammate.close()
    log.close()

    return sorted(tmpbams)
Example #8
def main(args):
    logger.info("starting %s called with args: %s" %
                (sys.argv[0], ' '.join(sys.argv)))
    bedfile = open(args.varFileName, 'r')
    reffile = pysam.Fastafile(args.refFasta)

    if not os.path.exists(args.bamFileName + '.bai'):
        logger.error("input bam must be indexed, not .bai file found for %s" %
                     args.bamFileName)
        sys.exit(1)

    alignopts = {}
    if args.alignopts is not None:
        alignopts = dict([o.split(':') for o in args.alignopts.split(',')])

    aligners.checkoptions(args.aligner, alignopts)

    # load readlist to avoid, if specified
    avoid = None
    if args.avoidreads is not None:
        avoid = dictlist(args.avoidreads)

    # make a temporary file to hold mutated reads
    outbam_mutsfile = "addindel." + str(uuid4()) + ".muts.bam"
    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    outbam_muts = pysam.AlignmentFile(outbam_mutsfile, 'wb', template=bamfile)
    outbam_muts.close()
    bamfile.close()
    tmpbams = []

    if not os.path.exists(args.tmpdir):
        os.mkdir(args.tmpdir)
        logger.info("created tmp directory: %s" % args.tmpdir)

    if not os.path.exists('addindel_logs_' +
                          os.path.basename(args.outBamFile)):
        os.mkdir('addindel_logs_' + os.path.basename(args.outBamFile))
        logger.info("created directory: addindel_logs_%s" %
                    os.path.basename(args.outBamFile))

    assert os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile)
                          ), "could not create output directory!"
    assert os.path.exists(args.tmpdir), "could not create temporary directory!"

    pool = Pool(processes=int(args.procs))
    results = []

    ntried = 0
    for bedline in bedfile:
        if ntried < int(args.numsnvs) or int(args.numsnvs) == 0:
            c = bedline.strip().split()
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            vaf = float(c[3])
            type = c[4]
            ins = None

            assert type in ('INS', 'DEL')
            if type == 'INS':
                ins = c[5]

            # make mutation (submit job to thread pool)
            result = pool.apply_async(
                makemut, [args, chrom, start, end, vaf, ins, avoid, alignopts])
            results.append(result)
            ntried += 1

    for result in results:
        tmpbamlist = result.get()
        if tmpbamlist is not None:
            for tmpbam in tmpbamlist:
                if os.path.exists(tmpbam):
                    tmpbams.append(tmpbam)

    if len(tmpbams) == 0:
        logger.error("no succesful mutations")
        sys.exit()

    tmpbams.sort()

    # merge tmp bams
    if len(tmpbams) == 1:
        os.rename(tmpbams[0], outbam_mutsfile)
    elif len(tmpbams) > 1:
        mergebams(tmpbams, outbam_mutsfile, maxopen=int(args.maxopen))

    bedfile.close()

    # cleanup
    for bam in tmpbams:
        if os.path.exists(bam):
            os.remove(bam)
        if os.path.exists(bam + '.bai'):
            os.remove(bam + '.bai')

    if os.listdir(args.tmpdir) == []:
        os.rmdir(args.tmpdir)

    if args.skipmerge:
        logger.info("skipping merge, plase merge reads from %s manually." %
                    outbam_mutsfile)
    else:
        if args.tagreads:
            from bamsurgeon.markreads import markreads
            tmp_tag_bam = 'tag.%s.bam' % str(uuid4())
            markreads(outbam_mutsfile, tmp_tag_bam)
            move(tmp_tag_bam, outbam_mutsfile)
            logger.info("tagged reads.")

        logger.info("done making mutations, merging mutations into %s --> %s" %
                    (args.bamFileName, args.outBamFile))
        replace(args.bamFileName,
                outbam_mutsfile,
                args.outBamFile,
                seed=args.seed)

        #cleanup
        os.remove(outbam_mutsfile)

    var_basename = '.'.join(os.path.basename(args.varFileName).split('.')[:-1])
    bam_basename = '.'.join(os.path.basename(args.outBamFile).split('.')[:-1])

    vcf_fn = bam_basename + '.addindel.' + var_basename + '.vcf'

    makevcf.write_vcf_indel(
        'addindel_logs_' + os.path.basename(args.outBamFile), args.refFasta,
        vcf_fn)

    logger.info('vcf output written to ' + vcf_fn)
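The variant file parsed above is a BED-like table with columns chrom, start, end, VAF, and type (INS or DEL), plus the inserted sequence in a sixth column for insertions. An illustrative input (made-up coordinates) would look like:

chr1    1500000 1500000 0.25    INS     ACGTACGT
chr2    2300000 2300050 0.50    DEL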
Example #9
if __name__ == '__main__':
    # user input
    bam_dir = argv[1]  # script to move all relevant files (env samples (BAM, FQ, CNS_5)) from NGS_runa to data3/sewer
    min_depth = int(argv[2])  # now 5, maybe change later
    refseq_path = argv[3]
    # preparations
    bam_dir = bam_dir + '/' if not bam_dir.endswith(
        '/') else bam_dir  # make sure path ends with '/'
    # get reference name
    refseq_name = os.path.splitext(os.path.basename(refseq_path))[0]  # note: str.strip('.fasta') would drop characters, not the extension
    # index refseq samtools in python
    pysam.faidx(refseq_path)
    refseq_series = pd.Series(
        [x for x in pysam.Fastafile(refseq_path).fetch(reference=refseq_name)])
    excel_mutTable = pd.read_excel(
        "/data/projects/Dana/scripts/covid19/mutationsTable.xlsx",
        sheet_name=None,
        engine='openpyxl')

    for name in excel_mutTable:
        frame = excel_mutTable[name]
        excel_mutTable[name] = frame[
            frame['Mutation type'].str.lower() != 'insertion']
        excel_mutTable[name][
            'lineage'] = name  # add a lineage column to all variant's tables

    # uniq_lineages = [lin.rsplit('_', 1)[0] for lin in excel_mutTable]
    uniq_lineages = excel_mutTable.keys()
    muttable_by_lineage = excel_mutTable
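Building refseq_series as one reference base per element gives constant-time lookup of the reference base at any 0-based index, which the downstream mutation-table matching presumably relies on. A usage sketch (the position is made up):

pos_1based = 23063                         # hypothetical 1-based genomic position of interest
ref_base = refseq_series[pos_1based - 1]   # Series is 0-based, positions are 1-based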
Example #10
    def testFTPView(self):
        if not check_url(self.url):
            return

        with pysam.Fastafile(self.url) as f:
            self.assertEqual(len(f.fetch("chr1", 0, 1000)), 1000)
Example #11
                    sam_fh_in.getrname(r.tid)))
        # NOTE pysam's set_tag inferred value_type 'd' which is
        # undefined according to
        # https://samtools.github.io/hts-specs/SAMv1.pdf
        r.set_tag(IDENT_TAG,
                  round(ident * 100, 1),
                  value_type='f',
                  replace=REPLACE_TAG)
        sam_fh_out.write(r)


if __name__ == "__main__":
    REPLACE_TAG = False

    try:
        sam_in, sam_out, ref_fa = sys.argv[1:]
    except ValueError:
        sys.stderr.write(
            "FATAL: Need input and output BAM (stdout supported)  as well as the (indexed) reference as (only) arguments (but got {})\n"
            .format(' '.join(sys.argv[1:])))
        sys.exit(1)
    assert not os.path.exists(sam_out)

    fasta_fh = pysam.Fastafile(ref_fa)
    sam_fh_in = pysam.Samfile(sam_in)  # mode automatically inferred
    out_mode = 'w'
    if sam_out.endswith(".bam"):
        out_mode += "b"
    sam_fh_out = pysam.Samfile(sam_out, out_mode, template=sam_fh_in)
    main(sam_fh_in, sam_fh_out, fasta_fh)
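Only the tail of the original main() survives in this fragment. A simplified, self-contained reconstruction of such a function is sketched below: it estimates per-read identity from the NM tag rather than from the reference FASTA (which the original presumably uses, given the fasta_fh argument), and IDENT_TAG is a hypothetical tag name standing in for the one defined elsewhere in the original script.

IDENT_TAG = 'XI'  # hypothetical tag name; the original defines its own

def main(sam_fh_in, sam_fh_out, fasta_fh):
    # fasta_fh is unused in this sketch; the original presumably compares
    # each read against the reference instead of relying on the NM tag.
    for r in sam_fh_in:
        if r.is_unmapped:
            sam_fh_out.write(r)
            continue
        aligned_len = r.query_alignment_length
        nm = r.get_tag("NM") if r.has_tag("NM") else 0
        ident = (aligned_len - nm) / float(aligned_len) if aligned_len else 0.0
        r.set_tag(IDENT_TAG,
                  round(ident * 100, 1),
                  value_type='f',
                  replace=REPLACE_TAG)
        sam_fh_out.write(r)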