示例#1
0
def reads_replace(bam_file, total_chosen_reads, seqer, flow_order, lib_key,
                  barcode, tag, out_dir, aligner, aligner_index, is_single):
    """Swap edited sequences into the chosen reads and remap them.

    Every read of ``bam_file`` whose name is a key of ``total_chosen_reads``
    is collected (keyed by name, then by the value of ``getReadStrand``) and
    written to ``<out_dir>/edit.bam``; when its strand is present in
    ``total_chosen_reads[name]`` its sequence and qualities are replaced
    first.  All reads whose name was not collected go to
    ``<out_dir>/exclude.bam``.  ``edit.bam`` is then remapped, sorted and
    indexed.

    Args:
        bam_file: Path of the input BAM (``bam.fetch()`` is called without a
            region, so pysam needs it to be indexed).
        total_chosen_reads: Mapping ``read name -> {strand: edited read}``;
            the edited read must expose ``query_sequence`` and
            ``query_qualities``.
        seqer: When equal to ``"life"`` each edited read is additionally
            passed through ``deal_life_reads``.
        flow_order, lib_key, barcode: Forwarded to ``deal_life_reads``.
        tag: When truthy, ``add_tag`` is applied to each edited read and
            ``bam_add_tag`` to the sorted remapped BAM.
        out_dir: Directory receiving every intermediate and output file.
        aligner, aligner_index, is_single: Forwarded to ``remap``.

    Returns:
        Tuple ``(remapped_sorted_bam_path, exclude_bam_path)``.
    """
    edit_bam_file = os.path.join(out_dir, "edit.bam")
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")

    bam = pysam.AlignmentFile(bam_file)
    try:
        # Collect every alignment belonging to a chosen read name.  Entries
        # are keyed by (name, strand), so a later alignment with the same
        # name and strand replaces an earlier one.
        edit_bam_reads = {}
        for read in bam.fetch():
            read_name = read.query_name
            if read_name not in total_chosen_reads:
                continue
            strand = getReadStrand(read)
            if strand in total_chosen_reads[read_name]:
                my_read = total_chosen_reads[read_name][strand]
                read.query_sequence = my_read.query_sequence
                read.query_qualities = my_read.query_qualities
                if seqer == "life":
                    read = deal_life_reads(read, flow_order, lib_key, barcode)
                if tag:
                    read = add_tag(read)
            # Reads of a chosen name on a non-chosen strand are kept
            # unmodified so they are remapped together with the edited one.
            edit_bam_reads.setdefault(read_name, {})[strand] = read

        # write edited reads into edit.bam
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            for read_info in edit_bam_reads.values():
                for read in read_info.values():
                    edit_bam.write(read)
        finally:
            edit_bam.close()

        # write not edited reads into exclude.bam
        exclude_bam = pysam.AlignmentFile(exclude_bam_file, 'wb', template=bam)
        try:
            for read in bam.fetch():
                if read.query_name not in edit_bam_reads:
                    exclude_bam.write(read)
        finally:
            exclude_bam.close()
    finally:
        # The input handle is no longer needed once both outputs are written.
        bam.close()

    # remap the edited reads, carrying over the first @RG header line (if any)
    header = os.path.join(out_dir, 'bam.header')
    # NOTE(review): paths are interpolated into a shell command unquoted;
    # a path containing spaces or shell metacharacters will break this.
    os.system('samtools view -H %s|grep "^@RG" > %s' % (bam_file, header))
    with open(header, 'r') as header_fh:
        # NOTE(review): tab characters in the @RG line are passed through
        # unescaped here -- confirm that is what remap() expects.
        head = header_fh.readline().rstrip() or None

    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index,
          edit_bam_file,
          edit_remap_bam_file,
          aligner,
          is_single,
          header=head)
    edit_remap_bam_sorted_prefix = os.path.join(out_dir, "edit.remap.sort")
    edit_remap_bam_sorted_file = os.path.join(out_dir, "edit.remap.sort.bam")
    bamSort(edit_remap_bam_file, edit_remap_bam_sorted_prefix)
    bamIndex(edit_remap_bam_sorted_file)
    if tag:
        # NOTE(review): input and output resolve to the same path
        # ("edit.remap.sort.bam") -- confirm bam_add_tag supports that.
        edit_remap_addtag_file = os.path.join(out_dir, "edit.remap.sort.bam")
        bam_add_tag(edit_remap_bam_sorted_file, edit_remap_addtag_file)
    else:
        edit_remap_addtag_file = edit_remap_bam_sorted_file

    return edit_remap_addtag_file, exclude_bam_file
示例#2
0
def deal_haplotype(bam_file, haplotype, reffasta, haplotype_prefix, mindepth,
                   minmutreads, minmapq, diffcover, is_single,
                   is_multmapfilter, aligner, aligner_index, **kwargs):
    """Pick, edit and remap the reads that will carry one haplotype.

    Steps: fetch the reads overlapping the haplotype, check depth and the
    expected number of mutated reads, choose reads with ``pick_reads``, edit
    them with ``editRead``, write them to
    ``<haplotype_prefix>.chosen.edited.bam``, remap that file to
    ``<haplotype_prefix>.chosen.remap.bam`` and validate the result with
    ``judge_coverdiff``.

    Args:
        bam_file: Path of the input BAM.
        haplotype: Object exposing ``chrom``, ``start``, ``end``, ``freq``
            and ``mutList``.
        reffasta: Forwarded to ``editRead``.
        haplotype_prefix: Path prefix for the two intermediate BAM files.
        mindepth: Minimum number of fetched reads (compared as ``int``).
        minmutreads: Minimum value of ``int(depth * haplotype.freq)``
            (compared as ``int``).
        minmapq, is_multmapfilter: Forwarded to ``pick_reads``.
        diffcover: Converted to ``float`` and forwarded to
            ``judge_coverdiff``.
        is_single: Truthy for single-end data; also forwarded to
            ``pick_reads`` and ``remap``.
        aligner, aligner_index: Forwarded to ``remap``.
        **kwargs: Accepted and ignored.

    Returns:
        On success the 4-tuple
        ``(my_chosen_reads, my_mate_reads, real_mut_reads_num, depth)``;
        on any failed check the 2-tuple ``(False, message)``.
    """
    bam = pysam.AlignmentFile(bam_file, 'rb')
    try:
        reads_dict = OrderedDict()
        depth = 0
        for read in bam.fetch(reference=haplotype.chrom,
                              start=haplotype.start,
                              end=haplotype.end + 1):
            # depth counts every fetched alignment, including the
            # secondary/supplementary ones skipped below.
            depth += 1
            # 2048 (0x800) is the SAM "supplementary alignment" flag bit.
            if (read.reference_start is not None
                    and not read.is_secondary
                    and not (read.flag & 2048)):
                if read.query_name not in reads_dict:
                    reads_dict[read.query_name] = {}
                strand = getReadStrand(read)
                reads_dict[read.query_name][strand] = read

        # judge depth and mut reads whether qualified
        if depth < int(mindepth):
            print("depth less than min depth!")
            return False, "haplotype in position %s:%s-%s: depth less than min depth(%s)" % (
                haplotype.chrom, haplotype.start, haplotype.end, mindepth)
        mut_reads_num = int(depth * haplotype.freq)
        if mut_reads_num < int(minmutreads):
            print("mutation reads num less than minmutreads!")
            return False, "haplotype in position %s:%s-%s: mut reads less than min mut reads(%s)" % (
                haplotype.chrom, haplotype.start, haplotype.end, minmutreads)

        print("start pick reads")
        res = pick_reads(bam, reads_dict, mut_reads_num, is_single, minmapq,
                         is_multmapfilter)
        if res[0] is False:
            return False, "haplotype in position %s:%s-%s: %s" % (
                haplotype.chrom, haplotype.start, haplotype.end, res[1])
        chosen_reads, mate_reads = res
        print("end pick reads")

        # edit the chosen reads and dump them for remapping
        tmp_bam_file = haplotype_prefix + ".chosen.edited.bam"
        tmp_bam = pysam.AlignmentFile(tmp_bam_file, 'wb', template=bam)
        try:
            (my_chosen_reads, my_mate_reads, chosen_reads_num,
             real_mut_reads_num) = _edit_and_write_chosen_reads(
                 chosen_reads, mate_reads, reffasta, haplotype, is_single,
                 tmp_bam)
        finally:
            tmp_bam.close()

        # alignment and judge coverdiff whether qualified
        chosen_bam_file = haplotype_prefix + ".chosen.remap.bam"
        remap(aligner_index, tmp_bam_file, chosen_bam_file, aligner, is_single)
        chosen_bam = pysam.AlignmentFile(chosen_bam_file)
        try:
            qualified = judge_coverdiff(bam, depth, chosen_bam,
                                        chosen_reads_num, haplotype,
                                        float(diffcover))
        finally:
            chosen_bam.close()

        if qualified:
            return my_chosen_reads, my_mate_reads, real_mut_reads_num, depth
        return False, "haplotype in position %s:%s-%s: coverdiff is less than minDiffCover" % (
            haplotype.chrom, haplotype.start, haplotype.end)
    finally:
        # Release the input handle on every return path.
        bam.close()


def _edit_and_write_chosen_reads(chosen_reads, mate_reads, reffasta, haplotype,
                                 is_single, tmp_bam):
    """Edit each chosen read and write the kept ones (plus mates) to tmp_bam.

    Returns ``(my_chosen_reads, my_mate_reads, chosen_reads_num,
    real_mut_reads_num)``.  ``real_mut_reads_num`` counts every read that
    ``editRead`` edited successfully, whether or not it was kept afterwards;
    ``chosen_reads_num`` counts only the edited reads written to ``tmp_bam``.
    """
    my_chosen_reads = {}
    my_mate_reads = {}
    chosen_reads_num = 0
    real_mut_reads_num = 0

    for readName, readInfo in chosen_reads.items():
        # An entry is created for every chosen name, even if nothing is kept.
        my_chosen_reads[readName] = {}
        edited = []  # (strand, Read wrapper, pysam read) per successful edit
        for strand, read in readInfo.items():
            my_read = Read(read)
            res = editRead(my_read, reffasta, haplotype.mutList)
            if res is False:
                continue
            real_mut_reads_num += 1
            sequence, quality, _shift = res
            read.query_sequence = sequence
            read.query_qualities = quality
            edited.append((strand, my_read, read))

        # Paired data keeps a template only when both ends were edited, or
        # one end was edited and its mate is available in mate_reads.
        has_mate = (not is_single and len(edited) == 1
                    and readName in mate_reads)
        if not (is_single or has_mate or len(edited) == 2):
            continue

        for strand, my_read, read in edited:
            my_chosen_reads[readName][strand] = my_read
            tmp_bam.write(read)
            chosen_reads_num += 1
        if has_mate:
            mate_read = mate_reads[readName]
            my_mate_reads[readName] = Read(mate_read)
            tmp_bam.write(mate_read)

    return my_chosen_reads, my_mate_reads, chosen_reads_num, real_mut_reads_num
示例#3
0
def merge_edit_bam(bam_file, out_dir, is_single, total_modify_reads_file,
                   total_add_reads_file, used_bam_file, total_modify_list,
                   total_add_list, seqer, aligner, aligner_index, flow_order,
                   lib_key, barcode, tag):
    """Build ``edit.bam`` from the modify/add lists, then remap and sort it.

    Each line of ``total_modify_list`` / ``total_add_list`` is a
    comma-separated record whose first field (single-end) or first two fields
    (paired-end) are keys into the reads of ``used_bam_file`` as produced by
    ``getKeyName``.  Every listed read is deep-copied, given the sequence and
    qualities from the matching sequence file, renamed with
    ``get_new_readname()`` and written to ``<out_dir>/edit.bam``; the
    old/new name mapping is logged to ``<out_dir>/readname_convert.txt``.
    Lines naming a read that is missing from ``used_bam_file`` are skipped.

    Args:
        bam_file: Input BAM; supplies the output header template and the
            ``@RG`` line passed to ``remap``.
        out_dir: Directory receiving all output files.
        is_single: Truthy for single-end data (one read per list line).
        total_modify_reads_file, total_add_reads_file: Inputs of
            ``get_sequence_dict``, which yields ``(sequences, qualities)``.
        used_bam_file: BAM holding the original reads to copy.
        total_modify_list, total_add_list: Comma-separated list files.
        seqer: When ``"life"``, single-end reads from the modify list are
            passed through ``deal_life_reads``.
        aligner, aligner_index: Forwarded to ``remap``.
        flow_order, lib_key, barcode: Forwarded to ``deal_life_reads``.
        tag: Unused by this function.

    Returns:
        Path ``<out_dir>/edit.remap.sort.bam``.
    """
    edit_bam_file = os.path.join(out_dir, "edit.bam")
    readname_convert_file = os.path.join(out_dir, "readname_convert.txt")
    reads_per_line = 1 if is_single else 2

    # NOTE(review): deal_life_reads is applied only to single-end reads of
    # the modify list, never to the add list or to paired reads -- confirm
    # this asymmetry is intended.
    life_fix = None
    if is_single and seqer == "life":
        life_fix = lambda read: deal_life_reads(read, flow_order, lib_key,
                                                barcode)

    bam = pysam.AlignmentFile(bam_file, 'rb')
    try:
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            # The context manager guarantees the mapping file is flushed.
            with open(readname_convert_file, 'w') as fout_convert:
                used_reads = _load_used_reads(used_bam_file)
                modify_reads_seq, modify_reads_quan = get_sequence_dict(
                    total_modify_reads_file)
                add_reads_seq, add_reads_quan = get_sequence_dict(
                    total_add_reads_file)

                _write_renamed_reads(
                    total_modify_list, reads_per_line, used_reads,
                    modify_reads_seq, modify_reads_quan, edit_bam,
                    fout_convert, post_edit=life_fix,
                    echo_first_name=not is_single)
                _write_renamed_reads(
                    total_add_list, reads_per_line, used_reads,
                    add_reads_seq, add_reads_quan, edit_bam, fout_convert)
        finally:
            edit_bam.close()
    finally:
        bam.close()

    header = os.path.join(out_dir, 'bam.header')
    # NOTE(review): paths are interpolated into a shell command unquoted;
    # a path containing spaces or shell metacharacters will break this.
    os.system('samtools view -H %s|grep "^@RG" > %s' % (bam_file, header))
    with open(header, 'r') as header_fh:
        # Real tabs are turned into the two-character sequence backslash-t
        # before the line is handed to remap().
        head = header_fh.readline().rstrip().replace('\t', '\\t') or None

    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner,
          is_single, head)
    edit_remap_bam_sorted_prefix = os.path.join(out_dir, "edit.remap.sort")
    edit_remap_bam_sorted_file = os.path.join(out_dir, "edit.remap.sort.bam")
    bamSort(edit_remap_bam_file, edit_remap_bam_sorted_prefix)
    return edit_remap_bam_sorted_file


def _load_used_reads(used_bam_file):
    """Return ``{getKeyName(read): read}`` for every read of used_bam_file."""
    used_reads = {}
    used_bam = pysam.AlignmentFile(used_bam_file, 'rb')
    try:
        for read in used_bam.fetch():
            used_reads[getKeyName(read)] = read
    finally:
        used_bam.close()
    return used_reads


def _write_renamed_reads(list_file, reads_per_line, used_reads, seq_by_name,
                         qual_by_name, edit_bam, fout_convert,
                         post_edit=None, echo_first_name=False):
    """Copy, re-sequence, rename and write the reads named in list_file.

    The first ``reads_per_line`` comma-separated fields of each line are the
    read keys; all reads of one line share a single new name.  Blank lines
    are skipped, as are lines naming a key absent from ``used_reads``.
    ``post_edit``, when given, is applied to each renamed copy before it is
    logged and written.  ``echo_first_name`` prints the first key of every
    processed line.
    """
    with open(list_file) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # File iteration never yields "", so blank lines must be
                # detected after stripping the newline.
                continue
            data = line.split(",")
            names = [data[i] for i in range(reads_per_line)]
            if any(name not in used_reads for name in names):
                continue

            orig_reads = [used_reads[name] for name in names]
            new_reads = [copy.deepcopy(read) for read in orig_reads]
            if echo_first_name:
                print(names[0])
            for name, new_read in zip(names, new_reads):
                new_read.query_sequence = seq_by_name[name]
                new_read.query_qualities = qual_by_name[name]

            new_name = get_new_readname()
            for new_read in new_reads:
                new_read.query_name = new_name
            if post_edit is not None:
                new_reads = [post_edit(read) for read in new_reads]

            for orig_read, new_read in zip(orig_reads, new_reads):
                fout_convert.write(
                    "%s: %s, %s, %s-%s\n" %
                    (new_name, orig_read.query_name, new_read.is_read1,
                     new_read.reference_start, new_read.reference_end))
            for new_read in new_reads:
                edit_bam.write(new_read)