Example #1
def gtf2chain(chain_file, input_file, output_file, chain_genes=False):
    """

    :param chain_file:
    :param input_file:
    :param output_file:
    :param chain_genes:
    :return:
    """
    start = time.time()
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))

    chain_file = g2g_fu.check_file(chain_file)
    input_file = g2g_fu.check_file(input_file)
    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("FROM CHAIN FILE: {0}".format(chain_file))
    LOG.info("TO CHAIN FILE: {0}".format(output_file))

    temp_db = g2g_fu.gen_file_name("_g2gtempfile", output_file_dir, ".db3")

    gtf2db(input_file, temp_db)

    db2chain(chain_file, temp_db, output_file, chain_genes)

    g2g_fu.delete_file(temp_db)

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #2
def fasta_extract_exons(fasta_file, database_file, output, raw=False):
    """
    Extract exon sequences from a Fasta file using an annotation database.

    :param fasta_file: the input Fasta file or an open FastaFile object
    :param database_file: the annotation database created by gtf2db
    :param output: the output Fasta file, or None to write to stdout
    :param raw: write bare sequence without Fasta headers
    """
    start = time.time()

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta_file = g2g_fu.check_file(fasta_file)
        fasta = FastaFile(fasta_file)

    database_file = g2g_fu.check_file(database_file)

    fasta_out = sys.stdout

    if output:
        output = g2g_fu.check_file(output, 'w')
        fasta_out = open(output, "w")

    LOG.info("FASTA FILE: {0}".format(fasta.filename))
    LOG.info("DATABASE FILE: {0}".format(database_file))
    LOG.info("OUTPUT FILE: {0}".format(fasta_out.name))

    try:
        transcripts = get_transcripts_simple(database_file)
        for i, transcript in enumerate(transcripts):

            if transcript.seqid not in fasta.references:
                continue

            for ensembl_id, exon in transcript.exons.iteritems():
                LOG.debug("Exon={0}".format(exon))

                partial_seq = fasta.fetch(exon.seqid, exon.start-1, exon.end)
                partial_seq_str = partial_seq

                if transcript.strand == -1:
                    partial_seq_str = str(reverse_complement_sequence(partial_seq))

                LOG.debug("{0}:{1}-{2} (Length: {3})\n{4}".format(exon.seqid, exon.start, exon.end, len(partial_seq), partial_seq_str))

                if raw:
                    fasta_out.write(partial_seq_str)
                else:
                    fasta_id = ">{0} {1}:{2}-{3}\n".format(exon.ensembl_id, exon.seqid, exon.start, exon.end)
                    fasta_out.write(fasta_id)

                    for line in wrap_sequence(partial_seq_str):
                        fasta_out.write(line.strip())
                        fasta_out.write('\n')

    except (G2GValueError, G2GFastaError) as e:
        LOG.info(e.msg.rstrip())
        raise

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #3
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False, passed=False, quality=False, diploid=False):
    """

    :param input_file:
    :param fasta_file:
    :param strain:
    :param output_file:
    :param vcf_keep:
    :param passed:
    :param quality:
    :param diploid:
    :return:
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None

    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)
    sample_index = None

    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = dict([(sample, i) for i, sample in enumerate(samples)])
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError("Unknown strain '{0}', valid strains are: {1}".format(strain, ", ".join(samples)))
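
Because the function reads the VCF through TabixFile, the input is expected to be bgzip-compressed and tabix-indexed. A minimal call sketch, where the import path, strain name, and file names are assumptions:

# Hypothetical usage of vcf2chain; paths and strain are placeholders.
from g2gtools.vcf2chain import vcf2chain

vcf2chain('strains.vcf.gz', 'reference.fa', 'CAST_EiJ',
          'reference_to_CAST.chain', vcf_keep=True, passed=True)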
Example #4
def offset2chain(from_file, to_file, output_file):
    """
    Convert Seqnature offset files to new chain file.

    :param from_file: from Chromosome File (see docs)
    :param to_file: to Chromosome File (see docs)
    :param output_file: the output chain file
    """
    start = time.time()

    from_file = g2g_fu.check_file(from_file)
    to_file = g2g_fu.check_file(to_file)

    output_file_name = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file_name)

    LOG.info("FROM FILE: {0}".format(from_file))
    LOG.info("TO FILE: {0}".format(to_file))
    LOG.info("CHAIN FILE: {0}".format(output_file_name))

    LOG.info("Generating chain file...")

    try:
        chromosomes = offset_parse_chromosomes(from_file, to_file)

        for c, chromosome in chromosomes.iteritems():
            LOG.debug('Examining chromosome: {0}'.format(chromosome))
            if chromosome['file_path']:
                offset_chromosome_to_chain(chromosome, output_file)
            else:
                LOG.debug("No file for {0}, so skipping".format(chromosome))

        LOG.info("Chain file created")

    except Exception as e:
        raise G2GChainFileError("Unable to generate chain file: {0}".format(e))
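
A minimal call sketch; the import path is an assumption and the offset file names are placeholders (see the Seqnature documentation for their format).

# Hypothetical usage of offset2chain.
from g2gtools.chain import offset2chain

offset2chain('from_chromosomes.txt', 'to_chromosomes.txt', 'offsets.chain')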
Example #5
def convert_bed_file(chain_file, input_file, output_file, reverse=False):
    """
    Convert BED coordinates.

    The mappings of coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param str file_in: the input BED file
    :type file_in: string
    :param file_out: the output BED file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :return: Nothing
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    bed_out = open(output_file_name, "w")
    bed_unmapped_file = open(unmapped_file_name, "w")

    LOG.info("Converting BED file")

    bed_file = BED(input_file)

    total = 0
    success = 0
    fail = 0

    # BED is 0 based, bx-python is 0 based

    try:
        for record in bed_file:
            # skip over "track" lines
            if not bed_file.current_line_is_bed:
                bed_out.write(bed_file.current_line)
                bed_out.write("\n")
                continue

            total += 1

            mappings = chain_file.find_mappings(record.chrom, record.start, record.end)

            # no mappings means the record is unmapped
            if mappings:
                success += 1
            else:
                LOG.debug("Fail due to no mappings")
                bed_unmapped_file.write(bed_file.current_line)
                fail += 1
                continue

            start = mappings[0].to_start
            end = mappings[-1].to_end

            LOG.debug("({0}, {1}) => ({2}, {3})".format(record.start, record.end, start, end))

            elems = bed_file.current_line.split()
            elems[1] = start
            elems[2] = end

            bed_out.write("\t".join(map(str, elems)))
            bed_out.write("\n")

        bed_out.close()
        bed_unmapped_file.close()

        LOG.info("Converted {0} of {1} records".format(success, total))
    except G2GLocationError as le:
        LOG.error("{0}: {1}".format(le.message, bed_file.current_line))
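
A minimal call sketch with an assumed import path and placeholder file names. Records that cannot be lifted over are written to "<output>.unmapped".

# Hypothetical usage of convert_bed_file.
from g2gtools.bed import convert_bed_file

convert_bed_file('reference_to_strain.chain', 'regions.bed',
                 'regions.strain.bed', reverse=False)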
Example #6
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """

    :param fasta_file:
    :param chain_file:
    :param locations:
    :param output_file:
    :param bgzip:
    :param reverse:
    :return:
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            if len(line) > 7:
                LOG.debug("Adding chromosome {0}".format(chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                                  'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                                  'header_chain':chain_file.current_chain_header, 'lines': []}
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line
                    fragment = chain_line[0]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break

                else:

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases

                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise
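
A minimal call sketch with an assumed import path and placeholder file names. Passing locations=None transforms every sequence in the Fasta file.

# Hypothetical usage of fasta_transform.
from g2gtools.fasta import fasta_transform

fasta_transform('reference.fa', 'reference_to_strain.chain', None,
                'strain.fa', bgzip=True, reverse=False)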
Example #7
def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies.  These coordinates
    are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param str file_in: the input SAM or BAM file
    :type file_in: string
    :param file_out: the output SAM or file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    if not isinstance(file_in, pysam.Samfile):
        try:
            sam_file = pysam.Samfile(file_in, 'rb')
        except:
            sam_file = pysam.Samfile(file_in, 'r')

        if len(sam_file.header) == 0:
            raise G2GBAMError("BAM/SAM File has no header information")
    else:
        # file_in was passed in as an already opened Samfile
        sam_file = file_in

    LOG.info("Converting BAM file")

    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': 1.0, 'SO': 'coordinate'}

    # replace SQ
    tmp = []
    name_to_id = {}
    for ref_id, ref_name in enumerate(sorted(chain_file.chrom_size_to)):
        tmp.append({'LN': chain_file.chrom_size_from[ref_name], 'SN': ref_name})
        name_to_id[ref_name] = ref_id

    new_header['SQ'] = tmp

    if 'PG' not in new_header:
        new_header['PG'] = []

    new_header['PG'].append({'ID': 'gtgtools', 'VN': 1.0})

    if 'CO' not in new_header:
        new_header['CO'] = []

    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    dir, temp_file_name = os.path.split(file_out)
    parts = temp_file_name.split('.')
    ext = parts[-1]

    if ext.lower() == 'bam':
        new_file = pysam.Samfile(file_out, 'wb', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wb', template=sam_file)
    elif ext.lower() == 'sam':
        new_file = pysam.Samfile(file_out, 'wh', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wh', template=sam_file)
    else:
        raise G2GBAMError("Unable to create new file based upon file extension")

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {'total': 0,
                      'fail_cannot_map': 0,
                      'success_simple': 0,
                      'success_complex': 0}

    map_statistics_pair = {'total': 0,
                           'fail_cannot_map': 0,
                           'success_1_fail_2_simple': 0,
                           'success_1_fail_2_complex': 0,
                           'success_1_simple_2_fail': 0,
                           'success_1_simple_2_simple': 0,
                           'success_1_simple_2_complex': 0,
                           'success_1_complex_2_fail': 0,
                           'success_1_complex_2_simple': 0,
                           'success_1_complex_2_complex': 0}

    try:
        while True:
            if total and total % 10000 == 0:
                status_success = 0
                status_failed = 0

                for k, v in map_statistics_pair.iteritems():
                    if k.startswith('success'):
                        status_success += v
                    elif k.startswith('fail'):
                        status_failed += v

                LOG.info("Processed {0:,} reads, {1:,} successful, {2:,} failed".format(total, status_success, status_failed))

            alignment = sam_file.next()
            alignment_new = pysam.AlignedRead()
            read_chr = sam_file.getrname(alignment.tid)

            # READ ONLY

            # aend                  aligned reference position of the read on the reference genome
            # alen                  aligned length of the read on the reference genome.
            # positions             a list of reference positions that this read aligns to
            # qend                  end index of the aligned query portion of the sequence (0-based, exclusive)
            # qlen                  Length of the aligned query sequence
            # qqual                 aligned query sequence quality values
            # qstart                start index of the aligned query portion of the sequence (0-based, inclusive)
            # query                 aligned portion of the read and excludes any flanking bases that were soft clipped
            # rlen                  length of the read

            # TRUE / FALSE (setting effects flag)

            # is_paired             true if read is paired in sequencing
            # is_proper_pair        true if read is mapped in a proper pair
            # is_qcfail             true if QC failure
            # is_read1              true if this is read1
            # is_read2              true if this is read2
            # is_reverse            true if read is mapped to reverse strand
            # is_secondary          true if not primary alignment
            # is_unmapped           true if read itself is unmapped
            # mate_is_reverse       true if mate is mapped to reverse strand
            # mate_is_unmapped      true if the mate is unmapped

            # SET

            # cigar                 cigar as list of tuples
            # cigarstring           alignment as a string
            # flag                  properties flag
            # mapq                  mapping quality
            # pnext                 the position of the mate
            # pos                   0-based leftmost coordinate
            # qname                 the query name
            # rnext                 the reference id of the mate
            # seq                   read sequence bases, including soft clipped bases
            # tid                   target id, contains the index of the reference sequence in the sequence dictionary

            # DON'T NEED TO SET or SHOULD WE SET?

            # qual                  read sequence base qualities, including soft clipped bases
            # tags                  the tags in the AUX field
            # tlen                  insert size

            total += 1

            LOG.debug('~'*80)
            LOG.debug("Converting {0} {1} {2} {3}".format(alignment.qname, read_chr, alignment.pos, alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
                continue

            if alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
                continue

            if not alignment.is_paired:
                LOG.debug("SINGLE END ALIGNMENT")
                map_statistics['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_NONE
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                read_start = alignment.pos
                read_end = alignment.aend
                read_strand = '-' if alignment.is_reverse else '+'

                mappings = chain_file.find_mappings(read_chr, read_start, read_end)

                # unmapped
                if mappings is None:
                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics['fail_cannot_map'] += 1

                elif len(mappings) == 1:
                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_simple'] += 1

                else:
                    LOG.debug("MAPPINGS: {0}".format(len(mappings)))
                    for m in mappings:
                        LOG.debug("> {0}".format(m))

                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read_strand, alignment.pos)
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_complex'] += 1

            else:
                LOG.debug("PAIRED END ALIGNMENT")
                map_statistics_pair['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_PAIRED
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                if alignment.is_read1:
                    alignment_new.flag |= FLAG_READ1
                if alignment.is_read2:
                    alignment_new.flag |= FLAG_READ2

                if alignment.is_reverse:
                    alignment_new.flag |= FLAG_REVERSE
                if alignment.mate_is_reverse:
                    alignment_new.flag |= FLAG_MREVERSE

                read1_chr = sam_file.getrname(alignment.tid)
                read1_start = alignment.pos
                read1_end = alignment.aend
                read1_strand = '-' if alignment.is_reverse else '+'
                read1_mappings = chain_file.find_mappings(read1_chr, read1_start, read1_end) #, read1_strand)

                read2_chr = None
                read2_start = None
                read2_end = None
                read2_strand = None
                read2_mappings = None

                if alignment.mate_is_unmapped:
                    alignment_new.flag |= FLAG_MUNMAP
                else:
                    read2_chr = sam_file.getrname(alignment.rnext)
                    read2_start = alignment.pnext
                    read2_end = read2_start + 1
                    read2_strand = '-' if alignment.mate_is_reverse else '+'
                    try:
                        read2_mappings = chain_file.find_mappings(read2_chr, read2_start, read2_end)
                    except:
                        read2_mappings = None

                if read1_mappings is None and read2_mappings is None:

                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.flag |= FLAG_MUNMAP

                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics_pair['fail_cannot_map'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) == 1:

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_simple'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) > 1:

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings is None:

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:simple,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) == 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:simple,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) > 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:simple,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings is None:

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:complex,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) == 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:complex,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) > 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0    # CHECK

                    LOG.debug("\tPair Success (1:complex,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_complex'] += 1

                else:
                    raise G2GBAMError("Unknown BAM/SAM conversion/parse situation")

    except StopIteration:
        LOG.info("All reads processed")

    LOG.info("  {:>10} TOTAL ENTRIES".format(total))
    LOG.info("  {:>10} TOTAL UNMAPPED ".format(total_unmapped))
    LOG.info("  {:>10} TOTAL FAIL QC ".format(total_fail_qc))

    if map_statistics['total'] > 0:
        LOG.info("")
        LOG.info("Mapping Summary Single End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics['total']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL SUCCESS".format(map_statistics['success_simple'] + map_statistics['success_complex']))
        LOG.info("  {:>10} Simple".format(map_statistics['success_simple']))
        LOG.info("  {:>10} Complex".format(map_statistics['success_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(map_statistics['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map ".format(map_statistics['fail_cannot_map']))

    if map_statistics_pair['total'] > 0:
        total_success = 0
        for k, v in map_statistics_pair.iteritems():
            if k.startswith('success'):
                total_success += v

        LOG.info("")
        LOG.info("Mapping Summary Paired End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL SUCCESS".format(total_success))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Simple".format(map_statistics_pair['success_1_fail_2_simple']))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Complex".format(map_statistics_pair['success_1_fail_2_complex']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Failed".format(map_statistics_pair['success_1_simple_2_fail']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Simple".format(map_statistics_pair['success_1_simple_2_simple']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Complex".format(map_statistics_pair['success_1_simple_2_complex']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Failed".format(map_statistics_pair['success_1_complex_2_fail']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Simple".format(map_statistics_pair['success_1_complex_2_simple']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Complex".format(map_statistics_pair['success_1_complex_2_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(map_statistics_pair['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map".format(map_statistics_pair['fail_cannot_map']))
        LOG.info("")

    LOG.info("BAM File Converted")
Example #8
def db2chain(chain_file, input_file, output_file, chain_genes=False):
    """

    :param chain_file:
    :param input_file:
    :param output_file:
    :param chain_genes:
    :return:
    """
    start = time.time()

    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))

    if chain_genes:
        LOG.info("CHAIN TYPE: GENES")
    else:
        LOG.info("CHAIN TYPE: TRANSCRIPTS")

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file)
        LOG.info("Chain file parsed")

    LOG.info('Creating new chain file...')

    if chain_genes:
        LOG.debug("Generating chain for genes")

        for chromosome in chain_file.get_seqids():
            LOG.debug("Generating chain for genes in chromosome {0}".format(chromosome))

            for i, gene in enumerate(get_genes_simple(input_file, location=Location(chromosome))):
                LOG.debug("\n{0}".format(gene))
                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None

                mappings = chain_file.find_mappings(gene.seqid, gene.start, gene.end)

                if gene.strand == 1:
                    if mappings and len(mappings) > 0:
                        if not from_start:
                            from_start = mappings[0].from_start
                            to_start = mappings[0].to_start

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()

                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            for m in mappings:
                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    prev_dt = dt
                                    prev_dq = dq
                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    dt = m.from_start - prev_mapping.from_end
                                    dq = m.to_start - prev_mapping.to_end

                                    if dt > 0:
                                        chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])
                                    prev_mapping = m

                            chain_size = mappings[-1].from_end - mappings[-1].from_start
                            if dt > 0:
                                chain_size += dq
                                sum_size += dq

                            c.lines.append([chain_size])
                            chain_entries.append(c)
                else:
                    if mappings and len(mappings) > 0:
                        if not from_end:
                            from_end = mappings[-1].from_end
                            to_end = mappings[-1].to_end

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()

                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            # reverse
                            mappings = mappings[::-1]

                            for m in mappings:
                                LOG.debug("CURRENT MAPPING: {0}".format(m))
                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    LOG.debug("PREV MAPPING: {0}".format(prev_mapping))
                                    prev_dt = dt
                                    prev_dq = dq
                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    #dt = m.from_start - prev_mapping.from_end
                                    #dq = m.to_start - prev_mapping.to_end
                                    dt = prev_mapping.from_start - m.from_end
                                    dq = prev_mapping.to_start - m.to_end
                                    LOG.debug("dt={0}, dq={1}".format(dt, dq))

                                    #if dt > 0:
                                    #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                    #    chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])
                                    prev_mapping = m

                            LOG.debug("finding last...{0}".format(mappings[-1]))
                            chain_size = mappings[-1].from_end - mappings[-1].from_start
                            #if dt > 0:
                            #    LOG.debug("WHOA {0}".format(dt))
                            #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                            #    chain_size += dq
                            #    sum_size += dq

                            c.lines.append([chain_size])
                            LOG.debug(c.lines[-1])
                            chain_entries.append(c)

                if chain_entries and len(chain_entries) > 0:
                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(CHAIN_STRING.format(CHAIN_STRING,
                                from_chr=gene.seqid, from_length=sum_size + sum_dq,
                                from_start=0, from_end=sum_size + sum_dq,
                                to_chr=gene.seqid, to_length=sum_size + sum_dt,
                                to_start=0, to_end=sum_size + sum_dt, id=gene.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")

                    outf.close()
    else:
        for chromosome in chain_file.get_seqids():
            LOG.debug("Generating chain for transcripts in chromosome {0}".format(chromosome))

            for i, transcript in enumerate(get_transcripts_simple(input_file, location=Location(chromosome))):
                LOG.debug("Transcript = {0}".format(transcript))
                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None
                transcript.exons = OrderedDict(sorted(transcript.exons.items(), key=lambda x: x[1].exon_number))

                for ensembl_id, exon in transcript.exons.iteritems():
                    LOG.debug("Exon = {0}".format(exon))

                    mappings = chain_file.find_mappings(exon.seqid, exon.start, exon.end)

                    if exon.strand == 1:
                        if mappings and len(mappings) > 0:
                            if not from_start:
                                from_start = mappings[0].from_start
                                to_start = mappings[0].to_start

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()

                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                for m in mappings:
                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        prev_dt = dt
                                        prev_dq = dq
                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        dt = m.from_start - prev_mapping.from_end
                                        dq = m.to_start - prev_mapping.to_end

                                        if dt > 0:
                                            chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])
                                        prev_mapping = m

                                chain_size = mappings[-1].from_end - mappings[-1].from_start
                                if dt > 0:
                                    chain_size += dq
                                    sum_size += dq

                                c.lines.append([chain_size])
                                chain_entries.append(c)
                    else:
                        if mappings and len(mappings) > 0:
                            if not from_end:
                                from_end = mappings[-1].from_end
                                to_end = mappings[-1].to_end

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()

                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                # reverse
                                mappings = mappings[::-1]

                                for m in mappings:
                                    LOG.debug("CURRENT MAPPING: {0}".format(m))
                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        LOG.debug("PREV MAPPING: {0}".format(prev_mapping))
                                        prev_dt = dt
                                        prev_dq = dq
                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        #dt = m.from_start - prev_mapping.from_end
                                        #dq = m.to_start - prev_mapping.to_end
                                        dt = prev_mapping.from_start - m.from_end
                                        dq = prev_mapping.to_start - m.to_end
                                        LOG.debug("dt={0}, dq={1}".format(dt, dq))

                                        #if dt > 0:
                                        #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                        #    chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])
                                        prev_mapping = m

                                LOG.debug("finding last...{0}".format(mappings[-1]))
                                chain_size = mappings[-1].from_end - mappings[-1].from_start
                                #if dt > 0:
                                #    LOG.debug("WHOA {0}".format(dt))
                                #    chain_size += dq
                                #    sum_size += dq

                                c.lines.append([chain_size])
                                LOG.debug(c.lines[-1])
                                chain_entries.append(c)

                # collapse exons
                if chain_entries and len(chain_entries) > 0:
                    LOG.debug('>>>>>>>')
                    for c in chain_entries:
                        LOG.debug(str(c))
                    LOG.debug('>>>>>>>')
                    chain_entries = collapse_entries(chain_entries)
                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(CHAIN_STRING.format(CHAIN_STRING,
                                from_chr=transcript.seqid, from_length=sum_size + sum_dq,
                                from_start=0, from_end=sum_size + sum_dq,
                                to_chr=transcript.seqid, to_length=sum_size + sum_dt,
                                to_start=0, to_end=sum_size + sum_dt, id=transcript.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")

                    outf.close()

    LOG.info('New chain file created')

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #9
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    :param input_file: the GTF file to convert
    :param output_file: The generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)
    c = conn.cursor()

    LOG.debug("Generating tables")
    c.execute(SQL_CREATE_GTF_TABLE)
    c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
    c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
    c.execute(SQL_CREATE_GTF_TYPES_TABLE)
    c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

    gtf_types = {}
    gtf_sources = {}
    gtf_attributes = {}

    LOG.info("Parsing GTF file...")

    gtf_file = GTF(input_file)

    counter = 0

    for record in gtf_file:
        if counter and counter % 100000 == 0:
            LOG.info("Processed {0:,} records".format(counter))

        if record.type not in gtf_types:
            _type_key = len(gtf_types.keys())
            gtf_types[record.type] = _type_key
        else:
            _type_key = gtf_types[record.type]

        if record.source not in gtf_sources:
            _source_key = len(gtf_sources.keys())
            gtf_sources[record.source] = _source_key
        else:
            _source_key = gtf_sources[record.source]

        strand = 0
        if record.strand in ['+', '-']:
            strand = 1 if record.strand == '+' else -1

        gene_id = record.attributes['gene_id']
        transcript_id = record.attributes[
            'transcript_id'] if 'transcript_id' in record.attributes else None
        ensembl_id = None

        if record.type == 'gene':
            ensembl_id = record.attributes['gene_id']
        elif record.type == 'transcript':
            ensembl_id = record.attributes['transcript_id']
        elif record.type == 'exon':
            ensembl_id = record.attributes['exon_id']
        else:
            ensembl_id = record.attributes[
                'protein_id'] if 'protein_id' in record.attributes else None

        c.execute(SQL_INSERT_GTF_TABLE,
                  (gene_id, transcript_id, ensembl_id, record.seqid,
                   record.start, record.end, strand, record.score, _source_key,
                   _type_key, record.frame))
        gtf_key = c.lastrowid

        for attribute, value in record.attributes.iteritems():
            if attribute not in ['gene_id', 'transcript_id', 'exon_id']:
                if attribute not in gtf_attributes:
                    _attribute_key = len(gtf_attributes.keys())
                    gtf_attributes[attribute] = _attribute_key
                else:
                    _attribute_key = gtf_attributes[attribute]

                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE,
                          (gtf_key, _attribute_key, value))

        counter += 1

    # save (commit) the changes
    conn.commit()

    for source, _key in gtf_sources.iteritems():
        c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))
        conn.commit()

    for type, _key in gtf_types.iteritems():
        c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type))
        conn.commit()

    for attribute, _key in gtf_attributes.iteritems():
        c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))
        conn.commit()

    LOG.info("GTF File parsed")

    LOG.info("Finalizing database...")

    for sql in SQL_INDICES_GTF:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_LOOKUP:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_TYPES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_SOURCES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_ATTRIBUTES:
        LOG.debug(sql)
        c.execute(sql)

    LOG.info("Database created")

    # close connection
    conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #13
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables

    :param filename_fasta:
    :param filename_vcf:
    :param strain:
    :param filename_output:
    :param bgzip:
    :param diploid:
    :return:
    """

    filename_output = g2g_fu.check_file(filename_output, "w")
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith(".gz"):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    if not new_filename_output.lower().endswith(".fa"):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, "l")
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, "r")

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith(".fa.gz"):
        # decompress the fasta file if it is compressed

        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)  # something.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]  # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)

    elif filename_fasta.lower().endswith(".fa"):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index
    pysam.FastaFile(filename_output_l)

    return filename_output_l, filename_output_r
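The diploid branch derives its left/right output names via g2g_fu.prepend_before_extension. A self-contained sketch of what that naming appears to do (a hypothetical re-implementation; the real helper may differ):

import os

def prepend_before_extension(filename, tag):
    # hypothetical re-implementation: turn 'out.fa' + 'l' into 'out.l.fa'
    root, ext = os.path.splitext(filename)
    return "{0}.{1}{2}".format(root, tag, ext)

print(prepend_before_extension("strain.fa", "l"))  # strain.l.fa
print(prepend_before_extension("strain.fa", "r"))  # strain.r.fa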
Example #14
def fasta_patch(
    filename_fasta,
    filename_vcf,
    strain,
    filename_output,
    bgzip=False,
    num_processes=None,
    pass_only=False,
    quality=False,
    diploid=False,
):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in the VCF file.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of strain to use in VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: Only process those VCF records with a 'PASS'
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: don't ignore hets and create 2 files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))
    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(
            filename_fasta,
            filename_vcf,
            strain,
            filename_output_l,
            filename_output_r,
            num_processes,
            pass_only,
            quality,
            diploid,
        )

        LOG.info("Patching complete")

        # remove the fai
        LOG.debug("removing the FAI index for {0}".format(g2g_fu.delete_index_files(filename_output_l)))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), "fa")
            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), "fa")

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        raise G2GError(str(e))
Example #15
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables

    :param filename_fasta:
    :param filename_vcf:
    :param strain:
    :param filename_output:
    :param bgzip:
    :param diploid:
    :return:
    """

    filename_output = g2g_fu.check_file(filename_output, 'w')
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith('.gz'):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    if not new_filename_output.lower().endswith('.fa'):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, 'l')
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, 'r')

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith('.fa.gz'):
        # decompress the fasta file if it is compressed

        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)                        # something.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)                # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]         # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)

    elif filename_fasta.lower().endswith('.fa'):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index
    pysam.FastaFile(filename_output_l)

    return filename_output_l, filename_output_r
Example #16
def fasta_extract_exons(fasta_file, database_file, output, raw=False):
    start = time.time()

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta_file = g2g_fu.check_file(fasta_file)
        fasta = FastaFile(fasta_file)

    database_file = g2g_fu.check_file(database_file)

    fasta_out = sys.stdout

    if output:
        output = g2g_fu.check_file(output, 'w')
        fasta_out = open(output, "w")

    LOG.info("FASTA FILE: {0}".format(fasta.filename))
    LOG.info("DATABASE FILE: {0}".format(database_file))
    LOG.info("OUTPUT FILE: {0}".format(fasta_out.name))

    try:
        transcripts = get_transcripts_simple(database_file)
        for i, transcript in enumerate(transcripts):

            if transcript.seqid not in fasta.references:
                continue

            for ensembl_id, exon in transcript.exons.iteritems():
                LOG.debug("Exon={0}".format(exon))

                partial_seq = fasta.fetch(exon.seqid, exon.start - 1, exon.end)
                partial_seq_str = partial_seq

                if transcript.strand == -1:
                    partial_seq_str = str(
                        reverse_complement_sequence(partial_seq))

                LOG.debug("{0}:{1}-{2} (Length: {3})\n{4}".format(
                    exon.seqid, exon.start, exon.end, len(partial_seq),
                    partial_seq_str))

                if raw:
                    fasta_out.write(partial_seq_str)
                else:
                    fasta_id = ">{0} {1}:{2}-{3}\n".format(
                        exon.ensembl_id, exon.seqid, exon.start, exon.end)
                    fasta_out.write(fasta_id)

                    for line in wrap_sequence(partial_seq_str):
                        fasta_out.write(line.strip())
                        fasta_out.write('\n')

    except G2GValueError as e:
        LOG.info(e.msg.rstrip())
        raise e
    except G2GFastaError as e:
        LOG.info(e.msg.rstrip())
        raise e

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #17
def convert_gtf_file(chain_file, input_file, output_file, reverse=False):
    """
    Convert GTF coordinates.

    The mappings of coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file:
    :type chain_file: :class:`.chain.ChainFile`
    :param input_file: the input GTF file
    :type input_file: string
    :param output_file: the output GTF file
    :type output_file: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :return:
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    gtf_out = open(output_file_name, "w")
    gtf_unmapped_file = open(unmapped_file_name, "w")

    LOG.info("Converting GTF file...")

    gtf_file = GTF(input_file)

    total = 0
    success = 0
    fail = 0

    # GTF is 1 based, bx-python is 0 based
    # when we do the querying, we subtract 1 from the GTF file start position
    # K.B.  Also note in gtf when (s, e) is given...it should mean s <= x <= e.
    #       bx-python (s, e) does it s <= x < e.

    for record in gtf_file:

        LOG.debug("\nORIGINAL: {0}".format(str(gtf_file.current_line).strip()))

        total += 1

        if total % 100000 == 0:
            LOG.info("Processed {0:,} lines".format(total))

        mappings = chain_file.find_mappings(record.seqid, record.start - 1,
                                            record.end)

        # unmapped
        if mappings is None:
            LOG.debug("\tFail due to no mappings")
            gtf_unmapped_file.write(gtf_file.current_line)
            fail += 1
            continue
        else:
            LOG.debug("{0} mappings found".format(len(mappings)))

        success += 1
        start = mappings[0].to_start + 1
        end = mappings[-1].to_end

        LOG.debug("({0}, {1}) => ({2}, {3})".format(record.start - 1,
                                                    record.end, start, end))

        elems = gtf_file.current_line.rstrip().split('\t')
        elems[3] = start
        elems[4] = end

        LOG.debug("     NEW: {0}".format("\t".join(map(str, elems))))

        gtf_out.write("\t".join(map(str, elems)))
        gtf_out.write("\n")

    gtf_out.close()
    gtf_unmapped_file.close()

    LOG.info("Converted {0:,} of {1:,} records".format(success, total))
    LOG.info('GTF file converted')
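The off-by-one handling above is easy to get wrong: GTF is 1-based with inclusive ends, while bx-python intervals are 0-based and half-open. A worked example of the arithmetic used in the loop (coordinates are hypothetical):

# GTF feature chr1:100-200 (1-based, inclusive) covers 101 bases.
record_start, record_end = 100, 200

# Query the chain with 0-based half-open coordinates, as the loop does:
query_start, query_end = record_start - 1, record_end    # (99, 200)

# A mapping's to_start comes back 0-based, so +1 restores GTF coordinates:
to_start, to_end = 1099, 1200    # hypothetical mapped interval
new_start, new_end = to_start + 1, to_end                # (1100, 1200)

assert query_end - query_start == 101
assert new_end - new_start + 1 == 101    # same 1-based inclusive length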
Example #18
def db2chain(chain_file, input_file, output_file, chain_genes=False):
    """

    :param chain_file:
    :param input_file:
    :param output_file:
    :param chain_genes:
    :return:
    """
    start = time.time()

    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))

    if chain_genes:
        LOG.info("CHAIN TYPE: GENES")
    else:
        LOG.info("CHAIN TYPE: TRANSCRIPTS")

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file)
        LOG.info("Chain file parsed")

    LOG.info('Creating new chain file...')

    if chain_genes:
        LOG.debug("Generating chain for genes")

        for chromosome in chain_file.get_seqids():
            LOG.debug("Generating chain for genes in chromosome {0}".format(
                chromosome))

            for i, gene in enumerate(
                    get_genes_simple(input_file,
                                     location=Location(chromosome))):
                LOG.debug("\n{0}".format(gene))
                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None

                mappings = chain_file.find_mappings(gene.seqid, gene.start,
                                                    gene.end)

                if gene.strand == 1:
                    if mappings and len(mappings) > 0:
                        if not from_start:
                            from_start = mappings[0].from_start
                            to_start = mappings[0].to_start

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()

                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            for m in mappings:
                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    prev_dt = dt
                                    prev_dq = dq
                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    dt = m.from_start - prev_mapping.from_end
                                    dq = m.to_start - prev_mapping.to_end

                                    if dt > 0:
                                        chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])
                                    prev_mapping = m

                            chain_size = mappings[-1].from_end - mappings[
                                -1].from_start
                            if dt > 0:
                                chain_size += dq
                                sum_size += dq

                            c.lines.append([chain_size])
                            chain_entries.append(c)
                else:
                    if mappings and len(mappings) > 0:
                        if not from_end:
                            from_end = mappings[-1].from_end
                            to_end = mappings[-1].to_end

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()

                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            # reverse
                            mappings = mappings[::-1]

                            for m in mappings:
                                LOG.debug("CURRENT MAPPING: {0}".format(m))
                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    LOG.debug("PREV MAPPING: {0}".format(
                                        prev_mapping))
                                    prev_dt = dt
                                    prev_dq = dq
                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    #dt = m.from_start - prev_mapping.from_end
                                    #dq = m.to_start - prev_mapping.to_end
                                    dt = prev_mapping.from_start - m.from_end
                                    dq = prev_mapping.to_start - m.to_end
                                    LOG.debug("dt={0}, dq={1}".format(dt, dq))

                                    #if dt > 0:
                                    #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                    #    chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])
                                    prev_mapping = m

                            LOG.debug("finding last...{0}".format(
                                mappings[-1]))
                            chain_size = mappings[-1].from_end - mappings[
                                -1].from_start
                            #if dt > 0:
                            #    LOG.debug("WHOA {0}".format(dt))
                            #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                            #    chain_size += dq
                            #    sum_size += dq

                            c.lines.append([chain_size])
                            LOG.debug(c.lines[-1])
                            chain_entries.append(c)

                if chain_entries and len(chain_entries) > 0:
                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(
                        CHAIN_STRING.format(from_chr=gene.seqid,
                                            from_length=sum_size + sum_dq,
                                            from_start=0,
                                            from_end=sum_size + sum_dq,
                                            to_chr=gene.seqid,
                                            to_length=sum_size + sum_dt,
                                            to_start=0,
                                            to_end=sum_size + sum_dt,
                                            id=gene.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")

                    outf.close()
    else:
        for chromosome in chain_file.get_seqids():
            LOG.debug(
                "Generating chain for transcripts in chromosome {0}".format(
                    chromosome))

            for i, transcript in enumerate(
                    get_transcripts_simple(input_file,
                                           location=Location(chromosome))):
                LOG.debug("Transcript = {0}".format(transcript))
                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None
                transcript.exons = OrderedDict(
                    sorted(transcript.exons.items(),
                           key=lambda x: x[1].exon_number))

                for ensembl_id, exon in transcript.exons.iteritems():
                    LOG.debug("Exon = {0}".format(exon))

                    mappings = chain_file.find_mappings(
                        exon.seqid, exon.start, exon.end)

                    if exon.strand == 1:
                        if mappings and len(mappings) > 0:
                            if not from_start:
                                from_start = mappings[0].from_start
                                to_start = mappings[0].to_start

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()

                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                for m in mappings:
                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        prev_dt = dt
                                        prev_dq = dq
                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        dt = m.from_start - prev_mapping.from_end
                                        dq = m.to_start - prev_mapping.to_end

                                        if dt > 0:
                                            chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])
                                        prev_mapping = m

                                chain_size = mappings[-1].from_end - mappings[
                                    -1].from_start
                                if dt > 0:
                                    chain_size += dq
                                    sum_size += dq

                                c.lines.append([chain_size])
                                chain_entries.append(c)
                    else:
                        if mappings and len(mappings) > 0:
                            if not from_end:
                                from_end = mappings[-1].from_end
                                to_end = mappings[-1].to_end

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()

                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                # reverse
                                mappings = mappings[::-1]

                                for m in mappings:
                                    LOG.debug("CURRENT MAPPING: {0}".format(m))
                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        LOG.debug("PREV MAPPING: {0}".format(
                                            prev_mapping))
                                        prev_dt = dt
                                        prev_dq = dq
                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        #dt = m.from_start - prev_mapping.from_end
                                        #dq = m.to_start - prev_mapping.to_end
                                        dt = prev_mapping.from_start - m.from_end
                                        dq = prev_mapping.to_start - m.to_end
                                        LOG.debug("dt={0}, dq={1}".format(
                                            dt, dq))

                                        #if dt > 0:
                                        #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                        #    chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])
                                        prev_mapping = m

                                LOG.debug("finding last...{0}".format(
                                    mappings[-1]))
                                chain_size = mappings[-1].from_end - mappings[
                                    -1].from_start
                                #if dt > 0:
                                #    LOG.debug("WHOA {0}".format(dt))
                                #    chain_size += dq
                                #    sum_size += dq

                                c.lines.append([chain_size])
                                LOG.debug(c.lines[-1])
                                chain_entries.append(c)

                # collapse exons
                if chain_entries and len(chain_entries) > 0:
                    LOG.debug('>>>>>>>')
                    for c in chain_entries:
                        LOG.debug(str(c))
                    LOG.debug('>>>>>>>')
                    chain_entries = collapse_entries(chain_entries)
                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(
                        CHAIN_STRING.format(from_chr=transcript.seqid,
                                            from_length=sum_size + sum_dq,
                                            from_start=0,
                                            from_end=sum_size + sum_dq,
                                            to_chr=transcript.seqid,
                                            to_length=sum_size + sum_dt,
                                            to_start=0,
                                            to_end=sum_size + sum_dt,
                                            id=transcript.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")

                    outf.close()

    LOG.info('New chain file created')

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #19
def fasta_transform(fasta_file,
                    chain_file,
                    locations,
                    output_file,
                    bgzip=False,
                    reverse=False):
    """

    :param fasta_file:
    :param chain_file:
    :param locations:
    :param output_file:
    :param bgzip:
    :param reverse:
    :return:
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [
            parse_location(
                "{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1)
            for a in fasta.references
        ]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            if len(line) > 7:
                LOG.debug("Adding chromosome {0}".format(
                    chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {
                    'from_size': line[2],
                    'from_start': line[4],
                    'from_end': line[5],
                    'to_size': line[7],
                    'to_start': line[9],
                    'to_end': line[10],
                    'header_chain': chain_file.current_chain_header,
                    'lines': []
                }
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(
                    line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no,
                                                     chain_line))

                if len(chain_line) == 1:
                    # last line
                    fragment = chain_line[0]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn(
                            "Lengths do not match, chromosome length in chain: {0}, sequence length: {1}"
                            .format(chrom_size_to,
                                    len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                        location.seqid, location.seqid,
                        chr_info[location.seqid]['from_start'] + 1,
                        chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break

                else:

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases

                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(
                                partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug(
                        "LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}"
                        .format(last_pos, insertion_bases, deletion_bases,
                                (insertion_bases - deletion_bases)))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
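A self-contained sketch of how the chain lines drive the sequence rewrite: copy fragment bases, write dq_bases on an insertion, and advance the source cursor past dt bases on a deletion (toy data, not the module's real chain parser):

source = "AAAAACCCCCGGGGGTTTTT"

# (fragment, dt, dq, dq_bases): copy, then delete dt bases or insert dq_bases
chain_lines = [
    (5, 0, 2, "NN"),   # copy 5 bases, then insert 'NN'
    (5, 3, 0, ""),     # copy 5 bases, then skip 3 bases of the source
    (7, 0, 0, ""),     # final fragment: copy the remaining 7 bases
]

last_pos = 0
out = []
for fragment, dt, dq, dq_bases in chain_lines:
    out.append(source[last_pos:last_pos + fragment])
    if dq > 0:
        out.append(dq_bases)   # insertion relative to the source
    last_pos += fragment + dt  # a deletion just advances the source cursor

print("".join(out))  # AAAAANNCCCCCGGTTTTT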
Example #20
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    :param input_file: the GTF file to convert
    :param output_file: The generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB File: {0}".format(output_file))

    conn = sqlite3.connect(output_file)
    c = conn.cursor()

    LOG.debug("Generating tables")
    c.execute(SQL_CREATE_GTF_TABLE)
    c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
    c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
    c.execute(SQL_CREATE_GTF_TYPES_TABLE)
    c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

    gtf_types = {}
    gtf_sources = {}
    gtf_attributes = {}

    LOG.info("Parsing GTF file...")

    gtf_file = GTF(input_file)

    counter = 0

    for record in gtf_file:
        if counter and counter % 100000 == 0:
            LOG.info("Processed {0:,} records".format(counter))

        if record.type not in gtf_types:
            _type_key = len(gtf_types.keys())
            gtf_types[record.type] = _type_key
        else:
            _type_key = gtf_types[record.type]

        if record.source not in gtf_sources:
            _source_key = len(gtf_sources.keys())
            gtf_sources[record.source] = _source_key
        else:
            _source_key = gtf_sources[record.source]

        strand = 0
        if record.strand in ['+', '-']:
            strand = 1 if record.strand == '+' else -1

        gene_id = record.attributes['gene_id']
        transcript_id = record.attributes['transcript_id'] if 'transcript_id' in record.attributes else None
        ensembl_id = None

        if record.type == 'gene':
            ensembl_id = record.attributes['gene_id']
        elif record.type == 'transcript':
            ensembl_id = record.attributes['transcript_id']
        elif record.type == 'exon':
            ensembl_id = record.attributes['exon_id']
        else:
            ensembl_id = record.attributes['protein_id'] if 'protein_id' in record.attributes else None

        c.execute(SQL_INSERT_GTF_TABLE, (gene_id, transcript_id, ensembl_id, record.seqid, record.start, record.end, strand, record.score, _source_key, _type_key, record.frame))
        gtf_key = c.lastrowid

        for attribute, value in record.attributes.iteritems():
            if attribute not in ['gene_id', 'transcript_id', 'exon_id']:
                if attribute not in gtf_attributes:
                    _attribute_key = len(gtf_attributes.keys())
                    gtf_attributes[attribute] = _attribute_key
                else:
                    _attribute_key = gtf_attributes[attribute]

                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE, (gtf_key, _attribute_key, value))

        counter += 1

    # save (commit) the changes
    conn.commit()

    for source, _key in gtf_sources.iteritems():
        c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))
        conn.commit()

    for type, _key in gtf_types.iteritems():
        c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type))
        conn.commit()

    for attribute, _key in gtf_attributes.iteritems():
        c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))
        conn.commit()

    LOG.info("GTF File parsed")

    LOG.info("Finalizing database...")

    for sql in SQL_INDICES_GTF:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_LOOKUP:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_TYPES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_SOURCES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_ATTRIBUTES:
        LOG.debug(sql)
        c.execute(sql)

    LOG.info("Database created")

    # close connection
    conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Example #21
def fasta_patch(filename_fasta, filename_vcf, strain, filename_output, bgzip=False,
                num_processes=None, pass_only=False, quality=False, diploid=False):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in the VCF file.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of strain to use in VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: Only process those VCF records with a 'PASS'
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: don't ignore hets and create 2 files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))
    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(filename_fasta, filename_vcf, strain, filename_output_l, filename_output_r,
              num_processes, pass_only, quality, diploid)

        LOG.info("Patching complete")

        # remove the fai
        LOG.debug("removing the FAI index for {0}".format(g2g_fu.delete_index_files(filename_output_l)))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), 'fa')
            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), 'fa')

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        raise G2GError(str(e))
Example #22
def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies.  These coordinates
    are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param file_in: the input SAM or BAM file
    :type file_in: string
    :param file_out: the output SAM or BAM file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    if not isinstance(file_in, pysam.Samfile):
        try:
            sam_file = pysam.Samfile(file_in, 'rb')
            if len(sam_file.header) == 0:
                raise G2GBAMError("BAM File has no header information")
        except:
            sam_file = pysam.Samfile(file_in, 'r')
            if len(sam_file.header) == 0:
                raise G2GBAMError("SAM File has no header information")
    else:
        sam_file = file_in

    LOG.info("Converting BAM file")

    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': 1.0, 'SO': 'coordinate'}

    # replace SQ
    tmp = []
    name_to_id = {}
    id = 0
    for ref_name in sorted(chain_file.chrom_size_to):
        tmp.append({
            'LN': chain_file.chrom_size_from[ref_name],
            'SN': ref_name
        })
        name_to_id[ref_name] = id
        id += 1

    new_header['SQ'] = tmp

    if 'PG' not in new_header:
        new_header['PG'] = []

    new_header['PG'].append({'ID': 'g2gtools', 'VN': 1.0})

    if 'CO' not in new_header:
        new_header['CO'] = []

    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    dir, temp_file_name = os.path.split(file_out)
    parts = temp_file_name.split('.')
    ext = parts[-1]

    if ext.lower() == 'bam':
        new_file = pysam.Samfile(file_out, 'wb', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name,
                                          'wb',
                                          template=sam_file)
    elif ext.lower() == 'sam':
        new_file = pysam.Samfile(file_out, 'wh', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name,
                                          'wh',
                                          template=sam_file)
    else:
        raise G2GBAMError(
            "Unable to create new file based upon file extension")

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_simple': 0,
        'success_complex': 0
    }

    map_statistics_pair = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_1_fail_2_simple': 0,
        'success_1_fail_2_complex': 0,
        'success_1_simple_2_fail': 0,
        'success_1_simple_2_simple': 0,
        'success_1_simple_2_complex': 0,
        'success_1_complex_2_fail': 0,
        'success_1_complex_2_simple': 0,
        'success_1_complex_2_complex': 0
    }

    try:
        while True:
            if total and total % 10000 == 0:
                status_success = 0
                status_failed = 0

                for k, v in map_statistics_pair.iteritems():
                    if k.startswith('success'):
                        status_success += v
                    elif k.startswith('fail'):
                        status_failed += v

                LOG.info(
                    "Processed {0:,} reads, {1:,} successful, {2:,} failed".
                    format(total, status_success, status_failed))

            alignment = sam_file.next()
            alignment_new = pysam.AlignedRead()
            read_chr = sam_file.getrname(alignment.tid)

            # READ ONLY

            # aend                  aligned reference position of the read on the reference genome
            # alen                  aligned length of the read on the reference genome.
            # positions             a list of reference positions that this read aligns to
            # qend                  end index of the aligned query portion of the sequence (0-based, exclusive)
            # qlen                  Length of the aligned query sequence
            # qqual                 aligned query sequence quality values
            # qstart                start index of the aligned query portion of the sequence (0-based, inclusive)
            # query                 aligned portion of the read and excludes any flanking bases that were soft clipped
            # rlen                  length of the read

            # TRUE / FALSE (setting effects flag)

            # is_paired             true if read is paired in sequencing
            # is_proper_pair        true if read is mapped in a proper pair
            # is_qcfail             true if QC failure
            # is_read1              true if this is read1
            # is_read2              true if this is read2
            # is_reverse            true if read is mapped to reverse strand
            # is_secondary          true if not primary alignment
            # is_unmapped           true if read itself is unmapped
            # mate_is_reverse       true if mate is mapped to reverse strand
            # mate_is_unmapped      true if the mate is unmapped

            # SET

            # cigar                 cigar as list of tuples
            # cigarstring           alignment as a string
            # flag                  properties flag
            # mapq                  mapping quality
            # pnext                 the position of the mate
            # pos                   0-based leftmost coordinate
            # qname                 the query name
            # rnext                 the reference id of the mate
            # seq                   read sequence bases, including soft clipped bases
            # tid                   target id, contains the index of the reference sequence in the sequence dictionary

            # DON'T NEED TO SET or SHOULD WE SET?

            # qual                  read sequence base qualities, including soft clipped bases
            # tags                  the tags in the AUX field
            # tlen                  insert size
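
            # For reference: the FLAG_* constants used below presumably
            # mirror the standard SAM flag bits from the SAM specification:
            #
            #   0x1   paired          0x40   read1
            #   0x2   proper pair     0x80   read2
            #   0x4   unmapped        0x100  secondary
            #   0x8   mate unmapped   0x200  QC fail
            #   0x10  reverse strand  0x400  duplicate
            #   0x20  mate reverse strand
            #
            # e.g. a reversed first-in-pair read: 0x1 | 0x10 | 0x40 = 81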

            total += 1

            LOG.debug('~' * 80)
            LOG.debug("Converting {0} {1} {2} {3}".format(
                alignment.qname, read_chr, alignment.pos,
                alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
                continue

            if alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
                continue

            if not alignment.is_paired:
                LOG.debug("SINGLE END ALIGNMENT")
                map_statistics['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_NONE
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                read_start = alignment.pos
                read_end = alignment.aend
                read_strand = '-' if alignment.is_reverse else '+'

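                # find_mappings returns None when the region cannot be
                # lifted over, otherwise a list of mapped blocks: one block
                # means a simple coordinate shift, several mean the read
                # straddles a chain gap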
                mappings = chain_file.find_mappings(read_chr, read_start,
                                                    read_end)

                # unmapped
                if mappings is None:
                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics['fail_cannot_map'] += 1

                elif len(mappings) == 1:
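                    # a single block: only the position shifts, the CIGAR
                    # carries over unchanged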
                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (simple): {0} {1}".format(
                        alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_simple'] += 1

                else:
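                    # several blocks: the read crosses block boundaries, so
                    # the CIGAR is rewritten by convert_cigar()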
                    LOG.debug("MAPPINGS: {0}".format(len(mappings)))
                    for m in mappings:
                        LOG.debug("> {0}".format(m))

                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read_strand, alignment.pos)
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (complex): {0} {1}".format(
                        alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_complex'] += 1

            else:
                LOG.debug("PAIRED END ALIGNMENT")
                map_statistics_pair['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_PAIRED
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                if alignment.is_read1:
                    alignment_new.flag |= FLAG_READ1
                if alignment.is_read2:
                    alignment_new.flag |= FLAG_READ2

                if alignment.is_reverse:
                    alignment_new.flag |= FLAG_REVERSE
                if alignment.mate_is_reverse:
                    alignment_new.flag |= FLAG_MREVERSE

                read1_chr = sam_file.getrname(alignment.tid)
                read1_start = alignment.pos
                read1_end = alignment.aend
                read1_strand = '-' if alignment.is_reverse else '+'
                read1_mappings = chain_file.find_mappings(
                    read1_chr, read1_start, read1_end)  #, read1_strand)

                read2_chr = None
                read2_start = None
                read2_end = None
                read2_strand = None
                read2_mappings = None

                if alignment.mate_is_unmapped:
                    alignment_new.flag |= FLAG_MUNMAP
                else:
                    read2_chr = sam_file.getrname(alignment.rnext)
                    read2_start = alignment.pnext
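                    # only the mate's start position is known from this
                    # record, so probe a 1 bp interval at that coordinate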
                    read2_end = read2_start + 1
                    read2_strand = '-' if alignment.mate_is_reverse else '+'
                    try:
                        read2_mappings = chain_file.find_mappings(
                            read2_chr, read2_start, read2_end)
                    except Exception:
                        read2_mappings = None

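                # nine cases follow: each mate independently maps to nothing
                # (fail), one block (simple), or several blocks (complex)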
                if read1_mappings is None and read2_mappings is None:

                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.flag |= FLAG_MUNMAP

                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics_pair['fail_cannot_map'] += 1

                elif (read1_mappings is None and read2_mappings and
                      len(read2_mappings) == 1):

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug(
                        "\tPair Success (1:fail,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_simple'] += 1

                elif (read1_mappings is None and read2_mappings and
                      len(read2_mappings) > 1):

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug(
                        "\tPair Success (1:fail,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_complex'] += 1

                elif (read1_mappings and len(read1_mappings) == 1 and
                      read2_mappings is None):

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:fail): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_fail'] += 1

                elif (read1_mappings and len(read1_mappings) == 1 and
                      read2_mappings and len(read2_mappings) == 1):

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_simple'] += 1

                elif (read1_mappings and len(read1_mappings) == 1 and
                      read2_mappings and len(read2_mappings) > 1):

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_complex'] += 1

                elif (read1_mappings and len(read1_mappings) > 1 and
                      read2_mappings is None):

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:fail): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_fail'] += 1

                elif (read1_mappings and len(read1_mappings) > 1 and
                      read2_mappings and len(read2_mappings) == 1):

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_simple'] += 1

                elif (read1_mappings and len(read1_mappings) > 1 and
                      read2_mappings and len(read2_mappings) > 1):

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_complex'] += 1

                else:
                    raise G2GBAMError(
                        "Unknown BAM/SAM conversion/parse situation")

    except StopIteration:
        LOG.info("All reads processed")

    LOG.info("  {:>10} TOTAL ENTRIES".format(total))
    LOG.info("  {:>10} TOTAL UNMAPPED ".format(total_unmapped))
    LOG.info("  {:>10} TOTAL FAIL QC ".format(total_fail_qc))

    if map_statistics['total'] > 0:
        LOG.info("")
        LOG.info("Mapping Summary Single End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics['total']))
        LOG.info("")
        LOG.info(
            "  {:>10} TOTAL SUCCESS".format(map_statistics['success_simple'] +
                                            map_statistics['success_complex']))
        LOG.info("  {:>10} Simple".format(map_statistics['success_simple']))
        LOG.info("  {:>10} Complex".format(map_statistics['success_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(
            map_statistics['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map ".format(
            map_statistics['fail_cannot_map']))

    if map_statistics_pair['total'] > 0:
        total_success = 0
        for k, v in map_statistics_pair.iteritems():
            if k.startswith('success'):
                total_success += v

        LOG.info("")
        LOG.info("Mapping Summary Paired End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL SUCCESS".format(total_success))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Simple".format(
            map_statistics_pair['success_1_fail_2_simple']))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Complex".format(
            map_statistics_pair['success_1_fail_2_complex']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Failed".format(
            map_statistics_pair['success_1_simple_2_fail']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Simple".format(
            map_statistics_pair['success_1_simple_2_simple']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Complex".format(
            map_statistics_pair['success_1_simple_2_complex']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Failed".format(
            map_statistics_pair['success_1_complex_2_fail']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Simple".format(
            map_statistics_pair['success_1_complex_2_simple']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Complex".format(
            map_statistics_pair['success_1_complex_2_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(
            map_statistics_pair['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map".format(
            map_statistics_pair['fail_cannot_map']))
        LOG.info("")

    LOG.info("BAM File Converted")
Example #23
def convert_gtf_file(chain_file, input_file, output_file, reverse=False):
    """
    Convert GTF coordinates.

    The mappings of coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file:
    :type chain_file: :class:`.chain.ChainFile`
    :param input_file: the input GTF file
    :type input_file: string
    :param output_file: the output GTF file
    :type output_file: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :return:
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    gtf_out = open(output_file_name, "w")
    gtf_unmapped_file = open(unmapped_file_name, "w")

    LOG.info("Converting GTF file...")

    gtf_file = GTF(input_file)

    total = 0
    success = 0
    fail = 0

    # GTF is 1-based; bx-python is 0-based.
    # When querying, we subtract 1 from the GTF start position.
    # K.B.: note also that a GTF interval (s, e) is inclusive (s <= x <= e),
    #       while a bx-python interval (s, e) is half-open (s <= x < e).
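    # Worked example: a GTF feature at 1-based (start=100, end=250) is
    # queried as the 0-based half-open interval (99, 250); both cover the
    # same 151 bases. The mapped result is shifted back the other way:
    # to_start + 1 becomes the new GTF start, to_end stays as the new end.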

    for record in gtf_file:

        LOG.debug("\nORIGINAL: {0}".format(str(gtf_file.current_line).strip()))

        total += 1

        if total % 100000 == 0:
            LOG.info("Processed {0:,} lines".format(total))

        mappings = chain_file.find_mappings(record.seqid, record.start - 1, record.end)

        # unmapped
        if mappings is None:
            LOG.debug("\tFail due to no mappings")
            gtf_unmapped_file.write(gtf_file.current_line)
            fail += 1
            continue
        else:
            LOG.debug("{0} mappings found".format(len(mappings)))

        success += 1
        start = mappings[0].to_start + 1
        end = mappings[-1].to_end

        LOG.debug("({0}, {1}) => ({2}, {3})".format(record.start - 1, record.end, start, end))

        elems = gtf_file.current_line.rstrip().split('\t')
        elems[3] = start
        elems[4] = end

        LOG.debug("     NEW: {0}".format("\t".join(map(str, elems))))

        gtf_out.write("\t".join(map(str, elems)))
        gtf_out.write("\n")

    gtf_out.close()
    gtf_unmapped_file.close()

    LOG.info("Converted {0:,} of {1:,} records".format(success, total))
    LOG.info("GTF file converted")
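
# A minimal usage sketch (the file names here are hypothetical):
#
#     convert_gtf_file("reference_to_strain.chain",
#                      "genes.gtf",
#                      "genes.strain.gtf",
#                      reverse=False)
#
# Records that cannot be lifted over are written alongside the output as
# "genes.strain.gtf.unmapped".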