def gtf2chain(chain_file, input_file, output_file, chain_genes=False):
    """
    Convert a GTF file into a chain file by way of a temporary database.

    :param chain_file: chain file used for conversion
    :param input_file: the input GTF file
    :param output_file: the output chain file
    :param chain_genes: chain genes instead of transcripts
    :return: Nothing
    """
    start = time.time()

    chain_file = g2g_fu.check_file(chain_file)
    input_file = g2g_fu.check_file(input_file)
    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("FROM CHAIN FILE: {0}".format(chain_file))
    LOG.info("TO CHAIN FILE: {0}".format(output_file))

    temp_db = g2g_fu.gen_file_name("_g2gtempfile", output_file_dir, ".db3")

    gtf2db(input_file, temp_db)
    db2chain(chain_file, temp_db, output_file, chain_genes)

    g2g_fu.delete_file(temp_db)

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
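
# A usage sketch for gtf2chain (file names are hypothetical; assumes this
# module's imports are in scope):
#
#     gtf2chain("ref_to_strain.chain", "genes.gtf", "genes.chain")
#
# The GTF is first staged into a temporary SQLite database via gtf2db(), and
# db2chain() then walks that database to emit per-transcript chains (or
# per-gene chains with chain_genes=True).
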
def fasta_extract_exons(fasta_file, database_file, output, raw=False):
    start = time.time()

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta_file = g2g_fu.check_file(fasta_file)
        fasta = FastaFile(fasta_file)

    database_file = g2g_fu.check_file(database_file)

    fasta_out = sys.stdout

    if output:
        output = g2g_fu.check_file(output, 'w')
        fasta_out = open(output, "w")

    LOG.info("FASTA FILE: {0}".format(fasta.filename))
    LOG.info("DATABASE FILE: {0}".format(database_file))
    LOG.info("OUTPUT FILE: {0}".format(fasta_out.name))

    try:
        transcripts = get_transcripts_simple(database_file)

        for i, transcript in enumerate(transcripts):
            if transcript.seqid not in fasta.references:
                continue

            for ensembl_id, exon in transcript.exons.iteritems():
                LOG.debug("Exon={0}".format(exon))

                partial_seq = fasta.fetch(exon.seqid, exon.start - 1, exon.end)
                partial_seq_str = partial_seq

                if transcript.strand == -1:
                    partial_seq_str = str(reverse_complement_sequence(partial_seq))

                LOG.debug("{0}:{1}-{2} (Length: {3})\n{4}".format(exon.seqid, exon.start, exon.end, len(partial_seq), partial_seq_str))

                if raw:
                    fasta_out.write(partial_seq_str)
                else:
                    fasta_id = ">{0} {1}:{2}-{3}\n".format(exon.ensembl_id, exon.seqid, exon.start, exon.end)
                    fasta_out.write(fasta_id)

                    for line in wrap_sequence(partial_seq_str):
                        fasta_out.write(line.strip())
                        fasta_out.write('\n')

    except G2GValueError as e:
        LOG.info(e.msg.rstrip())
        raise e
    except G2GFastaError as e:
        LOG.info(e.msg.rstrip())
        raise e

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
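
# A usage sketch for fasta_extract_exons (hypothetical paths). Passing
# output=None streams the Fasta records to stdout; raw=True writes bare
# sequence without the ">id seqid:start-end" headers:
#
#     fasta_extract_exons("genome.fa", "annotation.db3", "exons.fa")
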
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False, passed=False, quality=False, diploid=False):
    """
    Convert a VCF file into a chain file.

    :param input_file: the input VCF file
    :param fasta_file: the reference Fasta file
    :param strain: name of the strain column to use in the VCF file
    :param output_file: the output chain file
    :param vcf_keep: write discarded VCF records to a file
    :param passed: only process VCF records with a 'PASS' filter
    :param quality: filter on quality, FI=PASS
    :param diploid: don't ignore hets
    :return: Nothing
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None

    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)

    sample_index = None

    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = dict(zip(samples, (x for x in xrange(len(samples)))))
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError("Unknown strain '{0}', valid strains are: {1}".format(strain, ", ".join(samples)))
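
# A usage sketch for vcf2chain (hypothetical paths; 'CAST_EiJ' is an example
# strain column name that must appear in the VCF '#CHROM' header):
#
#     vcf2chain("strain.vcf.gz", "reference.fa", "CAST_EiJ", "ref_to_strain.chain")
#
# The strain is located positionally in the tab-delimited '#CHROM' header
# line: sample columns begin at index 9, so sample_index is the strain's
# offset within elems[9:].
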
def offset2chain(from_file, to_file, output_file):
    """
    Convert Seqnature offset files to a new chain file.

    :param from_file: from Chromosome File (see docs)
    :param to_file: to Chromosome File (see docs)
    :param output_file: the output chain file
    """
    start = time.time()

    from_file = g2g_fu.check_file(from_file)
    to_file = g2g_fu.check_file(to_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file_name)

    LOG.info("FROM FILE: {0}".format(from_file))
    LOG.info("TO FILE: {0}".format(to_file))
    LOG.info("CHAIN FILE: {0}".format(output_file_name))

    LOG.info("Generating chain file...")

    try:
        chromosomes = offset_parse_chromosomes(from_file, to_file)

        for c, chromosome in chromosomes.iteritems():
            LOG.debug('Examining chromosome: {0}'.format(chromosome))

            if chromosome['file_path']:
                offset_chromosome_to_chain(chromosome, output_file)
            else:
                LOG.debug("No file for {0}, so skipping".format(chromosome))

        LOG.info("Chain file created")
    except Exception:
        raise G2GChainFileError("Unable to generate chain file")
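
# A usage sketch for offset2chain (hypothetical paths). Each chromosome that
# has a 'file_path' in the parsed offset data is appended to the chain file
# via offset_chromosome_to_chain(); chromosomes without one are skipped:
#
#     offset2chain("ref.chromosomes.txt", "strain.chromosomes.txt", "ref_to_strain.chain")
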
def convert_bed_file(chain_file, input_file, output_file, reverse=False):
    """
    Convert BED coordinates.

    The mappings of coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param input_file: the input BED file
    :type input_file: string
    :param output_file: the output BED file
    :type output_file: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :return: Nothing
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    bed_out = open(output_file_name, "w")
    bed_unmapped_file = open(unmapped_file_name, "w")

    LOG.info("Converting BED file")

    bed_file = BED(input_file)

    total = 0
    success = 0
    fail = 0

    # BED is 0 based, bx-python is 0 based
    try:
        for record in bed_file:
            # skip over "track" lines
            if not bed_file.current_line_is_bed:
                bed_out.write(bed_file.current_line)
                bed_out.write("\n")
                continue

            total += 1

            mappings = chain_file.find_mappings(record.chrom, record.start, record.end)

            if mappings:
                success += 1
            else:
                # unmapped
                LOG.debug("Fail due to no mappings")
                bed_unmapped_file.write(bed_file.current_line)
                fail += 1
                continue

            start = mappings[0].to_start
            end = mappings[-1].to_end

            LOG.debug("({0}, {1}) => ({2}, {3})".format(record.start, record.end, start, end))

            elems = bed_file.current_line.split()
            elems[1] = start
            elems[2] = end

            bed_out.write("\t".join(map(str, elems)))
            bed_out.write("\n")

        bed_out.close()
        bed_unmapped_file.close()

        LOG.info("Converted {0} of {1} records".format(success, total))
    except G2GLocationError as le:
        LOG.error("{0}: {1}".format(le.message, bed_file.current_line))
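
# A usage sketch for convert_bed_file (hypothetical paths). Records whose
# coordinates cannot be mapped are written to '<output>.unmapped':
#
#     convert_bed_file("ref_to_strain.chain", "regions.bed", "regions.strain.bed")
#
# Only columns 2 and 3 (start/end) are rewritten; BED and bx-python are both
# 0-based, so no off-by-one adjustment is needed here.
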
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Transform a Fasta file by applying the insertions and deletions in a chain file.

    :param fasta_file: the input Fasta file
    :param chain_file: chain file used for conversion
    :param locations: list of locations to process, or None for all
    :param output_file: the output Fasta file
    :param bgzip: compress file in BGZIP format
    :param reverse: reverse direction of original chain file
    :return: Nothing
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")

        for line in chain_file:
            if len(line) > 7:
                LOG.debug("Adding chromosome {0}".format(chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                                                                'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                                                                'header_chain': chain_file.current_chain_header, 'lines': []}
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line
                    fragment = chain_line[0]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Lengths do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break
                else:
                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))

                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)

                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
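
# A usage sketch for fasta_transform (hypothetical paths). locations=None
# transforms every reference sequence in the Fasta index:
#
#     fasta_transform("reference.fa", "ref_to_strain.chain", None, "strain.fa", bgzip=True)
#
# Each chain data line is consumed as (size, dt, dq): 'size' reference bases
# are copied into the new sequence, dq > 0 inserts dq_bases, and dt > 0 skips
# dt reference bases (a deletion).
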
def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies.

    These coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param file_in: the input SAM or BAM file
    :type file_in: string
    :param file_out: the output SAM or BAM file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    if not isinstance(file_in, pysam.Samfile):
        try:
            sam_file = pysam.Samfile(file_in, 'rb')
            if len(sam_file.header) == 0:
                raise G2GBAMError("BAM File has no header information")
        except:
            sam_file = pysam.Samfile(file_in, 'r')
            if len(sam_file.header) == 0:
                raise G2GBAMError("SAM File has no header information")

    LOG.info("Converting BAM file")

    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': 1.0, 'SO': 'coordinate'}

    # replace 'SQ'
    tmp = []
    name_to_id = {}
    id = 0
    for ref_name in sorted(chain_file.chrom_size_to):
        tmp.append({'LN': chain_file.chrom_size_from[ref_name], 'SN': ref_name})
        name_to_id[ref_name] = id
        id += 1
    new_header['SQ'] = tmp

    if 'PG' not in new_header:
        new_header['PG'] = []
    new_header['PG'].append({'ID': 'g2gtools', 'VN': 1.0})

    if 'CO' not in new_header:
        new_header['CO'] = []
    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    dir, temp_file_name = os.path.split(file_out)
    parts = temp_file_name.split('.')
    ext = parts[-1]

    if ext.lower() == 'bam':
        new_file = pysam.Samfile(file_out, 'wb', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wb', template=sam_file)
    elif ext.lower() == 'sam':
        new_file = pysam.Samfile(file_out, 'wh', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wh', template=sam_file)
    else:
        raise G2GBAMError("Unable to create new file based upon file extension")

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {'total': 0, 'fail_cannot_map': 0, 'success_simple': 0, 'success_complex': 0}

    map_statistics_pair = {'total': 0, 'fail_cannot_map': 0,
                           'success_1_fail_2_simple': 0, 'success_1_fail_2_complex': 0,
                           'success_1_simple_2_fail': 0, 'success_1_simple_2_simple': 0, 'success_1_simple_2_complex': 0,
                           'success_1_complex_2_fail': 0, 'success_1_complex_2_simple': 0, 'success_1_complex_2_complex': 0}

    try:
        while True:
            if total and total % 10000 == 0:
                status_success = 0
                status_failed = 0
                for k, v in map_statistics_pair.iteritems():
                    if k.startswith('success'):
                        status_success += v
                    elif k.startswith('fail'):
                        status_failed += v
                LOG.info("Processed {0:,} reads, {1:,} successful, {2:,} failed".format(total, status_success, status_failed))

            alignment = sam_file.next()
            alignment_new = pysam.AlignedRead()
            read_chr = sam_file.getrname(alignment.tid)

            # READ ONLY
            #   aend              aligned reference position of the read on the reference genome
            #   alen              aligned length of the read on the reference genome
            #   positions         a list of reference positions that this read aligns to
            #   qend              end index of the aligned query portion of the sequence (0-based, exclusive)
            #   qlen              length of the aligned query sequence
            #   qqual             aligned query sequence quality values
            #   qstart            start index of the aligned query portion of the sequence (0-based, inclusive)
            #   query             aligned portion of the read, excludes any flanking bases that were soft clipped
            #   rlen              length of the read
            # TRUE / FALSE (setting affects flag)
            #   is_paired         true if read is paired in sequencing
            #   is_proper_pair    true if read is mapped in a proper pair
            #   is_qcfail         true if QC failure
            #   is_read1          true if this is read1
            #   is_read2          true if this is read2
            #   is_reverse        true if read is mapped to reverse strand
            #   is_secondary      true if not primary alignment
            #   is_unmapped       true if read itself is unmapped
            #   mate_is_reverse   true if mate is mapped to reverse strand
            #   mate_is_unmapped  true if the mate is unmapped
            # SET
            #   cigar             cigar as list of tuples
            #   cigarstring       alignment as a string
            #   flag              properties flag
            #   mapq              mapping quality
            #   pnext             the position of the mate
            #   pos               0-based leftmost coordinate
            #   qname             the query name
            #   rnext             the reference id of the mate
            #   seq               read sequence bases, including soft clipped bases
            #   tid               target id, contains the index of the reference sequence in the sequence dictionary
            # DON'T NEED TO SET or SHOULD WE SET?
            #   qual              read sequence base qualities, including soft clipped bases
            #   tags              the tags in the AUX field
            #   tlen              insert size

            total += 1

            LOG.debug('~' * 80)
            LOG.debug("Converting {0} {1} {2} {3}".format(alignment.qname, read_chr, alignment.pos, alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
                continue

            if alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
                continue

            if not alignment.is_paired:
                LOG.debug("SINGLE END ALIGNMENT")
                map_statistics['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_NONE
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                read_start = alignment.pos
                read_end = alignment.aend
                read_strand = '-' if alignment.is_reverse else '+'

                mappings = chain_file.find_mappings(read_chr, read_start, read_end)

                # unmapped
                if mappings is None:
                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics['fail_cannot_map'] += 1
                elif len(mappings) == 1:
                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_simple'] += 1
                else:
                    LOG.debug("MAPPINGS: {0}".format(len(mappings)))
                    for m in mappings:
                        LOG.debug("> {0}".format(m))

                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read_strand, alignment.pos)
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_complex'] += 1
            else:
                LOG.debug("PAIRED END ALIGNMENT")
                map_statistics_pair['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_PAIRED
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                if alignment.is_read1:
                    alignment_new.flag |= FLAG_READ1
                if alignment.is_read2:
                    alignment_new.flag |= FLAG_READ2
                if alignment.is_reverse:
                    alignment_new.flag |= FLAG_REVERSE
                if alignment.mate_is_reverse:
                    alignment_new.flag |= FLAG_MREVERSE

                read1_chr = sam_file.getrname(alignment.tid)
                read1_start = alignment.pos
                read1_end = alignment.aend
                read1_strand = '-' if alignment.is_reverse else '+'
                read1_mappings = chain_file.find_mappings(read1_chr, read1_start, read1_end)  #, read1_strand)

                read2_chr = None
                read2_start = None
                read2_end = None
                read2_strand = None
                read2_mappings = None

                if alignment.mate_is_unmapped:
                    alignment_new.flag |= FLAG_MUNMAP
                else:
                    read2_chr = sam_file.getrname(alignment.rnext)
                    read2_start = alignment.pnext
                    read2_end = read2_start + 1
                    read2_strand = '-' if alignment.mate_is_reverse else '+'

                    try:
                        read2_mappings = chain_file.find_mappings(read2_chr, read2_start, read2_end)
                    except:
                        read2_mappings = None

                if read1_mappings is None and read2_mappings is None:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.flag |= FLAG_MUNMAP

                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics_pair['fail_cannot_map'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_simple'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings is None:
                    alignment_new.flag |= FLAG_MUNMAP
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings is None:
                    alignment_new.flag |= FLAG_MUNMAP
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_complex'] += 1

                else:
                    raise G2GBAMError("Unknown BAM/SAM conversion/parse situation")

    except StopIteration:
        LOG.info("All reads processed")

        LOG.info("  {:>10} TOTAL ENTRIES".format(total))
        LOG.info("  {:>10} TOTAL UNMAPPED".format(total_unmapped))
        LOG.info("  {:>10} TOTAL FAIL QC".format(total_fail_qc))

        if map_statistics['total'] > 0:
            LOG.info("")
            LOG.info("Mapping Summary Single End")
            LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics['total']))
            LOG.info("")
            LOG.info("  {:>10} TOTAL SUCCESS".format(map_statistics['success_simple'] + map_statistics['success_complex']))
            LOG.info("  {:>10} Simple".format(map_statistics['success_simple']))
            LOG.info("  {:>10} Complex".format(map_statistics['success_complex']))
            LOG.info("")
            LOG.info("  {:>10} TOTAL FAILURES".format(map_statistics['fail_cannot_map']))
            LOG.info("  {:>10} Cannot Map".format(map_statistics['fail_cannot_map']))

        if map_statistics_pair['total'] > 0:
            total_success = 0
            for k, v in map_statistics_pair.iteritems():
                if k.startswith('success'):
                    total_success += v

            LOG.info("")
            LOG.info("Mapping Summary Paired End")
            LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
            LOG.info("")
            LOG.info("  {:>10} TOTAL SUCCESS".format(total_success))
            LOG.info("  {:>10} Read 1 Failed, Read 2 Simple".format(map_statistics_pair['success_1_fail_2_simple']))
            LOG.info("  {:>10} Read 1 Failed, Read 2 Complex".format(map_statistics_pair['success_1_fail_2_complex']))
            LOG.info("  {:>10} Read 1 Simple, Read 2 Failed".format(map_statistics_pair['success_1_simple_2_fail']))
            LOG.info("  {:>10} Read 1 Simple, Read 2 Simple".format(map_statistics_pair['success_1_simple_2_simple']))
            LOG.info("  {:>10} Read 1 Simple, Read 2 Complex".format(map_statistics_pair['success_1_simple_2_complex']))
            LOG.info("  {:>10} Read 1 Complex, Read 2 Failed".format(map_statistics_pair['success_1_complex_2_fail']))
            LOG.info("  {:>10} Read 1 Complex, Read 2 Simple".format(map_statistics_pair['success_1_complex_2_simple']))
            LOG.info("  {:>10} Read 1 Complex, Read 2 Complex".format(map_statistics_pair['success_1_complex_2_complex']))
            LOG.info("")
            LOG.info("  {:>10} TOTAL FAILURES".format(map_statistics_pair['fail_cannot_map']))
            LOG.info("  {:>10} Cannot Map".format(map_statistics_pair['fail_cannot_map']))

        LOG.info("")
        LOG.info("BAM File Converted")
def db2chain(chain_file, input_file, output_file, chain_genes=False):
    """
    Create a chain file for genes or transcripts from a GTF database.

    :param chain_file: chain file used for conversion
    :param input_file: the input database file (see gtf2db)
    :param output_file: the output chain file
    :param chain_genes: chain genes instead of transcripts
    :return: Nothing
    """
    start = time.time()

    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))

    if chain_genes:
        LOG.info("CHAIN TYPE: GENES")
    else:
        LOG.info("CHAIN TYPE: TRANSCRIPTS")

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file)
        LOG.info("Chain file parsed")

    LOG.info('Creating new chain file...')

    if chain_genes:
        LOG.debug("Generating chain for genes")

        for chromosome in chain_file.get_seqids():
            LOG.debug("Generating chain for genes in chromosome {0}".format(chromosome))

            for i, gene in enumerate(get_genes_simple(input_file, location=Location(chromosome))):
                LOG.debug("\n{0}".format(gene))

                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None

                mappings = chain_file.find_mappings(gene.seqid, gene.start, gene.end)

                if gene.strand == 1:
                    if mappings and len(mappings) > 0:
                        if not from_start:
                            from_start = mappings[0].from_start
                            to_start = mappings[0].to_start

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()
                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            for m in mappings:
                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    prev_dt = dt
                                    prev_dq = dq

                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    dt = m.from_start - prev_mapping.from_end
                                    dq = m.to_start - prev_mapping.to_end

                                    if dt > 0:
                                        chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])

                                    prev_mapping = m

                            chain_size = mappings[-1].from_end - mappings[-1].from_start

                            if dt > 0:
                                chain_size += dq
                                sum_size += dq

                            c.lines.append([chain_size])
                            chain_entries.append(c)
                else:
                    if mappings and len(mappings) > 0:
                        if not from_end:
                            from_end = mappings[-1].from_end
                            to_end = mappings[-1].to_end

                        if len(mappings) == 1:
                            m = mappings[0]
                            c = ChainEntry()
                            c.lines.append([m.from_end - m.from_start])
                            chain_entries.append(c)
                        else:
                            c = ChainEntry()
                            prev_mapping = None
                            sum_size = 0
                            sum_dq = 0
                            sum_dt = 0
                            dq = 0
                            prev_dq = 0
                            dt = 0
                            prev_dt = 0

                            # reverse
                            mappings = mappings[::-1]

                            for m in mappings:
                                LOG.debug("CURRENT MAPPING: {0}".format(m))

                                if not prev_mapping:
                                    prev_mapping = m
                                else:
                                    LOG.debug("PREV MAPPING: {0}".format(prev_mapping))
                                    prev_dt = dt
                                    prev_dq = dq

                                    chain_size = prev_mapping.from_end - prev_mapping.from_start
                                    #dt = m.from_start - prev_mapping.from_end
                                    #dq = m.to_start - prev_mapping.to_end
                                    dt = prev_mapping.from_start - m.from_end
                                    dq = prev_mapping.to_start - m.to_end

                                    LOG.debug("dt={0}, dq={1}".format(dt, dq))

                                    #if dt > 0:
                                    #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                    #    chain_size += prev_dq

                                    sum_size += chain_size
                                    sum_dq += dq
                                    sum_dt += dt

                                    c.lines.append([chain_size, dt, dq])
                                    LOG.debug(c.lines[-1])

                                    prev_mapping = m

                            LOG.debug("finding last...{0}".format(mappings[-1]))
                            chain_size = mappings[-1].from_end - mappings[-1].from_start

                            #if dt > 0:
                            #    LOG.debug("WHOA {0}".format(dt))
                            #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                            #    chain_size += dq
                            #    sum_size += dq

                            c.lines.append([chain_size])
                            LOG.debug(c.lines[-1])
                            chain_entries.append(c)

                if chain_entries and len(chain_entries) > 0:
                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(CHAIN_STRING.format(CHAIN_STRING,
                                                   from_chr=gene.seqid, from_length=sum_size + sum_dq,
                                                   from_start=0, from_end=sum_size + sum_dq,
                                                   to_chr=gene.seqid, to_length=sum_size + sum_dt,
                                                   to_start=0, to_end=sum_size + sum_dt,
                                                   id=gene.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")
                    outf.close()
    else:
        for chromosome in chain_file.get_seqids():
            LOG.debug("Generating chain for transcripts in chromosome {0}".format(chromosome))

            for i, transcript in enumerate(get_transcripts_simple(input_file, location=Location(chromosome))):
                LOG.debug("Transcript = {0}".format(transcript))

                chain_entries = []
                from_start = None
                to_start = None
                from_end = None
                to_end = None

                transcript.exons = OrderedDict(sorted(transcript.exons.items(), key=lambda x: x[1].exon_number))

                for ensembl_id, exon in transcript.exons.iteritems():
                    LOG.debug("Exon = {0}".format(exon))

                    mappings = chain_file.find_mappings(exon.seqid, exon.start, exon.end)

                    if exon.strand == 1:
                        if mappings and len(mappings) > 0:
                            if not from_start:
                                from_start = mappings[0].from_start
                                to_start = mappings[0].to_start

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()
                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                for m in mappings:
                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        prev_dt = dt
                                        prev_dq = dq

                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        dt = m.from_start - prev_mapping.from_end
                                        dq = m.to_start - prev_mapping.to_end

                                        if dt > 0:
                                            chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])

                                        prev_mapping = m

                                chain_size = mappings[-1].from_end - mappings[-1].from_start

                                if dt > 0:
                                    chain_size += dq
                                    sum_size += dq

                                c.lines.append([chain_size])
                                chain_entries.append(c)
                    else:
                        if mappings and len(mappings) > 0:
                            if not from_end:
                                from_end = mappings[-1].from_end
                                to_end = mappings[-1].to_end

                            if len(mappings) == 1:
                                m = mappings[0]
                                c = ChainEntry()
                                c.lines.append([m.from_end - m.from_start])
                                chain_entries.append(c)
                            else:
                                c = ChainEntry()
                                prev_mapping = None
                                sum_size = 0
                                sum_dq = 0
                                sum_dt = 0
                                dq = 0
                                prev_dq = 0
                                dt = 0
                                prev_dt = 0

                                # reverse
                                mappings = mappings[::-1]

                                for m in mappings:
                                    LOG.debug("CURRENT MAPPING: {0}".format(m))

                                    if not prev_mapping:
                                        prev_mapping = m
                                    else:
                                        LOG.debug("PREV MAPPING: {0}".format(prev_mapping))
                                        prev_dt = dt
                                        prev_dq = dq

                                        chain_size = prev_mapping.from_end - prev_mapping.from_start
                                        #dt = m.from_start - prev_mapping.from_end
                                        #dq = m.to_start - prev_mapping.to_end
                                        dt = prev_mapping.from_start - m.from_end
                                        dq = prev_mapping.to_start - m.to_end

                                        LOG.debug("dt={0}, dq={1}".format(dt, dq))

                                        #if dt > 0:
                                        #    LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size))
                                        #    chain_size += prev_dq

                                        sum_size += chain_size
                                        sum_dq += dq
                                        sum_dt += dt

                                        c.lines.append([chain_size, dt, dq])
                                        LOG.debug(c.lines[-1])

                                        prev_mapping = m

                                LOG.debug("finding last...{0}".format(mappings[-1]))
                                chain_size = mappings[-1].from_end - mappings[-1].from_start

                                #if dt > 0:
                                #    LOG.debug("WHOA {0}".format(dt))
                                #    chain_size += dq
                                #    sum_size += dq

                                c.lines.append([chain_size])
                                LOG.debug(c.lines[-1])
                                chain_entries.append(c)

                # collapse exons
                if chain_entries and len(chain_entries) > 0:
                    LOG.debug('>>>>>>>')
                    for c in chain_entries:
                        LOG.debug(str(c))
                    LOG.debug('>>>>>>>')

                    chain_entries = collapse_entries(chain_entries)

                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(CHAIN_STRING.format(CHAIN_STRING,
                                                   from_chr=transcript.seqid, from_length=sum_size + sum_dq,
                                                   from_start=0, from_end=sum_size + sum_dq,
                                                   to_chr=transcript.seqid, to_length=sum_size + sum_dt,
                                                   to_start=0, to_end=sum_size + sum_dt,
                                                   id=transcript.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")
                    outf.close()

    LOG.info('New chain file created')
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
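
# A usage sketch for db2chain (hypothetical paths; the input database is one
# produced by gtf2db()):
#
#     db2chain("ref_to_strain.chain", "annotation.db3", "transcripts.chain")
#     db2chain("ref_to_strain.chain", "annotation.db3", "genes.chain", chain_genes=True)
#
# In transcript mode, per-exon chain entries are merged with collapse_entries()
# before a single chain record is written per transcript.
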
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into SQLite

    :param input_file: the GTF file to convert
    :param output_file: the generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB FILE: {0}".format(output_file))

    conn = sqlite3.connect(output_file)
    c = conn.cursor()

    LOG.debug("Generating tables")
    c.execute(SQL_CREATE_GTF_TABLE)
    c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
    c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
    c.execute(SQL_CREATE_GTF_TYPES_TABLE)
    c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

    gtf_types = {}
    gtf_sources = {}
    gtf_attributes = {}

    LOG.info("Parsing GTF file...")

    gtf_file = GTF(input_file)

    counter = 0

    for record in gtf_file:
        if counter and counter % 100000 == 0:
            LOG.info("Processed {0:,} records".format(counter))

        if record.type not in gtf_types:
            _type_key = len(gtf_types.keys())
            gtf_types[record.type] = _type_key
        else:
            _type_key = gtf_types[record.type]

        if record.source not in gtf_sources:
            _source_key = len(gtf_sources.keys())
            gtf_sources[record.source] = _source_key
        else:
            _source_key = gtf_sources[record.source]

        strand = 0
        if record.strand in ['+', '-']:
            strand = 1 if record.strand == '+' else -1

        gene_id = record.attributes['gene_id']
        transcript_id = record.attributes['transcript_id'] if 'transcript_id' in record.attributes else None

        ensembl_id = None

        if record.type == 'gene':
            ensembl_id = record.attributes['gene_id']
        elif record.type == 'transcript':
            ensembl_id = record.attributes['transcript_id']
        elif record.type == 'exon':
            ensembl_id = record.attributes['exon_id']
        else:
            ensembl_id = record.attributes['protein_id'] if 'protein_id' in record.attributes else None

        c.execute(SQL_INSERT_GTF_TABLE, (gene_id, transcript_id, ensembl_id, record.seqid, record.start, record.end, strand, record.score, _source_key, _type_key, record.frame))

        gtf_key = c.lastrowid

        for attribute, value in record.attributes.iteritems():
            if attribute not in ['gene_id', 'transcript_id', 'exon_id']:
                if attribute not in gtf_attributes:
                    _attribute_key = len(gtf_attributes.keys())
                    gtf_attributes[attribute] = _attribute_key
                else:
                    _attribute_key = gtf_attributes[attribute]

                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE, (gtf_key, _attribute_key, value))

        counter += 1

    # save (commit) the changes
    conn.commit()

    for source, _key in gtf_sources.iteritems():
        c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))
    conn.commit()

    for type, _key in gtf_types.iteritems():
        c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, type))
    conn.commit()

    for attribute, _key in gtf_attributes.iteritems():
        c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))
    conn.commit()

    LOG.info("GTF file parsed")

    LOG.info("Finalizing database...")

    for sql in SQL_INDICES_GTF:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_LOOKUP:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_TYPES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_SOURCES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_ATTRIBUTES:
        LOG.debug(sql)
        c.execute(sql)

    LOG.info("Database created")

    # close connection
    conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
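
# A usage sketch for gtf2db (hypothetical paths):
#
#     gtf2db("annotation.gtf", "annotation.db3")
#
# Sources, types, and attribute names are interned into integer keys as they
# are first seen, then flushed to their lookup tables in one commit each;
# indices are built last so bulk inserts stay fast.
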
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables

    :param filename_fasta: name of the input Fasta file
    :param filename_output: name of the output Fasta file
    :param bgzip: compress file in BGZIP format
    :param diploid: don't ignore hets and create 2 files
    :return: the left and right output file names
    """
    filename_output = g2g_fu.check_file(filename_output, 'w')
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith('.gz'):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    if not new_filename_output.lower().endswith('.fa'):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, 'l')
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, 'r')
        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None
        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension
    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith('.fa.gz'):
        # decompress the fasta file if it is compressed
        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)          # something.fa.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)                    # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]                                # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)
    elif filename_fasta.lower().endswith('.fa'):
        LOG.debug("File is not compressed")
        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index
    pysam.FastaFile(filename_output_l)

    return filename_output_l, filename_output_r
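
# A usage sketch for prepare_fasta_patch (hypothetical paths). With
# diploid=True, prepend_before_extension() inserts 'l'/'r' into the output
# names (e.g. something like 'patched.l.fa' and 'patched.r.fa'); otherwise
# only the left name is used and the right is None:
#
#     left, right = prepare_fasta_patch("genome.fa.gz", "patched.fa", diploid=True)
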
def fasta_patch(filename_fasta, filename_vcf, strain, filename_output, bgzip=False, num_processes=None, pass_only=False, quality=False, diploid=False):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in the VCF file.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of strain to use in VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: only process those VCF records with a 'PASS'
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: don't ignore hets and create 2 files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))

    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(filename_fasta, filename_vcf, strain, filename_output_l, filename_output_r, num_processes, pass_only, quality, diploid)
        LOG.info("Patching complete")

        # remove the fai
        LOG.debug("Removing the FAI index for {0}".format(filename_output_l))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), 'fa')
            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), 'fa')

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        raise G2GError("")
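
# A usage sketch for fasta_patch (hypothetical paths and strain name):
#
#     fasta_patch("reference.fa", "snps.vcf.gz", "CAST_EiJ", "strain.fa",
#                 bgzip=True, pass_only=True)
#
# The actual SNP substitution is expected to happen in patch() across
# num_processes workers; this wrapper stages the output file names via
# prepare_fasta_patch() and handles the final bgzip compression and indexing.
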
def convert_gtf_file(chain_file, input_file, output_file, reverse=False):
    """
    Convert GTF coordinates.

    The mappings of coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param input_file: the input GTF file
    :type input_file: string
    :param output_file: the output GTF file
    :type output_file: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :return: Nothing
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    input_file = g2g_fu.check_file(input_file)
    output_file_name = g2g_fu.check_file(output_file, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(input_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    gtf_out = open(output_file_name, "w")
    gtf_unmapped_file = open(unmapped_file_name, "w")

    LOG.info("Converting GTF file...")

    gtf_file = GTF(input_file)

    total = 0
    success = 0
    fail = 0

    # GTF is 1 based, bx-python is 0 based.
    # When we do the querying, we subtract 1 from the GTF file start position.
    # K.B.  Also note in gtf when (s, e) is given...it should mean s <= x <= e.
    #       bx-python (s, e) does it s <= x < e.

    for record in gtf_file:
        LOG.debug("\nORIGINAL: {0}".format(str(gtf_file.current_line).strip()))

        total += 1

        if total % 100000 == 0:
            LOG.info("Processed {0:,} lines".format(total))

        mappings = chain_file.find_mappings(record.seqid, record.start - 1, record.end)

        # unmapped
        if mappings is None:
            LOG.debug("\tFail due to no mappings")
            gtf_unmapped_file.write(gtf_file.current_line)
            fail += 1
            continue
        else:
            LOG.debug("{0} mappings found".format(len(mappings)))

        success += 1
        start = mappings[0].to_start + 1
        end = mappings[-1].to_end

        LOG.debug("({0}, {1}) => ({2}, {3})".format(record.start - 1, record.end, start, end))

        elems = gtf_file.current_line.rstrip().split('\t')
        elems[3] = start
        elems[4] = end

        LOG.debug("     NEW: {0}".format("\t".join(map(str, elems))))

        gtf_out.write("\t".join(map(str, elems)))
        gtf_out.write("\n")

    gtf_out.close()
    gtf_unmapped_file.close()

    LOG.info("Converted {0:,} of {1:,} records".format(success, total))
    LOG.info('GTF file converted')
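
# A usage sketch for convert_gtf_file (hypothetical paths). GTF is 1-based and
# end-inclusive while the chain lookup is 0-based half-open, hence the
# 'record.start - 1' on the query and the 'to_start + 1' on output:
#
#     convert_gtf_file("ref_to_strain.chain", "genes.gtf", "genes.strain.gtf")
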
def db2chain(chain_file, input_file, output_file, chain_genes=False): """ :param chain_file: :param input_file: :param output_file: :param chain_genes: :return: """ start = time.time() if not isinstance(chain_file, ChainFile): chain_file = g2g_fu.check_file(chain_file) input_file = g2g_fu.check_file(input_file) output_file_name = g2g_fu.check_file(output_file, 'w') LOG.info("CHAIN FILE: {0}".format(chain_file)) LOG.info("INPUT FILE: {0}".format(input_file)) LOG.info("OUTPUT FILE: {0}".format(output_file_name)) if chain_genes: LOG.info("CHAIN TYPE: GENES") else: LOG.info("CHAIN TYPE: TRANSCRIPTS") if not isinstance(chain_file, ChainFile): LOG.info("Parsing chain file...") chain_file = ChainFile(chain_file) LOG.info("Chain file parsed") LOG.info('Creating new chain file...') if chain_genes: LOG.debug("Generating chain for genes") for chromosome in chain_file.get_seqids(): LOG.debug("Generating chain for genes in chromosome {0}".format( chromosome)) for i, gene in enumerate( get_genes_simple(input_file, location=Location(chromosome))): LOG.debug("\n{0}".format(gene)) chain_entries = [] from_start = None to_start = None from_end = None to_end = None mappings = chain_file.find_mappings(gene.seqid, gene.start, gene.end) if gene.strand == 1: if mappings and len(mappings) > 0: if not from_start: from_start = mappings[0].from_start to_start = mappings[0].to_start if len(mappings) == 1: m = mappings[0] c = ChainEntry() c.lines.append([m.from_end - m.from_start]) chain_entries.append(c) else: c = ChainEntry() prev_mapping = None sum_size = 0 sum_dq = 0 sum_dt = 0 dq = 0 prev_dq = 0 dt = 0 prev_dt = 0 for m in mappings: if not prev_mapping: prev_mapping = m else: prev_dt = dt prev_dq = dq chain_size = prev_mapping.from_end - prev_mapping.from_start dt = m.from_start - prev_mapping.from_end dq = m.to_start - prev_mapping.to_end if dt > 0: chain_size += prev_dq sum_size += chain_size sum_dq += dq sum_dt += dt c.lines.append([chain_size, dt, dq]) LOG.debug(c.lines[-1]) prev_mapping = m chain_size = mappings[-1].from_end - mappings[ -1].from_start if dt > 0: chain_size += dq sum_size += dq c.lines.append([chain_size]) chain_entries.append(c) else: if mappings and len(mappings) > 0: if not from_end: from_end = mappings[-1].from_end to_end = mappings[-1].to_end if len(mappings) == 1: m = mappings[0] c = ChainEntry() c.lines.append([m.from_end - m.from_start]) chain_entries.append(c) else: c = ChainEntry() prev_mapping = None sum_size = 0 sum_dq = 0 sum_dt = 0 dq = 0 prev_dq = 0 dt = 0 prev_dt = 0 # reverse mappings = mappings[::-1] for m in mappings: LOG.debug("CURRENT MAPPING: {0}".format(m)) if not prev_mapping: prev_mapping = m else: LOG.debug("PREV MAPPING: {0}".format( prev_mapping)) prev_dt = dt prev_dq = dq chain_size = prev_mapping.from_end - prev_mapping.from_start #dt = m.from_start - prev_mapping.from_end #dq = m.to_start - prev_mapping.to_end dt = prev_mapping.from_start - m.from_end dq = prev_mapping.to_start - m.to_end LOG.debug("dt={0}, dq={1}".format(dt, dq)) #if dt > 0: # LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size)) # chain_size += prev_dq sum_size += chain_size sum_dq += dq sum_dt += dt c.lines.append([chain_size, dt, dq]) LOG.debug(c.lines[-1]) prev_mapping = m LOG.debug("finding last...{0}".format( mappings[-1])) chain_size = mappings[-1].from_end - mappings[ -1].from_start #if dt > 0: # LOG.debug("WHOA {0}".format(dt)) # LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size)) # chain_size += dq # sum_size += dq c.lines.append([chain_size]) 
LOG.debug(c.lines[-1]) chain_entries.append(c) if chain_entries and len(chain_entries) > 0: sum_size = 0 sum_dq = 0 sum_dt = 0 lines = [] for line in chain_entries[0].lines: sum_size += line[0] if len(line) > 1: sum_dq += line[1] sum_dt += line[2] lines.append('\t'.join(map(str, line))) if output_file: outf = open(output_file, "a") else: outf = sys.stdout outf.write( CHAIN_STRING.format(CHAIN_STRING, from_chr=gene.seqid, from_length=sum_size + sum_dq, from_start=0, from_end=sum_size + sum_dq, to_chr=gene.seqid, to_length=sum_size + sum_dt, to_start=0, to_end=sum_size + sum_dt, id=gene.ensembl_id)) outf.write("\n") outf.write("\n".join(lines)) outf.write("\n") outf.close() else: for chromosome in chain_file.get_seqids(): LOG.debug( "Generating chain for transcripts in chromosome {0}".format( chromosome)) for i, transcript in enumerate( get_transcripts_simple(input_file, location=Location(chromosome))): LOG.debug("Transcript = {0}".format(transcript)) chain_entries = [] from_start = None to_start = None from_end = None to_end = None transcript.exons = OrderedDict( sorted(transcript.exons.items(), key=lambda x: x[1].exon_number)) for ensembl_id, exon in transcript.exons.iteritems(): LOG.debug("Exon = {0}".format(exon)) mappings = chain_file.find_mappings( exon.seqid, exon.start, exon.end) if exon.strand == 1: if mappings and len(mappings) > 0: if not from_start: from_start = mappings[0].from_start to_start = mappings[0].to_start if len(mappings) == 1: m = mappings[0] c = ChainEntry() c.lines.append([m.from_end - m.from_start]) chain_entries.append(c) else: c = ChainEntry() prev_mapping = None sum_size = 0 sum_dq = 0 sum_dt = 0 dq = 0 prev_dq = 0 dt = 0 prev_dt = 0 for m in mappings: if not prev_mapping: prev_mapping = m else: prev_dt = dt prev_dq = dq chain_size = prev_mapping.from_end - prev_mapping.from_start dt = m.from_start - prev_mapping.from_end dq = m.to_start - prev_mapping.to_end if dt > 0: chain_size += prev_dq sum_size += chain_size sum_dq += dq sum_dt += dt c.lines.append([chain_size, dt, dq]) LOG.debug(c.lines[-1]) prev_mapping = m chain_size = mappings[-1].from_end - mappings[ -1].from_start if dt > 0: chain_size += dq sum_size += dq c.lines.append([chain_size]) chain_entries.append(c) else: if mappings and len(mappings) > 0: if not from_end: from_end = mappings[-1].from_end to_end = mappings[-1].to_end if len(mappings) == 1: m = mappings[0] c = ChainEntry() c.lines.append([m.from_end - m.from_start]) chain_entries.append(c) else: c = ChainEntry() prev_mapping = None sum_size = 0 sum_dq = 0 sum_dt = 0 dq = 0 prev_dq = 0 dt = 0 prev_dt = 0 # reverse mappings = mappings[::-1] for m in mappings: LOG.debug("CURRENT MAPPING: {0}".format(m)) if not prev_mapping: prev_mapping = m else: LOG.debug("PREV MAPPING: {0}".format( prev_mapping)) prev_dt = dt prev_dq = dq chain_size = prev_mapping.from_end - prev_mapping.from_start #dt = m.from_start - prev_mapping.from_end #dq = m.to_start - prev_mapping.to_end dt = prev_mapping.from_start - m.from_end dq = prev_mapping.to_start - m.to_end LOG.debug("dt={0}, dq={1}".format( dt, dq)) #if dt > 0: # LOG.debug("DT > 0, ADDING to current chain_size {0}".format(chain_size)) # chain_size += prev_dq sum_size += chain_size sum_dq += dq sum_dt += dt c.lines.append([chain_size, dt, dq]) LOG.debug(c.lines[-1]) prev_mapping = m LOG.debug("finding last...{0}".format( mappings[-1])) chain_size = mappings[-1].from_end - mappings[ -1].from_start #if dt > 0: # LOG.debug("WHOA {0}".format(dt)) # chain_size += dq # sum_size += dq c.lines.append([chain_size]) 
                                LOG.debug(c.lines[-1])
                                chain_entries.append(c)

                # collapse exons
                if chain_entries:
                    LOG.debug('>>>>>>>')
                    for c in chain_entries:
                        LOG.debug(str(c))
                    LOG.debug('>>>>>>>')

                    chain_entries = collapse_entries(chain_entries)

                    sum_size = 0
                    sum_dq = 0
                    sum_dt = 0
                    lines = []

                    for line in chain_entries[0].lines:
                        sum_size += line[0]
                        if len(line) > 1:
                            sum_dq += line[1]
                            sum_dt += line[2]
                        lines.append('\t'.join(map(str, line)))

                    if output_file:
                        outf = open(output_file, "a")
                    else:
                        outf = sys.stdout

                    outf.write(CHAIN_STRING.format(CHAIN_STRING,
                                                   from_chr=transcript.seqid,
                                                   from_length=sum_size + sum_dq,
                                                   from_start=0,
                                                   from_end=sum_size + sum_dq,
                                                   to_chr=transcript.seqid,
                                                   to_length=sum_size + sum_dt,
                                                   to_start=0,
                                                   to_end=sum_size + sum_dt,
                                                   id=transcript.ensembl_id))
                    outf.write("\n")
                    outf.write("\n".join(lines))
                    outf.write("\n")

                    # only close real files, never sys.stdout
                    if output_file:
                        outf.close()

    LOG.info('New chain file created')
    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
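
# Illustrative sketch (not part of the module API): how the [size, dt, dq]
# triplets built above encode a chain. For two adjacent plus-strand mappings,
# the block size is the span of the first mapping on the source, dt is the gap
# to the next block on the source, and dq is the gap on the target. The
# Mapping namedtuple below is a hypothetical stand-in for the objects returned
# by ChainFile.find_mappings():
#
#     from collections import namedtuple
#     Mapping = namedtuple('Mapping', 'from_start from_end to_start to_end')
#     m1 = Mapping(100, 150, 200, 250)     # first aligned block, 50 bases
#     m2 = Mapping(160, 200, 262, 302)     # next aligned block
#     size = m1.from_end - m1.from_start   # 50
#     dt = m2.from_start - m1.from_end     # 10 bases skipped on the source
#     dq = m2.to_start - m1.to_end         # 12 bases skipped on the target
#     # emitted chain line: "50\t10\t12", closed later by a bare size line
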
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Transform a Fasta file into a new coordinate system using a chain file.

    :param fasta_file: the input Fasta file (or FastaFile object)
    :param chain_file: the chain file (or ChainIter object) used for conversion
    :param locations: a list of locations to transform, or None for all sequences
    :param output_file: the name of the output Fasta file
    :param bgzip: compress the output in BGZIP format
    :param reverse: reverse the direction of the chain file
    :return: Nothing
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []

        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)

        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")

        for line in chain_file:
            if len(line) > 7:
                # chain header line
                LOG.debug("Adding chromosome {0}".format(chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {
                    'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                    'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                    'header_chain': chain_file.current_chain_header, 'lines': []}
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line of the chain entry
                    fragment = chain_line[0]
                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Lengths do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break
                else:
                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))

                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)

                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # flush the transformed sequences before compressing/indexing
        fasta_out.close()

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
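
# Hedged usage sketch: transform a reference Fasta into strain coordinates
# using a chain file such as one produced by vcf2chain. All file names are
# hypothetical:
#
#     fasta_transform("reference.fa", "REF-to-CAST_EiJ.chain", None,
#                     "CAST_EiJ.fa", bgzip=True)
#
# Passing locations=None (as above) transforms every sequence in the Fasta;
# a list such as ["1:1-1000000"] restricts the transform to those regions.
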
def gtf2db(input_file, output_file):
    """
    Convert a GTF file into a SQLite database.

    :param input_file: the GTF file to convert
    :param output_file: the name of the generated database file
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file, 'r')
    output_file = g2g_fu.check_file(output_file, 'w')

    g2g_fu.delete_file(output_file)

    LOG.info("GTF FILE: {0}".format(input_file))
    LOG.info("DB FILE: {0}".format(output_file))

    conn = sqlite3.connect(output_file)
    c = conn.cursor()

    LOG.debug("Generating tables")
    c.execute(SQL_CREATE_GTF_TABLE)
    c.execute(SQL_CREATE_GTF_LOOKUP_TABLE)
    c.execute(SQL_CREATE_GTF_SOURCES_TABLE)
    c.execute(SQL_CREATE_GTF_TYPES_TABLE)
    c.execute(SQL_CREATE_GTF_ATTRIBUTES_TABLE)

    gtf_types = {}
    gtf_sources = {}
    gtf_attributes = {}

    LOG.info("Parsing GTF file...")

    gtf_file = GTF(input_file)

    counter = 0

    for record in gtf_file:
        if counter and counter % 100000 == 0:
            LOG.info("Processed {0:,} records".format(counter))

        # assign a compact integer key to each distinct type and source
        if record.type not in gtf_types:
            _type_key = len(gtf_types)
            gtf_types[record.type] = _type_key
        else:
            _type_key = gtf_types[record.type]

        if record.source not in gtf_sources:
            _source_key = len(gtf_sources)
            gtf_sources[record.source] = _source_key
        else:
            _source_key = gtf_sources[record.source]

        strand = 0
        if record.strand in ['+', '-']:
            strand = 1 if record.strand == '+' else -1

        gene_id = record.attributes['gene_id']
        transcript_id = record.attributes['transcript_id'] if 'transcript_id' in record.attributes else None

        if record.type == 'gene':
            ensembl_id = record.attributes['gene_id']
        elif record.type == 'transcript':
            ensembl_id = record.attributes['transcript_id']
        elif record.type == 'exon':
            ensembl_id = record.attributes['exon_id']
        else:
            ensembl_id = record.attributes['protein_id'] if 'protein_id' in record.attributes else None

        c.execute(SQL_INSERT_GTF_TABLE, (gene_id, transcript_id, ensembl_id, record.seqid, record.start, record.end, strand, record.score, _source_key, _type_key, record.frame))

        gtf_key = c.lastrowid

        for attribute, value in record.attributes.iteritems():
            if attribute not in ['gene_id', 'transcript_id', 'exon_id']:
                if attribute not in gtf_attributes:
                    _attribute_key = len(gtf_attributes)
                    gtf_attributes[attribute] = _attribute_key
                else:
                    _attribute_key = gtf_attributes[attribute]

                c.execute(SQL_INSERT_GTF_LOOKUP_TABLE, (gtf_key, _attribute_key, value))

        counter += 1

    # save (commit) the changes
    conn.commit()

    for source, _key in gtf_sources.iteritems():
        c.execute(SQL_INSERT_GTF_SOURCES_TABLE, (_key, source))
    conn.commit()

    for _type, _key in gtf_types.iteritems():
        c.execute(SQL_INSERT_GTF_TYPES_TABLE, (_key, _type))
    conn.commit()

    for attribute, _key in gtf_attributes.iteritems():
        c.execute(SQL_INSERT_GTF_ATTRIBUTES_TABLE, (_key, attribute))
    conn.commit()

    LOG.info("GTF file parsed")

    LOG.info("Finalizing database...")

    for sql in SQL_INDICES_GTF:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_LOOKUP:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_TYPES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_SOURCES:
        LOG.debug(sql)
        c.execute(sql)

    for sql in SQL_INDICES_GTF_ATTRIBUTES:
        LOG.debug(sql)
        c.execute(sql)

    LOG.info("Database created")

    # close connection
    conn.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
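
# Hedged usage sketch: convert a GTF file to SQLite and sanity-check the
# result. The GTF path is hypothetical, and the table name 'gtf' is an
# assumption inferred from the SQL_CREATE_GTF_TABLE constant used above:
#
#     gtf2db("Mus_musculus.GRCm38.gtf", "annotations.db3")
#
#     import sqlite3
#     conn = sqlite3.connect("annotations.db3")
#     print(conn.execute("SELECT COUNT(*) FROM gtf").fetchone()[0])
#     conn.close()
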
def fasta_patch(filename_fasta, filename_vcf, strain, filename_output, bgzip=False, num_processes=None, pass_only=False, quality=False, diploid=False):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in
    the VCF file.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of the strain to use in the VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress the file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: only process those VCF records with a 'PASS' filter
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: do not ignore hets; create two files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))

    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(filename_fasta, filename_vcf, strain, filename_output_l, filename_output_r, num_processes, pass_only, quality, diploid)

        LOG.info("Patching complete")

        # remove the FAI index
        LOG.debug("Removing the FAI index for {0}".format(filename_output_l))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), 'fa')

            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), 'fa')

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        raise G2GError("Patching failed: {0}".format(e))
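
# Hedged usage sketch: patch SNPs from a VCF into a reference Fasta for one
# strain. The file names and strain name below are illustrative only:
#
#     fasta_patch("reference.fa", "strains.vcf.gz", "CAST_EiJ",
#                 "CAST_EiJ.patched.fa", bgzip=True, pass_only=True)
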
def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies. These
    coordinates are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param file_in: the input SAM or BAM file
    :type file_in: string
    :param file_out: the output SAM or BAM file
    :type file_out: string
    :param reverse: reverse direction of the original chain file
    :type reverse: boolean
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    if not isinstance(file_in, pysam.Samfile):
        try:
            sam_file = pysam.Samfile(file_in, 'rb')
            if len(sam_file.header) == 0:
                raise G2GBAMError("BAM File has no header information")
        except Exception:
            # fall back to SAM
            sam_file = pysam.Samfile(file_in, 'r')
            if len(sam_file.header) == 0:
                raise G2GBAMError("SAM File has no header information")

    LOG.info("Converting BAM file")

    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': '1.0', 'SO': 'coordinate'}

    # replace 'SQ' with the sequences of the target assembly
    tmp = []
    name_to_id = {}
    ref_id = 0

    for ref_name in sorted(chain_file.chrom_size_to):
        tmp.append({'LN': chain_file.chrom_size_to[ref_name], 'SN': ref_name})
        name_to_id[ref_name] = ref_id
        ref_id += 1

    new_header['SQ'] = tmp

    if 'PG' not in new_header:
        new_header['PG'] = []

    new_header['PG'].append({'ID': 'g2gtools', 'VN': '1.0'})

    if 'CO' not in new_header:
        new_header['CO'] = []

    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    _dir, temp_file_name = os.path.split(file_out)
    parts = temp_file_name.split('.')
    ext = parts[-1]

    if ext.lower() == 'bam':
        new_file = pysam.Samfile(file_out, 'wb', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wb', template=sam_file)
    elif ext.lower() == 'sam':
        new_file = pysam.Samfile(file_out, 'wh', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name, 'wh', template=sam_file)
    else:
        raise G2GBAMError("Unable to create new file based upon file extension")

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {'total': 0,
                      'fail_cannot_map': 0,
                      'success_simple': 0,
                      'success_complex': 0}

    map_statistics_pair = {'total': 0,
                           'fail_cannot_map': 0,
                           'success_1_fail_2_simple': 0,
                           'success_1_fail_2_complex': 0,
                           'success_1_simple_2_fail': 0,
                           'success_1_simple_2_simple': 0,
                           'success_1_simple_2_complex': 0,
                           'success_1_complex_2_fail': 0,
                           'success_1_complex_2_simple': 0,
                           'success_1_complex_2_complex': 0}

    try:
        while True:
            if total and total % 10000 == 0:
                status_success = 0
                status_failed = 0

                for k, v in map_statistics_pair.iteritems():
                    if k.startswith('success'):
                        status_success += v
                    elif k.startswith('fail'):
                        status_failed += v

                LOG.info("Processed {0:,} reads, {1:,} successful, {2:,} failed".format(total, status_success, status_failed))
            alignment = sam_file.next()
            alignment_new = pysam.AlignedRead()
            read_chr = sam_file.getrname(alignment.tid)

            # pysam.AlignedRead reference
            #
            # READ ONLY
            #     aend       aligned reference position of the read on the reference genome
            #     alen       aligned length of the read on the reference genome
            #     positions  a list of reference positions that this read aligns to
            #     qend       end index of the aligned query portion of the sequence (0-based, exclusive)
            #     qlen       length of the aligned query sequence
            #     qqual      aligned query sequence quality values
            #     qstart     start index of the aligned query portion of the sequence (0-based, inclusive)
            #     query      aligned portion of the read, excluding any flanking bases that were soft clipped
            #     rlen       length of the read
            #
            # TRUE / FALSE (setting affects the flag)
            #     is_paired         true if read is paired in sequencing
            #     is_proper_pair    true if read is mapped in a proper pair
            #     is_qcfail         true if QC failure
            #     is_read1          true if this is read1
            #     is_read2          true if this is read2
            #     is_reverse        true if read is mapped to reverse strand
            #     is_secondary      true if not primary alignment
            #     is_unmapped       true if read itself is unmapped
            #     mate_is_reverse   true if mate is mapped to reverse strand
            #     mate_is_unmapped  true if the mate is unmapped
            #
            # SET
            #     cigar        cigar as list of tuples
            #     cigarstring  alignment as a string
            #     flag         properties flag
            #     mapq         mapping quality
            #     pos          0-based leftmost coordinate
            #     pnext        the position of the mate
            #     qname        the query name
            #     rnext        the reference id of the mate
            #     seq          read sequence bases, including soft clipped bases
            #     tid          target id, the index of the reference sequence in the sequence dictionary
            #
            # DON'T NEED TO SET (or should we?)
            #     qual  read sequence base qualities, including soft clipped bases
            #     tags  the tags in the AUX field
            #     tlen  insert size

            total += 1

            LOG.debug('~' * 80)
            LOG.debug("Converting {0} {1} {2} {3}".format(alignment.qname, read_chr, alignment.pos, alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
                continue

            if alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
                continue

            if not alignment.is_paired:
                LOG.debug("SINGLE END ALIGNMENT")
                map_statistics['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_NONE
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                read_start = alignment.pos
                read_end = alignment.aend
                read_strand = '-' if alignment.is_reverse else '+'

                mappings = chain_file.find_mappings(read_chr, read_start, read_end)

                # unmapped
                if mappings is None:
                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics['fail_cannot_map'] += 1

                elif len(mappings) == 1:
                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_simple'] += 1

                else:
                    LOG.debug("MAPPINGS: {0}".format(len(mappings)))
                    for m in mappings:
                        LOG.debug("> {0}".format(m))

                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read_strand, alignment.pos)
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_complex'] += 1

            else:
                LOG.debug("PAIRED END ALIGNMENT")
                map_statistics_pair['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_PAIRED
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                if alignment.is_read1:
                    alignment_new.flag |= FLAG_READ1

                if alignment.is_read2:
                    alignment_new.flag |= FLAG_READ2

                if alignment.is_reverse:
                    alignment_new.flag |= FLAG_REVERSE

                if alignment.mate_is_reverse:
                    alignment_new.flag |= FLAG_MREVERSE

                read1_chr = sam_file.getrname(alignment.tid)
                read1_start = alignment.pos
                read1_end = alignment.aend
                read1_strand = '-' if alignment.is_reverse else '+'
                read1_mappings = chain_file.find_mappings(read1_chr, read1_start, read1_end)  # , read1_strand)

                read2_chr = None
                read2_start = None
                read2_end = None
                read2_strand = None
                read2_mappings = None

                if alignment.mate_is_unmapped:
                    alignment_new.flag |= FLAG_MUNMAP
                else:
                    read2_chr = sam_file.getrname(alignment.rnext)
                    read2_start = alignment.pnext
                    read2_end = read2_start + 1
                    read2_strand = '-' if alignment.mate_is_reverse else '+'

                    try:
                        read2_mappings = chain_file.find_mappings(read2_chr, read2_start, read2_end)
                    except Exception:
                        read2_mappings = None

                if read1_mappings is None and read2_mappings is None:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.flag |= FLAG_MUNMAP

                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics_pair['fail_cannot_map'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_simple'] += 1

                elif read1_mappings is None and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug("\tPair Success (1:fail,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings is None:
                    alignment_new.flag |= FLAG_MUNMAP
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) == 1 and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:simple,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_complex'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings is None:
                    alignment_new.flag |= FLAG_MUNMAP
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:fail): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_fail'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) == 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:simple): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_simple'] += 1

                elif read1_mappings and len(read1_mappings) > 1 and read2_mappings and len(read2_mappings) > 1:
                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(alignment.cigar, read_chr, chain_file, alignment.seq, read1_strand, alignment.pos)
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug("\tPair Success (1:complex,2:complex): {0} {1}".format(alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_complex'] += 1

                else:
                    raise G2GBAMError("Unknown BAM/SAM conversion/parse situation")

    except StopIteration:
        LOG.info("All reads processed")

    LOG.info(" {:>10} TOTAL ENTRIES".format(total))
    LOG.info(" {:>10} TOTAL UNMAPPED".format(total_unmapped))
    LOG.info(" {:>10} TOTAL FAIL QC".format(total_fail_qc))

    if map_statistics['total'] > 0:
        LOG.info("")
        LOG.info("Mapping Summary Single End")
        LOG.info("   {:>10} TOTAL ENTRIES".format(map_statistics['total']))
        LOG.info("")
        LOG.info("   {:>10} TOTAL SUCCESS".format(map_statistics['success_simple'] + map_statistics['success_complex']))
        LOG.info("   {:>10} Simple".format(map_statistics['success_simple']))
        LOG.info("   {:>10} Complex".format(map_statistics['success_complex']))
        LOG.info("")
        LOG.info("   {:>10} TOTAL FAILURES".format(map_statistics['fail_cannot_map']))
        LOG.info("   {:>10} Cannot Map".format(map_statistics['fail_cannot_map']))

    if map_statistics_pair['total'] > 0:
        total_success = 0
        for k, v in map_statistics_pair.iteritems():
            if k.startswith('success'):
                total_success += v

        LOG.info("")
        LOG.info("Mapping Summary Paired End")
        LOG.info("   {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
        LOG.info("")
        LOG.info("   {:>10} TOTAL SUCCESS".format(total_success))
        LOG.info("   {:>10} Read 1 Failed, Read 2 Simple".format(map_statistics_pair['success_1_fail_2_simple']))
        LOG.info("   {:>10} Read 1 Failed, Read 2 Complex".format(map_statistics_pair['success_1_fail_2_complex']))
        LOG.info("   {:>10} Read 1 Simple, Read 2 Failed".format(map_statistics_pair['success_1_simple_2_fail']))
        LOG.info("   {:>10} Read 1 Simple, Read 2 Simple".format(map_statistics_pair['success_1_simple_2_simple']))
        LOG.info("   {:>10} Read 1 Simple, Read 2 Complex".format(map_statistics_pair['success_1_simple_2_complex']))
        LOG.info("   {:>10} Read 1 Complex, Read 2 Failed".format(map_statistics_pair['success_1_complex_2_fail']))
        LOG.info("   {:>10} Read 1 Complex, Read 2 Simple".format(map_statistics_pair['success_1_complex_2_simple']))
        LOG.info("   {:>10} Read 1 Complex, Read 2 Complex".format(map_statistics_pair['success_1_complex_2_complex']))
        LOG.info("")
        LOG.info("   {:>10} TOTAL FAILURES".format(map_statistics_pair['fail_cannot_map']))
        LOG.info("   {:>10} Cannot Map".format(map_statistics_pair['fail_cannot_map']))

    LOG.info("")
    LOG.info("BAM File Converted")
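
# Hedged usage sketch: lift a BAM file onto new coordinates with a chain
# file. File names are hypothetical; reverse=True would apply the chain in
# the opposite direction:
#
#     convert_bam_file("REF-to-CAST_EiJ.chain", "sample.bam",
#                      "sample.cast.bam")
#
# Reads that cannot be lifted are written to "sample.cast.bam.unmapped".
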