def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Transform a Fasta file into a new coordinate space described by a chain file.

    :param fasta_file: input Fasta file name, or an already-open ``FastaFile``
    :param chain_file: chain file name, or an already-open ``ChainIter``
    :param locations: list of ``Location`` objects (or parseable location
        strings) to transform; falsy means every reference in the Fasta file
    :param output_file: name of the output Fasta file
    :param bgzip: compress the output in BGZIP format and index it
    :param reverse: reverse the direction of the chain file
    :return: Nothing
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    # NOTE(review): seq_ids is collected but never read in this function
    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    # the uncompressed file we actually write; when bgzip is requested the
    # final (".gz") name differs from the temporary plain-text name
    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            if len(line) > 7:
                # chain header line: record this chromosome's coordinate info
                LOG.debug("Adding chromosome {0}".format(chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {
                    'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                    'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                    'header_chain': chain_file.current_chain_header, 'lines': []}
            else:
                # data line: belongs to the most recently seen header
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_to = chr_info[location.seqid]['to_size']
            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line of the chain: copy the trailing fragment,
                    # then emit the whole transformed sequence
                    fragment = chain_line[0]
                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')
                    break
                else:
                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    # (dt/dq and their bases swap roles when reverse=True)
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion: splice the inserted bases into the output
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))
                        insertion_bases += dq

                    if dt > 0:
                        # deletion: skip dt bases of the source sequence
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))
                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # flush pending writes before compressing/indexing; the original
        # never closed fasta_out, so bgzip could see a truncated file
        fasta_out.close()

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')
    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
    finally:
        # make sure the handle is released on the error path too
        if not fasta_out.closed:
            fasta_out.close()
def fasta_patch(filename_fasta, filename_vcf, strain, filename_output, bgzip=False, num_processes=None, pass_only=False, quality=False, diploid=False):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in the VCF file.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of strain to use in VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: Only process those VCF records with a 'PASS'
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: don't ignore hets and create 2 files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))
    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(filename_fasta, filename_vcf, strain, filename_output_l, filename_output_r, num_processes, pass_only, quality, diploid)
        LOG.info("Patching complete")

        # remove the fai; the original performed the deletion *inside* the
        # LOG.debug format call (logging its return value) and then deleted
        # again — log the filename and delete once
        LOG.debug("removing the FAI index for {0}".format(filename_output_l))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), "fa")
            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), "fa")

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        # propagate with the underlying cause instead of an empty message
        raise G2GError(str(e))
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables and stage input/output files.

    Copies (and, for '.fa.gz' input, decompresses) the input Fasta into the
    output location(s) and builds a fasta index for the left output.

    :param filename_fasta: input Fasta file name ('.fa' or '.fa.gz')
    :param filename_output: desired output Fasta file name ('.fa' or '.fa.gz')
    :param bgzip: output will later be BGZIP compressed
    :param diploid: create separate left ('l') and right ('r') output files
    :return: tuple (filename_output_l, filename_output_r); the right name is
        ``None`` unless ``diploid`` is True
    """
    filename_output = g2g_fu.check_file(filename_output, "w")
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith(".gz"):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # validate the *uncompressed* name; the original checked filename_output,
    # which still ends in '.gz', so every valid '.fa.gz' output was rejected
    if not new_filename_output.lower().endswith(".fa"):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, "l")
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, "r")
        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None
        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension
    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith(".fa.gz"):
        # decompress the fasta file if it is compressed
        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)          # something.fa.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]                                # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)
    elif filename_fasta.lower().endswith(".fa"):
        LOG.debug("File is not compressed")
        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index (opening with pysam writes the .fai
    # as a side effect; the object itself is discarded)
    pysam.FastaFile(filename_output_l)

    return filename_output_l, filename_output_r
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Transform a Fasta file into a new coordinate space described by a chain file.

    NOTE(review): this is a duplicate definition — an earlier ``fasta_transform``
    in this file is shadowed by this one; the duplicates should be consolidated.

    :param fasta_file: input Fasta file name, or an already-open ``FastaFile``
    :param chain_file: chain file name, or an already-open ``ChainIter``
    :param locations: list of ``Location`` objects (or parseable location
        strings) to transform; falsy means every reference in the Fasta file
    :param output_file: name of the output Fasta file
    :param bgzip: compress the output in BGZIP format and index it
    :param reverse: reverse the direction of the chain file
    :return: Nothing
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    # NOTE(review): seq_ids is collected but never read in this function
    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [
            parse_location(
                "{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1)
            for a in fasta.references
        ]
        seq_ids = [a for a in fasta.references]

    # the uncompressed file we actually write; when bgzip is requested the
    # final (".gz") name differs from the temporary plain-text name
    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            if len(line) > 7:
                # chain header line: record this chromosome's coordinate info
                LOG.debug("Adding chromosome {0}".format(
                    chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {
                    'from_size': line[2],
                    'from_start': line[4],
                    'from_end': line[5],
                    'to_size': line[7],
                    'to_start': line[9],
                    'to_end': line[10],
                    'header_chain': chain_file.current_chain_header,
                    'lines': []
                }
            else:
                # data line: belongs to the most recently seen header
                chr_info[chain_file.current_chain_header[1]]['lines'].append(
                    line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_to = chr_info[location.seqid]['to_size']
            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no,
                                                     chain_line))

                if len(chain_line) == 1:
                    # last line of the chain: copy the trailing fragment,
                    # then emit the whole transformed sequence
                    fragment = chain_line[0]
                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn(
                            "Length's do not match, chromosome length in chain: {0}, sequence length: {1}"
                            .format(chrom_size_to,
                                    len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                        location.seqid, location.seqid,
                        chr_info[location.seqid]['from_start'] + 1,
                        chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')
                    break
                else:
                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    # (dt/dq and their bases swap roles when reverse=True)
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos,
                                              last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion: splice the inserted bases into the output
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(
                                partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))
                        insertion_bases += dq

                    if dt > 0:
                        # deletion: skip dt bases of the source sequence
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))
                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug(
                        "LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}"
                        .format(last_pos, insertion_bases, deletion_bases,
                                (insertion_bases - deletion_bases)))

        # flush pending writes before compressing/indexing; the original
        # never closed fasta_out, so bgzip could see a truncated file
        fasta_out.close()

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')
    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
    finally:
        # make sure the handle is released on the error path too
        if not fasta_out.closed:
            fasta_out.close()
def fasta_patch(filename_fasta, filename_vcf, strain, filename_output, bgzip=False, num_processes=None, pass_only=False, quality=False, diploid=False):
    """
    Patch a Fasta file by replacing the bases where the SNPs are located in the VCF file.

    NOTE(review): this is a duplicate definition — an earlier ``fasta_patch``
    in this file is shadowed by this one; the duplicates should be consolidated.

    :param filename_fasta: name of the input Fasta file
    :type filename_fasta: string
    :param filename_vcf: name of the VCF file
    :type filename_vcf: string
    :param strain: name of strain to use in VCF file
    :type strain: string
    :param filename_output: name of the output Fasta file
    :type filename_output: string
    :param bgzip: compress file in BGZIP format
    :type bgzip: boolean
    :param num_processes: the number of processes to spawn
    :type num_processes: int
    :param pass_only: Only process those VCF records with a 'PASS'
    :type pass_only: boolean
    :param quality: filter on quality, FI=PASS
    :type quality: boolean
    :param diploid: don't ignore hets and create 2 files
    :type diploid: boolean
    :return: Nothing
    """
    start = time.time()

    filename_fasta = g2g_fu.check_file(filename_fasta)
    filename_vcf = g2g_fu.check_file(filename_vcf)

    LOG.info("INPUT FASTA FILE: {0}".format(filename_fasta))
    LOG.info("VCF FILE: {0}".format(filename_vcf))
    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(pass_only)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not strain:
        raise G2GValueError("No strain was specified.")

    filename_output_l, filename_output_r = prepare_fasta_patch(filename_fasta, filename_output, bgzip, diploid)

    if not num_processes:
        num_processes = multiprocessing.cpu_count()
    elif num_processes <= 0:
        num_processes = 1

    LOG.info("NUMBER OF PROCESSES: {0}".format(num_processes))
    if bgzip:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}.gz".format(filename_output_l))
            LOG.info("                    {0}.gz".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}.gz".format(filename_output_l))
    else:
        if diploid:
            LOG.info("OUTPUT FASTA FILES: {0}".format(filename_output_l))
            LOG.info("                    {0}".format(filename_output_r))
        else:
            LOG.info("OUTPUT FASTA FILE: {0}".format(filename_output_l))

    LOG.info("Patching...")

    try:
        patch(filename_fasta, filename_vcf, strain, filename_output_l,
              filename_output_r, num_processes, pass_only, quality, diploid)
        LOG.info("Patching complete")

        # remove the fai; the original performed the deletion *inside* the
        # LOG.debug format call (logging its return value) and then deleted
        # again — log the filename and delete once
        LOG.debug("removing the FAI index for {0}".format(filename_output_l))
        g2g_fu.delete_index_files(filename_output_l)

        # move temp to final destination
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(filename_output_l, "{0}.gz".format(filename_output_l), 'fa')
            if diploid:
                g2g_fu.bgzip_index(filename_output_r, "{0}.gz".format(filename_output_r), 'fa')

        LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
    except Exception as e:
        LOG.debug(e)
        # propagate with the underlying cause instead of an empty message
        raise G2GError(str(e))
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables and stage input/output files.

    Copies (and, for '.fa.gz' input, decompresses) the input Fasta into the
    output location(s) and builds a fasta index for the left output.

    NOTE(review): this is a duplicate definition — an earlier
    ``prepare_fasta_patch`` in this file is shadowed by this one; the
    duplicates should be consolidated.

    :param filename_fasta: input Fasta file name ('.fa' or '.fa.gz')
    :param filename_output: desired output Fasta file name ('.fa' or '.fa.gz')
    :param bgzip: output will later be BGZIP compressed
    :param diploid: create separate left ('l') and right ('r') output files
    :return: tuple (filename_output_l, filename_output_r); the right name is
        ``None`` unless ``diploid`` is True
    """
    filename_output = g2g_fu.check_file(filename_output, 'w')
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith('.gz'):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # validate the *uncompressed* name; the original checked filename_output,
    # which still ends in '.gz', so every valid '.fa.gz' output was rejected
    if not new_filename_output.lower().endswith('.fa'):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, 'l')
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, 'r')
        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None
        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension
    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith('.fa.gz'):
        # decompress the fasta file if it is compressed
        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)          # something.fa.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]                                # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)
    elif filename_fasta.lower().endswith('.fa'):
        LOG.debug("File is not compressed")
        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index (opening with pysam writes the .fai
    # as a side effect; the object itself is discarded)
    pysam.FastaFile(filename_output_l)

    return filename_output_l, filename_output_r