def graph(self, current_kmer, next_kmer):
    options = {
        'k': next_kmer,
        'host_mem': self.available_memory,
        'mem_flag': 1,
        'output_prefix': self._graph_prefix(next_kmer),
        'num_cpu_threads': self.threads,
        'need_mercy': not self.no_mercy and current_kmer == self.kmin,
        'kmer_from': current_kmer,
        'useconv': False
    }

    if current_kmer == 0:  # Indicating it's the first graph
        if not self.one_pass:
            logger.log(2, f"Extracting solid (k+1)-mers for k={next_kmer}")
            count_opts = options.copy()
            count_opts['m'] = self.min_multi
            count_opts['read_lib_file'] = self.read_lib
            count_opts.pop('need_mercy')
            count_opts.pop('kmer_from')
            logger.log(0, f"Extract options : {count_opts}")
            shell_call(self.MEGAHIT_CORE, 'count', **count_opts)

    file_size = 0

    if path.exists(self._graph_prefix(next_kmer) + '.edges.0'):
        options['input_prefix'] = self._graph_prefix(next_kmer)
        file_size += path.getsize(self._graph_prefix(next_kmer) + '.edges.0')

    if path.exists(self._contig_prefix(current_kmer) + '.addi.fa'):
        options['addi_contig'] = self._contig_prefix(current_kmer) + '.addi.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.addi.fa')

    if path.exists(self._contig_prefix(current_kmer) + '.local.fa'):
        options['local_contig'] = self._contig_prefix(current_kmer) + '.local.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.local.fa')

    if path.exists(self._contig_prefix(current_kmer) + '.contigs.fa'):
        options['contig'] = self._contig_prefix(current_kmer) + '.contigs.fa'
        options['bubble'] = self._contig_prefix(current_kmer) + '.bubble_seq.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.contigs.fa')

    if file_size == 0 and current_kmer != 0:
        raise EmptyGraph

    logger.log(2, f'Building graph for k={next_kmer}')
    logger.log(0, f'Build options : {options}')
    shell_call(self.MEGAHIT_CORE, 'seq2sdbg', **options)

    if file_size != 0 and current_kmer != 0 and not self.keep_temp:
        os.system(f"rm -r {path.join(self.temp_dir, f'k{current_kmer}')}")

def findmitoscaf(args):

    if args.__calling == 'findmitoscaf':
        if not args.from_megahit:
            logger.log(2, 'Remapping reads to contigs since contigs are not assembled from the pipeline.')
            fastfilter_bin = path.abspath(path.join(path.dirname(__file__), 'assemble', 'fastfilter'))
            filtered_fasta = path.join(args.findmitoscaf_dir, f'{args.workname}.filtered.fa')
            shell_call(fastfilter_bin, i=args.fastafile, o=filtered_fasta,
                       l=f"{configurations.assemble.min_length},{configurations.assemble.max_length}",
                       d=0)

            fq1, fq2 = args.fastq1, args.fastq2
            if not (fq1 or fq2):
                raise RuntimeError("At least one fastq file should be specified!")
            if not fq1:
                fq1, fq2 = fq2, fq1

            # Remapping to calculate average depth.
            from findmitoscaf.findmitoscaf import remap_sequence
            args.fastafile = remap_sequence(args.workname, args.findmitoscaf_dir,
                                            filtered_fasta, fq1, fq2, args.threads)
        else:
            logger.log(2, "Remapping skipped since from-megahit is specified, no tagging needed.")

    from findmitoscaf.findmitoscaf import findmitoscaf as _findmitoscaf
    picked_fa = _findmitoscaf(
        thread_number=args.threads,
        clade=args.clade,
        relaxing=args.taxa_tolerance,
        gene_code=args.genetic_code,
        multi=args.min_abundance,
        taxa=args.required_taxa if not args.disable_taxa else None,
        prefix=args.workname,
        basedir=args.findmitoscaf_dir,
        contigs_file=args.fastafile,
        merge_method=args.merge_method,
        merge_overlapping=args.merge_overlap,
        merge_search=args.merge_start)

    # Further processing when called directly
    if args.__calling == 'findmitoscaf':
        os.rename(picked_fa, path.join(args.result_dir, path.basename(picked_fa)))

    return picked_fa

def filter_pe(fq1=None, fq2=None, o1=None, o2=None,
              dedup=False, start=None, end=None,
              n=10, q=55, l=0.2, trim=0, trunc=False):

    fsin1, fsin2 = path.getsize(fq1), path.getsize(fq2)
    logger.log(level=1, info='Start filtering paired-end rawdata.')
    logger.log(level=0, info=f'Input file 1 has {fsin1} bytes, 2 has {fsin2} bytes.')
    if fsin1 != fsin2:
        logger.log(level=3,
                   info='Input files 1 and 2 have different sizes! This could cause loss of rawdata, or even crash the program.')
    logger.log(level=1,
               info=f'Using arguments : Ns={n}, quality={q}, start={start}, end={end}, limit={l}, trimming={trim}')

    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   _1=f'"{fq1}"', _2=f'"{fq2}"', _3=f'"{o1}"', _4=f'"{o2}"',
                   d=dedup, s=start, e=end,
                   n=n, q=q, l=l, t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(level=4, info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input files : {fq1} , {fq2}')
        logger.log(level=1, info=f'Output files : {o1} , {o2}')
        sys.exit("Error occurred when running filter!")

    fsot1 = path.getsize(o1)
    logger.log(level=0, info=f'Output file has {fsot1} bytes.')
    logger.log(level=1, info=f'Filtered {fsin1 - fsot1} bytes, ratio {100*fsot1/fsin1:.2f}%.')

    return o1, o2

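# --- Usage sketch (illustrative only; the file names below are hypothetical) ---
# A minimal example of how filter_pe might be called on a pair of raw fastq
# files, using the default thresholds from the signature above. This helper is
# not part of the original module.
def _example_filter_pe():
    clean1, clean2 = filter_pe(
        fq1='sample_R1.fastq', fq2='sample_R2.fastq',
        o1='sample_R1.clean.fastq', o2='sample_R2.clean.fastq',
        dedup=True,                 # drop duplicated read pairs
        n=10, q=55, l=0.2, trim=0)  # filter_v2 thresholds as logged above
    return clean1, clean2
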
def remap_sequence(prefix=None, basedir=None, fasta_file=None, fastq1=None, fastq2=None, threads=8):
    # Remap reads back onto the fasta file.
    # This can be a non-trivial task, so a portion of the threads is
    # given to samtools view and samtools sort.
    logger.log(2, "Mapping fastq reads back onto fasta file.")
    shell_call('bwa index', fasta_file)

    bam_file = path.join(basedir, f'{prefix}.bam')
    check_output(
        f'bwa mem -t {max(1, int(threads*0.75))} {fasta_file} {fastq1} {fastq2 if fastq2 is not None else ""} '
        f'|samtools view -bS -q 30 -h -@ {max(1, int(threads*0.25))} -o {bam_file} -',
        shell=True)

    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}', shell=True)

    logger.log(2, "Calculating average depth for each sequence.")
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    avgdep_bin = path.join(path.abspath(path.dirname(__file__)), 'avgdep_bin')
    check_output(
        f'samtools depth -aa {bam_sorted_file} |{avgdep_bin} -o {gene_depth_file}',
        shell=True)
    mapping = {k: v for k, v in map(str.split, open(gene_depth_file))}

    logger.log(2, "Retagging sequences for later processing.")
    sequences = []
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        seq.description = f"flag=1 multi={mapping[seq.id]}"
        sequences.append(seq)
    SeqIO.write(sequences, path.join(basedir, path.basename(fasta_file)), 'fasta')

    return fasta_file

def local(self, current_kmer, next_kmer):
    logger.log(2, f'Local assembly for k = {current_kmer}')
    shell_call(self.MEGAHIT_CORE, 'local',
               c=self._contig_prefix(current_kmer) + '.contigs.fa',
               l=self.read_lib,
               t=self.threads,
               o=self._contig_prefix(current_kmer) + '.local.fa',
               kmax=next_kmer)

def assemble(self, kmer) -> Tuple[ContigInfo, ContigInfo]:
    min_standalone = max(
        min(self.kmax * 3 - 1, int(self.min_length * 1.5)),
        self.min_length)

    options = {
        's': self._graph_prefix(kmer),
        'o': self._contig_prefix(kmer),
        't': self.threads,
        'min_standalone': min_standalone,
        'prune_level': self.prune_level,
        'merge_len': 20,
        'merge_similar': 0.95,
        'cleaning_rounds': 5,
        'disconnect_ratio': 0.1,
        'low_local_ratio': 0.2,
        'min_depth': self.prune_depth,
        'bubble_level': 2,
        'max_tip_len': max(1, self.min_length * 1.5 + 1 - kmer) if kmer * 3 - 1 > self.min_length * 1.5 else -1,
        'careful_bubble': kmer < self.kmax,
        'is_final_round': kmer == self.kmax,
        'output_standalone': self.no_local,
        'useconv': False
    }

    logger.log(2, f'Assembling contigs from SdBG for k = {kmer}')
    logger.log(0, f'Assemble arguments : {options}')
    shell_call(self.MEGAHIT_CORE, 'assemble', **options)

    with open(self._contig_prefix(kmer) + '.contigs.fa.info', 'r') as c, \
            open(self._contig_prefix(kmer) + '.addi.fa.info', 'r') as a:
        return ContigInfo(c), ContigInfo(a)

def iterate(self, current_kmer, next_kmer):
    logger.log(2, f'Extracting iterative edges from k = {current_kmer} to {next_kmer}')
    shell_call(self.MEGAHIT_CORE, 'iterate',
               c=self._contig_prefix(current_kmer) + '.contigs.fa',
               b=self._contig_prefix(current_kmer) + '.bubble_seq.fa',
               t=self.threads,
               s=next_kmer - current_kmer,
               o=self._graph_prefix(next_kmer),
               r=self.read_lib + '.bin',
               k=current_kmer)

def finalize(self, kmer):
    self.final_contig = path.join(self.result_dir, f'k{kmer}.contig.fa')
    shell_call('cat',
               path.join(self.contig_dir, '*.final.contigs.fa'),
               self._contig_prefix(kmer) + '.contigs.fa',
               '>', self.final_contig)

    if not self.keep_temp:
        to_remove = self.temp_dir
        if path.isdir(str(a_conf.external_temp)):
            to_remove = path.join(to_remove, "..")
        to_remove = path.abspath(to_remove)
        os.system(f'rm -r {to_remove}')

def initialize(self):
    self.basedir = path.abspath(self.basedir)
    self.fq1 = path.abspath(self.fq1)
    if self.fq2:
        self.fq2 = path.abspath(self.fq2)

    # Check if the POPCNT instruction is supported
    if self.use_popcnt:
        if shell_call('megahit_core checkpopcnt').rstrip() != '1':
            self.use_popcnt = False
            logger.log(3, "POPCNT is disabled since no features detected.")
        else:
            self.hwaccel = shell_call("megahit_core checkcpu").rstrip() == '1'
            logger.log(2, f"Using megahit with {'hardware acceleration' if self.hwaccel else 'POPCNT'} support.")
    else:
        logger.log(2, "POPCNT disabled by argument.")

    if self.one_pass:
        logger.log(3, "Using 1-pass mode.")

    self.result_dir = safe_makedirs(
        path.join(self.basedir, f'{self.prefix}.result'), False)

    if not path.isdir(str(a_conf.external_temp)):
        self.temp_dir = safe_makedirs(
            path.join(self.basedir, f'{self.prefix}.temp'), False)
    else:
        self.temp_dir = safe_makedirs(
            path.join(a_conf.external_temp, str(uuid.uuid4()), f'{self.prefix}.temp'), False)

    self.read_lib = path.join(self.temp_dir, 'reads.lib')
    self.contig_dir = safe_makedirs(
        path.join(self.temp_dir, 'intermediate_contigs'), False)

    vm = psutil.virtual_memory()
    logger.log(
        1,
        f"System memory status : {', '.join([f'{k}={v/(1024**2):.2f}MB' for k, v in vm._asdict().items() if type(v) is int])}")
    self.available_memory = int(vm.available * a_conf.max_mem_percent)
    logger.log(2, f'Scheduled {self.available_memory/(1024**2):.2f}MB to use.')

def build_lib(self):

    # Write reads info
    with open(self.read_lib, 'w') as l:
        fifos = []
        if self.fq1 and self.fq2:
            print(self.fq1, self.fq2, sep=',', file=l)

            # Gzipped inputs are decompressed through named pipes.
            fq1, fq2 = (self.fq1 if not self.fq1.endswith('gz') else path.join(self.temp_dir, 'pipe.pe1'),
                        self.fq2 if not self.fq2.endswith('gz') else path.join(self.temp_dir, 'pipe.pe2'))

            if self.fq1.endswith('gz'):
                fifo1 = path.join(self.temp_dir, 'pipe.pe1')
                os.mkfifo(fifo1)
                fifos.append(
                    subprocess.Popen(f'gzip -dc {self.fq1} > {fifo1}',
                                     shell=True, preexec_fn=os.setsid))

            if self.fq2.endswith('gz'):
                fifo2 = path.join(self.temp_dir, 'pipe.pe2')
                os.mkfifo(fifo2)
                fifos.append(
                    subprocess.Popen(f'gzip -dc {self.fq2} > {fifo2}',
                                     shell=True, preexec_fn=os.setsid))

            print('pe', fq1, fq2, file=l)
        else:
            print(self.fq1, file=l)
            fq1 = self.fq1 if not self.fq1.endswith('gz') else path.join(self.temp_dir, 'pipe.se')
            print('se', fq1, file=l)

    logger.log(1, "Converting reads to binary library.")
    shell_call(self.MEGAHIT_CORE, 'buildlib', self.read_lib, self.read_lib)

    if False in (x.wait() == 0 for x in fifos):
        raise RuntimeError("Error occurred while reading input fifos")

    with open(self.read_lib + '.lib_info') as ri:
        info = [x.split(' ') for x in ri.readlines()]
        return LibInfo(info)

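# --- Illustrative driver sketch (an assumption, not the pipeline's actual code) ---
# A rough outline of how the wrapper methods above could be chained for an
# iterative multi-k assembly. The `assembler` object and `kmer_list` are
# hypothetical stand-ins; the real ordering of local/iterate calls may differ.
def _run_megahit_sketch(assembler, kmer_list):
    assembler.initialize()            # resolve paths, temp dirs and memory budget
    assembler.build_lib()             # convert reads into the binary library
    previous_k = 0
    for i, k in enumerate(kmer_list):
        assembler.graph(previous_k, k)     # build the SdBG for this k
        assembler.assemble(k)              # emit contigs from the graph
        assembler.filter(kmer=k)           # optional length/depth filtering
        if i + 1 < len(kmer_list):
            next_k = kmer_list[i + 1]
            assembler.local(k, next_k)     # local assembly feeding the next k
            assembler.iterate(k, next_k)   # extract iterative edges for next k
        previous_k = k
    assembler.finalize(kmer_list[-1])  # concatenate final contigs, clean temp
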
def blastn_multi(dbfile=None, infile=None, basedir=None, prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)
    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    nucl_data_dir = path.join(basedir, "blastn_data")
    try:
        os.mkdir(nucl_data_dir)
    except FileExistsError:
        raise RuntimeError("Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling blastn.')
    file_names = [path.join(nucl_data_dir, f'dataset_{x}.fasta') for x in range(threads)]
    tasks = [f'blastn -evalue 1e-5 -outfmt 6 -db {infile} -query {dataset_path}'
             for dataset_path in file_names]

    seqs = [[] for _ in range(threads)]
    for i, seq in enumerate(SeqIO.parse(dbfile, 'fasta')):
        seqs[i % threads].append(seq)
    for i in range(threads):
        SeqIO.write(seqs[i], file_names[i], 'fasta')

    logger.log(1, 'Generating map for calling blastn.')
    pool = multiprocessing.Pool(processes=threads)
    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        pool.close()
        logger.log(1, "Waiting for all processes to finish.")
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')
    shell_call('rm -r', nucl_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')

    return out_blast

def tblastn_multi(dbfile=None, infile=None, genetic_code=9, basedir=None, prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)
    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    tasks = []
    protein_data_dir = path.join(basedir, 'tblastn_data')
    try:
        os.mkdir(protein_data_dir)
    except FileExistsError:
        raise RuntimeError(
            "Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling tblastn.')
    tblastn_db = np.array_split(list(SeqIO.parse(dbfile, 'fasta')), threads)
    for idx, data in enumerate(tblastn_db):
        if data.any():
            logger.log(0, f'Dataset {idx} has {len(data)} queries.')
            dataset_path = path.join(protein_data_dir, f'dataset_{idx}.fasta')
            SeqIO.write(data, dataset_path, 'fasta')
            tasks.append(
                f'tblastn -evalue 1e-5 -outfmt 6 -seg no -db_gencode {genetic_code} '
                f'-db {infile} -query {dataset_path}')

    logger.log(1, 'Generating map for calling tblastn.')
    pool = multiprocessing.Pool(processes=threads)
    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        logger.log(1, 'Waiting for all processes to finish.')
        pool.close()
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')
    shell_call('rm -r', protein_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')

    return out_blast

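# --- Downstream sketch (an assumption, not part of the original module) ---
# blastn_multi and tblastn_multi both emit standard BLAST tabular output
# (-outfmt 6), so the merged .blast file can be loaded with pandas using the
# default outfmt-6 column set. The helper below is illustrative only.
import pandas

BLAST6_COLUMNS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

def read_blast6(blast_file):
    # outfmt 6 is tab-separated with no header line
    return pandas.read_csv(blast_file, sep='\t', names=BLAST6_COLUMNS)
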
def scaf(self) -> str:
    if self.lib_file is None:
        raise RuntimeError("Lib was not built before scaffolding!")

    kmer = int(self.read_length / 2)
    prefix = path.join(self.basedir, f'k{kmer}')

    # Prepare
    logger.log(2, "Constructing graph for SOAPdenovo-127.")
    shell_call(soap_fusion, D=True, s=self.lib_file, p=self.threads,
               K=kmer, g=prefix, c=self.contigs)

    # Map
    logger.log(2, "Mapping sequences.")
    shell_call(soap_127, 'map', s=self.lib_file, p=self.threads, g=prefix)

    # Scaffold
    logger.log(2, "Scaffolding.")
    shell_call(soap_127, 'scaff', p=self.threads, g=prefix)

    # Convert
    logger.log(2, "Converting output scaffolds back.")
    scaf2mega(prefix + '.scafSeq',
              path.join(path.dirname(self.contigs), 'scaf.fa'),
              overlay=kmer)

    return path.join(path.dirname(self.contigs), 'scaf.fa')

def nhmmer_search(fasta_file=None, thread_number=None, nhmmer_profile=None,
                  prefix=None, basedir=None):
    logger.log(1, 'Calling nhmmer.')

    # Call nhmmer
    hmm_out = os.path.join(basedir, f'{prefix}.nhmmer.out')
    hmm_tbl = os.path.join(basedir, f'{prefix}.nhmmer.tblout')
    logger.log(1, f'Out file : o={hmm_out}, tbl={hmm_tbl}')
    shell_call('nhmmer', o=hmm_out, tblout=hmm_tbl,
               cpu=thread_number, appending=[nhmmer_profile, fasta_file])

    # Process data into a pandas-readable table
    hmm_tbl_pd = f'{hmm_tbl}.readable'
    with open(hmm_tbl, 'r') as fin, open(hmm_tbl_pd, 'w') as fout:
        for line in fin:
            stripped = line.strip()
            splitted = stripped.split()
            # Drop the free-text gene description that nhmmer appends
            print(' '.join(splitted[:15]), file=fout)

    # Read the table with pandas
    hmm_frame = pandas.read_csv(hmm_tbl_pd, comment='#', delimiter=' ',
                                names=[
                                    'target', 'accession1', 'query', 'accession2',
                                    'hmmfrom', 'hmm to', 'alifrom', 'alito',
                                    'envfrom', 'envto', 'sqlen', 'strand',
                                    'e', 'score', 'bias'
                                ])
    hmm_frame = hmm_frame.drop(columns=['accession1', 'accession2'])

    # Deduplicate multiple hits on the same gene of the same sequence
    hmm_frame = hmm_frame.drop_duplicates(subset=['target', 'query'], keep='first')
    hmm_frame.to_csv(f'{hmm_tbl}.dedup.csv', index=False)

    logger.log(1, f'HMM query has {len(hmm_frame.index)} results.')
    return hmm_frame

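# --- Optional post-processing sketch (an assumption, not the pipeline's code) ---
# drop_duplicates(keep='first') above relies on the order of hits in the nhmmer
# table. If an explicit "best-scoring hit per (target, query)" is wanted, the
# frame can be sorted by score before deduplication, as sketched here.
def best_hits(hmm_frame):
    return (hmm_frame
            .sort_values('score', ascending=False)
            .drop_duplicates(subset=['target', 'query'], keep='first'))
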
def filter_se(fqiabs=None, fqoabs=None, Ns=10, quality=55, limit=0.2,
              start=None, end=None, trim=0, trunc=False):

    fsin = path.getsize(fqiabs)
    logger.log(level=1, info='Start filtering single-end rawdata.')
    logger.log(level=0, info=f'Input file has {fsin} bytes.')
    logger.log(level=1,
               info=f'Using arguments : Ns={Ns}, quality={quality}, limit={limit}, start={start}, end={end}, trimming={trim}, trunc={trunc}')

    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   cleanq1=f'"{fqoabs}"', fastq1=f'"{fqiabs}"',
                   n=Ns, q=quality, l=limit, s=start, e=end,
                   t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(level=4, info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input file : {fqiabs}')
        logger.log(level=1, info=f'Output file : {fqoabs}')
        sys.exit("Error occurred when running filter!")

    fsot = path.getsize(fqoabs)
    logger.log(level=0, info=f'Output file has {fsot} bytes.')
    logger.log(level=0, info=f'Filtered {fsin - fsot} bytes, ratio {fsot/fsin}.')

    return fqoabs

def filter(self, kmer=None, min_depth=3, min_length=0, max_length=20000,
           force_filter=False, deny_number=a_conf.filter_keep) -> Tuple[int, int, int]:
    logger.log(2, f'Filtering output contig files of k = {kmer}')
    results = [0, 0, 0]

    if not a_conf.no_filter or force_filter:
        for idx, suffix in enumerate(['.contigs.fa', '.addi.fa', '.bubble_seq.fa']):
            if path.exists(self._contig_prefix(kmer) + suffix):
                results[idx] = int(
                    shell_call(self.FAST_FILTER,
                               i=self._contig_prefix(kmer) + suffix,
                               o=self._contig_prefix(kmer) + '.filtered' + suffix,
                               l=f"{min_length},{max_length}",
                               d=min_depth))
                if results[idx] <= deny_number and idx == 0:
                    # Too few main contigs survived the depth cutoff; refilter
                    # using the keep threshold instead of the depth cutoff.
                    results[idx] = int(
                        shell_call(self.FAST_FILTER,
                                   i=self._contig_prefix(kmer) + suffix,
                                   o=self._contig_prefix(kmer) + '.filtered' + suffix,
                                   l=f"{min_length},{max_length}",
                                   m=deny_number))
                shell_call('mv',
                           self._contig_prefix(kmer) + '.filtered' + suffix,
                           self._contig_prefix(kmer) + suffix)

    return tuple(results)

def merge_sequences(fasta_file=None, overlapped_len=50, search_range=5, threads=8, index=0):
    # Merge sequences that may overlap with each other.
    logger.log(1, "Trying to merge candidates that are possibly overlapped.")
    fasta_file = path.abspath(fasta_file)

    if some(SeqIO.parse(fasta_file, 'fasta')):
        logger.log(1, "No sequences needed merging.")
        return 0

    while True:
        blast_results = tk.blastn_multi(fasta_file, fasta_file,
                                        path.dirname(fasta_file), 'merge',
                                        threads=threads)

        # Overlap conditions:
        # 1. Not aligning to itself
        # 2. One of the sequences can be stitched onto the other within a short range
        # 3. The aligned region is long enough
        # 4. After merging, the result is longer and not too much sequence is discarded
        logger.log(1, "Washing blast results.")
        libfastmathcal.wash_merge_blast(blast_results, fasta_file, search_range,
                                        overlapped_len, a_conf.max_length)

        logger.log(1, "Sorting outputs.")
        shell_call('sort -n -k12,12 -k3,3',
                   appending=[blast_results + ".filtered", ">", blast_results])

        logger.log(1, "Merging sequences.")
        new_index = libfastmathcal.merge_overlaps(blast_results, fasta_file,
                                                  fasta_file + '.merged', index)
        os.rename(fasta_file + '.merged', fasta_file)
        logger.log(1, f"Merged {new_index - index} sequences")

        if index == new_index:
            break
        index = new_index

    os.remove(blast_results)
    os.remove(blast_results + ".filtered")
    return index

def visualize(fasta_file=None, fastq1=None, fastq2=None, pos_json=None,
              prefix=None, basedir=None, threads=8, circular=False):
    logger.log(2, 'Entering visualize module.')

    # Validate the paths
    fasta_file = path.abspath(fasta_file)
    fastq1 = path.abspath(fastq1)
    if fastq2 is not None:
        fastq2 = path.abspath(fastq2)
    basedir = path.abspath(basedir)
    pos_json = path.abspath(pos_json)
    fa_copy = path.join(basedir, f'{prefix}.fasta')

    list_conv = []
    counter = 1

    # Rename sequences to an easier form
    index_list = {}
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        index_list[seq.id] = f'mt{counter}'
        seq.id_old = seq.id
        seq.id = f'mt{counter}'
        seq.description = ''
        list_conv.append(seq)
        counter += 1
    SeqIO.write(list_conv, fa_copy, 'fasta')

    with open(pos_json, 'r') as f:
        poses = json.load(f)

    # Gene name file
    logger.log(1, 'Generating gene name and feature files.')
    gene_name_file = path.join(basedir, f'{prefix}.gene.txt')
    with open(gene_name_file, 'w') as gn_f:
        for key, value in poses.items():
            start, end, gene_type, strand, _ = value
            strand_conv = index_list[strand]
            print(strand_conv, start, end,
                  key.split('_')[0] if '_' in key else key,
                  sep='\t', file=gn_f)

    # Gene feature file
    gene_feature_file = path.join(basedir, f'{prefix}.features.txt')
    with open(gene_feature_file, 'w') as gf_f:
        for key, value in poses.items():
            start, end, gene_type, strand, plus = value
            plus = plus == '+'
            r0 = 0.965 if plus else 1
            r1 = 1 if plus else 1.035
            strand_conv = index_list[strand]
            print(strand_conv, start, start,
                  f'fill_color=black,r0={r0}r,r1={r1}r', file=gf_f, sep='\t')
            print(strand_conv, start, end,
                  f'fill_color={circos_config.fill_colors[int(gene_type)]},r0={r0}r,r1={r1}r',
                  file=gf_f, sep='\t')
            print(strand_conv, end, end,
                  f'fill_color=black,r0={r0}r,r1={r1}r', file=gf_f, sep='\t')

    logger.log(1, 'Generating depth files.')
    # Using check_output directly here instead of the shell_call wrapper
    from subprocess import check_output
    shell_call('bwa index', fa_copy)
    bam_file = path.join(basedir, f'{prefix}.bam')
    mem_count = max(int(threads * 0.8), 1)
    view_count = max(threads - mem_count, 1)
    check_output(
        f'bwa mem -t {mem_count} {fa_copy} {fastq1} {fastq2 if fastq2 is not None else ""} '
        f'|samtools view -bS -@ {view_count} -q 30 -h -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}', shell=True)
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    check_output(f'samtools depth -aa {bam_sorted_file} > {gene_depth_file}', shell=True)

    # Convert depth to a circos track and record the maximum depth
    circos_depth_file = path.join(basedir, f'{prefix}.depth.txt')
    max_gene_depth = 0
    with open(gene_depth_file, 'r') as gdf, open(circos_depth_file, 'w') as cdf:
        for line in gdf:
            content = str(line).rstrip().split()
            print(' '.join([content[0], content[1], content[1], content[2]]), file=cdf)
            if int(content[2]) > max_gene_depth:
                max_gene_depth = int(content[2])

    # GC content
    # Reusing list_conv here, as it's not deleted in this scope
    gc_content_file = path.join(basedir, f'{prefix}.gc.txt')
    with open(gc_content_file, 'w') as gc_f:
        for seq in list_conv:
            # Walk through the sequence in 50 bp windows
            for s in range(0, len(seq), 50):
                seq_slice = seq[s:s + 50]
                gc_num = sum(x == 'G' or x == 'C' for x in seq_slice)
                gc_per = gc_num / len(seq_slice)
                print(seq.id, s, s + len(seq_slice), gc_per, file=gc_f)

    # Karyotype
    logger.log(1, 'Generating chr files.')
    karyotype_file = path.join(basedir, f'{prefix}.karyotype.txt')
    with open(karyotype_file, 'w') as ky_f:
        for seq in list_conv:
            chr_name = seq.id.replace('mt', 'chr')
            print(f'{chr_name} - {seq.id}\t{seq.id_old}\t0\t{len(seq)}\tgrey', file=ky_f)

    # Plus-strand marker
    logger.log(1, 'Generating plus.')
    plus_file = path.join(basedir, f'{prefix}.plus.txt')
    with open(plus_file, 'w') as p_f:
        print('mt1\t0\t300\t+\tr0=1r-150p,r1=1r-100p', file=p_f)

    # Fill in the circos configuration values
    logger.log(1, 'Generating circos config file.')
    generated_config = circos_config.circos_conf
    generated_config.ideogram.spacing._break = "0.5r" if not circular else "0.01r"
    generated_config.image.dir = basedir
    generated_config.karyotype = karyotype_file
    generated_config.plots['plot', 0].file = gene_name_file
    generated_config.plots['plot', 1].file = plus_file
    generated_config.plots['plot', 2].file = gc_content_file

    with generated_config.plots['plot', 3] as depth_plot:
        depth_plot.file = circos_depth_file
        depth_plot.max = max_gene_depth
        depth_plot.rules['rule', 0].condition = f'var(value) > {int(max_gene_depth*0.9)}'
        depth_plot.rules['rule', 1].condition = f'var(value) < {int(max_gene_depth*0.1)}'

    generated_config.highlights['highlight', 0].file = gene_feature_file

    # Write the final configuration
    # An f-string formatted cfg might be cleaner, but this is fine.
    cfg_dict = circos.collapse(generated_config)
    cfg_file = path.join(basedir, 'circos.conf')
    with open(cfg_file, 'w') as cfg_f:
        cfg_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        cfg_f.write(circos.dict2circos(cfg_dict) + '\n')
        cfg_f.write('<<include etc/housekeeping.conf>>')

    logger.log(1, 'Running Circos.')
    try:
        check_output('circos', shell=True, cwd=basedir)
    except Exception:
        logger.log(4, "Running circos errored, no graph is outputted!")

    return path.join(basedir, 'Circos.png'), path.join(basedir, 'Circos.svg')

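# --- Usage sketch (illustrative only; file names below are hypothetical) ---
# visualize() expects the picked mitochondrial fasta, the reads used for depth
# calculation, and the gene position json produced by annotation. This helper
# is not part of the original module.
def _example_visualize():
    png, svg = visualize(fasta_file='picked.fa',
                         fastq1='clean.1.fq', fastq2='clean.2.fq',
                         pos_json='pos.json',
                         prefix='sample', basedir='visualize_out',
                         threads=8, circular=False)
    return png, svg
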