def split_consensus_fasta(fasta, ref_name, out):
    """Write every record of a FASTA file to its own FASTA file.

    One file per record is created under ``<out>/split_fasta/`` and named
    ``<ref_name>_<sanitized id>.fasta``, where the sanitized id is the record
    id with shell-special characters replaced by underscores.

    Args:
        fasta: Path of the multi-record FASTA file to split.
        ref_name: Prefix used for every output file name.
        out: Directory in which the ``split_fasta`` folder is created.

    Returns:
        The original (unsanitized) record ids, in file order.
    """
    split_dir = out + "/split_fasta/"
    mccutils.mkdir(split_dir)

    # Every character listed here becomes "_" in the output file name.
    unsafe_chars = [";", "&", "(", ")", "|", "*", "?", "[", "]", "~", "{", "}", "<", "!", "^", '"', "'", "\\", "$", "/"]
    sanitize_table = str.maketrans(dict.fromkeys(unsafe_chars, "_"))

    record_ids = []
    for entry in SeqIO.parse(fasta, "fasta"):
        record_id = str(entry.id)
        record_ids.append(record_id)

        file_stem = split_dir + ref_name + "_" + record_id.translate(sanitize_table)
        scratch_path = file_stem + ".fasta.tmp"
        final_path = file_stem + ".fasta"

        # Dump the raw record first; fix_fasta.fix_fasta_lines reads it back from disk.
        with open(scratch_path, "w") as scratch:
            scratch.write(">" + record_id + "\n" + str(entry.seq) + "\n")

        # NOTE(review): presumably re-wraps the sequence to 80 characters per
        # line -- confirm against fix_fasta.
        wrapped_lines = fix_fasta.fix_fasta_lines(scratch_path, 80)

        with open(final_path, "w") as final:
            final.writelines(text + "\n" for text in wrapped_lines)

        mccutils.remove(scratch_path)

    return record_ids
def fix_fasta_lines(infasta, outfasta, length=80):
    """Pass a FASTA file through ``fix_fasta.fix_fasta_lines`` and save the result.

    Args:
        infasta: Path of the FASTA file to read.
        outfasta: Path the processed lines are written to (overwritten).
        length: Value forwarded to ``fix_fasta.fix_fasta_lines``
            (presumably the maximum sequence line length -- confirm there).

    Returns:
        ``outfasta``, unchanged, so calls can be chained.
    """
    processed = fix_fasta.fix_fasta_lines(infasta, length)
    with open(outfasta, "w") as handle:
        # The helper's lines carry no trailing newline; add one per line.
        handle.writelines(entry + "\n" for entry in processed)
    return outfasta
def main():
    """Create the coverage FASTA output for the current Snakemake rule.

    Reads ``snakemake.params.coverage_fasta``. When it is the literal string
    ``"None"``, ``touch`` is run on ``snakemake.output.coverage_fasta``.
    Otherwise the FASTA is passed through ``fix_fasta.fix_fasta_lines`` with a
    length of 80 and the result is handed to ``write_fasta`` together with
    that output path.

    On any exception the traceback and an error message are printed to
    stderr, the first three rule outputs are removed, and the process exits
    with status 1.
    """
    mccutils.log("processing", "making coverage fasta")

    try:
        line_length = 80
        coverage_fasta = snakemake.params.coverage_fasta
        if coverage_fasta == "None":
            # No coverage FASTA supplied: only touch the expected output path.
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])
        else:
            lines = fix_fasta.fix_fasta_lines(coverage_fasta, line_length)
            write_fasta(lines, snakemake.output.coverage_fasta)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("ERROR...failed to create coverage fasta, check the formatting of :", snakemake.params.coverage_fasta, file=sys.stderr)
        # NOTE(review): assumes the rule declares at least three outputs -- confirm.
        for index in range(3):
            mccutils.remove(snakemake.output[index])
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
def repeat_mask(reference, te_fasta, chromosomes, procs, run_id, log, out):
    """Run RepeatMasker on a reference and rewrite its GFF annotations.

    RepeatMasker is executed from ``<out>/tmp/repeatmasker_<run_id>`` with
    ``te_fasta`` as its library. Its ``.out.gff`` file is then filtered and
    rewritten to ``<out>/tmp/<run_id>tmpreferenceTEs.gff``: lines containing
    ``#`` are skipped, only records whose first column is in ``chromosomes``
    are kept, and for each kept record the feature type (column 3) and the
    attributes (column 9) are replaced using the TE name.

    Args:
        reference: Path of the reference FASTA to mask.
        te_fasta: Path of the TE library passed to RepeatMasker via ``-lib``.
        chromosomes: Container of sequence names whose records are kept.
        procs: Value passed to RepeatMasker via ``-pa``.
        run_id: Identifier embedded in the temporary directory and file names.
        log: Log destination forwarded to ``mccutils.run_command``.
        out: Output directory; also the working directory restored after the run.

    Returns:
        Path of the rewritten GFF file.

    On any exception the traceback and an error message are printed to stderr
    and the process exits with status 1.
    """
    try:
        outdir = out + "/tmp/repeatmasker_" + run_id
        mccutils.mkdir(outdir)

        os.chdir(outdir)
        mccutils.run_command(
            [
                "RepeatMasker",
                "-pa", str(procs),
                "-lib", te_fasta,
                "-dir", outdir,
                "-s",
                "-gff",
                "-nolow",
                "-no_is",
                reference,
            ],
            log=log,
        )
        os.chdir(out)

        ref_name = os.path.basename(reference)
        repeatmasker_gff = outdir + "/" + ref_name + ".out.gff"
        formatted_ref_tes = out + "/tmp/" + run_id + "tmpreferenceTEs.gff"

        # RepeatMasker appears to override the custom database names during
        # ProcessRepeats; the replacements below change them back. More rules
        # may be needed for other reference genomes.
        with open(repeatmasker_gff, "r") as rmgff, open(formatted_ref_tes, "w") as outgff:
            for raw_line in rmgff:
                if "#" in raw_line:
                    continue

                renamed = raw_line.replace("McClintock-int", "McClintock").replace("POGON1", "pogo")
                columns = renamed.split("\t")
                attributes = columns[8]
                if columns[0] not in chromosomes:
                    continue

                # Second space-separated attribute token, quotes stripped,
                # text after the first ":" is used as the TE name.
                te = attributes.split(" ")[1].replace('"', '').split(":")[1]
                columns[2] = te
                columns[8] = ";".join(["ID=" + te, "Name=" + te, "Alias=" + te])
                outgff.write("\t".join(columns) + '\n')

        masked_fasta = outdir + "/" + ref_name + ".masked"
        # NOTE(review): the returned lines are not used in this function; the
        # call is kept as-is -- confirm whether it is only meant as a check.
        masked_lines = fix_fasta.fix_fasta_lines(masked_fasta, 80)

        mccutils.check_file_exists(formatted_ref_tes)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("ERROR...Failed to run repeatmasker on: ", reference, "with lib:", te_fasta, "check file formatting...exiting...", file=sys.stderr)
        sys.exit(1)

    return formatted_ref_tes