def run_prepend_unique_ids(job, cactusWorkflowArguments, project, cactus_blast_input):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    :param job: Toil job (used for fileStore access and temp dirs)
    :param cactusWorkflowArguments: workflow arguments object; mutated in place (sequence IDs,
        alignment IDs, totalSequenceSize) and returned
    :param project: cactus project (unused here, kept for call-site compatibility)
    :param cactus_blast_input: if True, the alignments already use unique IDs and the
        cigar files are left untouched
    :return: the mutated cactusWorkflowArguments
    """
    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    exp = cactusWorkflowArguments.experimentWrapper
    # ingroups only: outgroup genomes are excluded from renaming here
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    sequences = [job.fileStore.readGlobalFile(id) for id in map(itemgetter(1), ingroupsAndOriginalIDs)]
    cactusWorkflowArguments.totalSequenceSize = sum(os.stat(x).st_size for x in sequences)
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # id_map is filled by prependUniqueIDs (original header -> uniquified header) and
    # reused below to rename the same sequences inside the cigar files
    id_map = {}
    uniqueFas = prependUniqueIDs(sequences, renamedInputSeqDir, id_map)
    uniqueFaIDs = [job.fileStore.writeGlobalFile(seq, cleanup=True) for seq in uniqueFas]
    # Set the uniquified IDs for the ingroups and outgroups
    ingroupsAndNewIDs = list(zip(list(map(itemgetter(0), ingroupsAndOriginalIDs)), uniqueFaIDs[:len(ingroupsAndOriginalIDs)]))
    for event, sequenceID in ingroupsAndNewIDs:
        cactusWorkflowArguments.experimentWrapper.setSequenceID(event, sequenceID)
    # if we're not taking the blast input, then we have to apply to the cigar files too
    if not cactus_blast_input:
        alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments], renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments], renamedInputSeqDir, id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(renamed_sec_alignments[0], cleanup=True)
        if cactusWorkflowArguments.outgroupFragmentIDs:
            # NOTE(review): readGlobalFile is handed the whole ID list here while the other
            # branches pass a single ID — presumably prepend_cigar_ids/readGlobalFile accept
            # lists in this code path; confirm against their definitions
            og_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments, renamedInputSeqDir, id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [job.fileStore.writeGlobalFile(rga, cleanup=True) for rga in renamed_og_alignments]
    return cactusWorkflowArguments
def run_prepend_unique_ids(job, assembly_files):
    """ Ensures all input sequences from assembly_files have unique names.

    This is adapted from run_prepend_unique_ids in cactus_align, in order to maintain
    order-dependent renamings.  Because cactus-reference-align assumes that all input
    sequences are ingroups, it does not attempt to avoid renaming outgroup genomes.

    :param job: Toil job (fileStore access)
    :param assembly_files: dict mapping event name -> fileStore ID of its fasta;
        mutated in place so each value becomes the ID of the uniquified fasta
    :return: the mutated assembly_files dict
    """
    # Fixes vs. previous revision: removed leftover debug print() calls, replaced the
    # manual index counter with zip, and stopped shadowing the builtin `id`.
    sequences = [job.fileStore.readGlobalFile(file_id) for file_id in assembly_files.values()]
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    id_map = {}
    uniqueFas = prependUniqueIDs(sequences, renamedInputSeqDir, id_map)
    uniqueFaIDs = [job.fileStore.writeGlobalFile(seq, cleanup=True) for seq in uniqueFas]
    # dict insertion order is preserved, so uniqueFaIDs lines up with assembly_files' keys
    # (the same order .values() was iterated in above)
    for assembly, unique_fa_id in zip(assembly_files, uniqueFaIDs):
        assembly_files[assembly] = unique_fa_id
    return assembly_files
def testPrependUniqueIDs(self):
    """prependUniqueIDs should prefix every FASTA header with a unique id= tag,
    preserving the rest of the header (including spaces and existing id= prefixes)."""
    # Create fake FASTA files with some interesting headers.
    with NamedTemporaryFile(mode='w+') as fasta1, NamedTemporaryFile(
            mode='w+') as fasta2:
        fasta1.write(
            dedent("""
            >C10856240 2.0
            ACTAGAGG
            G
            GG
            >foo bar baz
            ACTGACGATgacgat
            >emptyseq
            """))
        fasta2.write(
            dedent("""
            > space
            GTGC
            >id=1||header
            ATCC
            """))
        fasta1.flush()
        fasta2.flush()
        outDir = mkdtemp()
        # Fix: clean up the temp dir even when an assertion fails, so failing
        # runs don't leak directories.
        try:
            eventToFa = {0: fasta1.name, 1: fasta2.name}
            outputPaths = prependUniqueIDs(eventToFa, outDir)
            assert len(outputPaths) == 2
            with open(outputPaths[0]) as f:
                self.assertEqual(
                    f.read(),
                    dedent("""
                    >id=0|C10856240 2.0
                    ACTAGAGG
                    G
                    GG
                    >id=0|foo bar baz
                    ACTGACGATgacgat
                    >id=0|emptyseq
                    """))
            with open(outputPaths[1]) as f:
                self.assertEqual(
                    f.read(),
                    dedent("""
                    >id=1| space
                    GTGC
                    >id=1|id=1||header
                    ATCC
                    """))
        finally:
            shutil.rmtree(outDir)
def minigraph_map_one(job, config, event_name, fa_path, fa_file_id, gfa_file_id, gaf_output, paf_output):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output

    :param job: Toil job
    :param config: cactus config wrapper (xmlRoot is read for graphmap options)
    :param event_name: genome/event name; used for the GAF filename and the unique-ID prefix
    :param fa_path: original fasta path (only its basename is used; may end in .gz)
    :param fa_file_id: fileStore ID of the fasta to map
    :param gfa_file_id: fileStore ID of the minigraph GFA
    :param gaf_output: if True, export the GAF to the fileStore
    :param paf_output: if True, also convert the GAF to PAF
    :return: (gaf_id, paf_id) — either may be None depending on the output flags
    """
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    fa_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(fa_dir, os.path.basename(fa_path))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))
    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)
    if fa_path.endswith('.gz'):
        # decompress alongside: strip the .gz suffix and gunzip into the new path
        fa_path = fa_path[:-3]
        cactus_call(parameters = ['gzip', '-d', '-c', fa_path + '.gz'], outfile=fa_path)
    # prepend the unique id before mapping so the GAF has cactus-compatible event names
    fa_path = prependUniqueIDs({event_name : fa_path}, work_dir, eventNameAsID=True)[event_name]
    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]
    cmd = ["minigraph",
           os.path.basename(gfa_path),
           os.path.basename(fa_path),
           "-o", os.path.basename(gaf_path)] + opts_list
    mask_filter = getOptionalAttrib(xml_node, "maskFilter", int, default=-1)
    if mask_filter >= 0:
        # cmd[2] is the fasta argument: replace it with '-' so minigraph reads the
        # hardmasked sequence piped from cactus_softmask2hardmask
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path), '-m', str(mask_filter)], cmd]
    cactus_call(work_dir=work_dir, parameters=cmd)
    paf_id, gaf_id = None, None
    if paf_output:
        # optional gaf->paf step.  we are not piping directly out of minigraph because
        # mzgaf2paf's overlap filter (which is usually on) requires 2 passes so it won't
        # read stdin when it's enabled
        paf_id = merge_gafs_into_paf(job, config, None, [gaf_path])
    if gaf_output:
        gaf_id = job.fileStore.writeGlobalFile(gaf_path)
    return gaf_id, paf_id
def run_prepend_unique_ids(job, cactusWorkflowArguments, project, renameCigars, eventNameAsID):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    :param job: Toil job (fileStore access and temp files)
    :param cactusWorkflowArguments: workflow arguments; mutated in place and returned
    :param project: cactus project; inputSequenceMap is consulted to detect gzipped inputs
    :param renameCigars: if True, also apply the renaming to the cigar/alignment files
    :param eventNameAsID: forwarded to prependUniqueIDs (use event names instead of numeric ids)
    :return: the mutated cactusWorkflowArguments
    """
    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    # update: this has been somewhat fixed with a minor refactor: prependUniqueIDs is no longer order dependent (but takes dict instead of list)
    exp = cactusWorkflowArguments.experimentWrapper
    # ingroups only: outgroups are handled below (name reservation only, no sequence)
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    eventToSequence = {}
    for g, seqID in ingroupsAndOriginalIDs:
        seqPath = job.fileStore.getLocalTempFile() + '.fa'
        # mirror the original input's compression so readGlobalFile lands the bytes as-is
        if project.inputSequenceMap[g].endswith('.gz'):
            seqPath += '.gz'
        job.fileStore.readGlobalFile(seqID, seqPath)
        if seqPath.endswith('.gz'):
            # decompress in place (drop the .gz suffix) before renaming headers
            cactus_call(parameters=['gzip', '-d', '-c', seqPath], outfile=seqPath[:-3])
            seqPath = seqPath[:-3]
        eventToSequence[g] = seqPath
    cactusWorkflowArguments.totalSequenceSize = sum(os.stat(x).st_size for x in eventToSequence.values())
    # need to have outgroups in there just for id naming (don't need their sequence)
    for g in exp.getOutgroupGenomes():
        eventToSequence[g] = None
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # id_map collects original header -> uniquified header, reused for the cigar files below
    id_map = {}
    eventToUnique = prependUniqueIDs(eventToSequence, renamedInputSeqDir, idMap=id_map, eventNameAsID=eventNameAsID)
    # Set the uniquified IDs for the ingroups and outgroups
    for event, uniqueFa in eventToUnique.items():
        uniqueFaID = job.fileStore.writeGlobalFile(uniqueFa, cleanup=True)
        cactusWorkflowArguments.experimentWrapper.setSequenceID(event, uniqueFaID)
    # if we're not taking cactus-[blast|refmap] input, then we have to apply to the cigar files too
    if renameCigars:
        alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments], renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments], renamedInputSeqDir, id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(renamed_sec_alignments[0], cleanup=True)
        if cactusWorkflowArguments.outgroupFragmentIDs:
            # NOTE(review): the whole ID list is passed to readGlobalFile here, unlike the
            # single-ID calls above — presumably supported in this code path; confirm
            og_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments, renamedInputSeqDir, id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [job.fileStore.writeGlobalFile(rga, cleanup=True) for rga in renamed_og_alignments]
    return cactusWorkflowArguments
def testPrependUniqueIDs(self):
    """prependUniqueIDs should prefix every FASTA header with a positional id= tag,
    preserving the rest of the header (including spaces and existing id= prefixes)."""
    # Create fake FASTA files with some interesting headers.
    # Fix: NamedTemporaryFile defaults to binary mode ('w+b'), so writing str raised
    # TypeError on Python 3 — open in text mode instead.  Also: assertEquals is a
    # deprecated alias of assertEqual, and the temp dir is now removed in a finally
    # block so failing assertions don't leak it.
    with NamedTemporaryFile(mode='w+') as fasta1, NamedTemporaryFile(mode='w+') as fasta2:
        fasta1.write(dedent("""
        >C10856240 2.0
        ACTAGAGG
        G
        GG
        >foo bar baz
        ACTGACGATgacgat
        >emptyseq
        """))
        fasta2.write(dedent("""
        > space
        GTGC
        >id=1||header
        ATCC
        """))
        fasta1.flush()
        fasta2.flush()
        outDir = mkdtemp()
        try:
            outputPaths = prependUniqueIDs([fasta1.name, fasta2.name], outDir)
            assert len(outputPaths) == 2
            with open(outputPaths[0]) as f:
                self.assertEqual(f.read(), dedent("""
                >id=0|C10856240 2.0
                ACTAGAGG
                G
                GG
                >id=0|foo bar baz
                ACTGACGATgacgat
                >id=0|emptyseq
                """))
            with open(outputPaths[1]) as f:
                self.assertEqual(f.read(), dedent("""
                >id=1| space
                GTGC
                >id=1|id=1||header
                ATCC
                """))
        finally:
            shutil.rmtree(outDir)
def run_prepend_unique_ids(job, assembly_files):
    """ Ensures all input sequences from assembly_files have unique names.

    This is adapted from run_prepend_unique_ids in cactus_align, in order to maintain
    order-dependent renamings. Because cactus-reference-align assumes that all input
    sequences are ingroups, it does not attempt to avoid renaming outgroup genomes. """
    # pull every assembly down from the job store, keyed by event name
    event_to_path = {event: job.fileStore.readGlobalFile(assembly_id)
                     for event, assembly_id in assembly_files.items()}
    # rename headers using the event name as the unique id (stabler across tools
    # than a numeric id)
    work_dir = job.fileStore.getLocalTempDir()
    event_to_unique_path = prependUniqueIDs(event_to_path, work_dir, eventNameAsID=True)
    # push the renamed fastas back and overwrite the dict entries with their new IDs
    for event, prepended_sequence_path in event_to_unique_path.items():
        new_id = job.fileStore.writeGlobalFile(prepended_sequence_path, cleanup=True)
        assembly_files[event] = new_id
    return assembly_files
def run_prepend_unique_ids(job, cactusWorkflowArguments, project, renameCigars, eventNameAsID, mask_beds):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    :param job: Toil job (fileStore access and temp files)
    :param cactusWorkflowArguments: workflow arguments; mutated in place and returned
    :param project: cactus project (unused here, kept for call-site compatibility)
    :param renameCigars: if True, also apply the renaming to the cigar/alignment files
    :param eventNameAsID: forwarded to prependUniqueIDs (use event names instead of numeric ids)
    :param mask_beds: optional dict mapping event name -> fileStore ID of a BED whose
        contig names must be renamed consistently with the fastas
    :return: (cactusWorkflowArguments, mask_bed_id) where mask_bed_id is the fileStore
        ID of the merged renamed BED, or None when mask_beds is empty/None
    """
    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    # update: this has been somewhat fixed with a minor refactor: prependUniqueIDs is no longer order dependent (but takes dict instead of list)
    exp = cactusWorkflowArguments.experimentWrapper
    # ingroups only: outgroups are added below for name reservation, without sequence
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    eventToSequence = {}
    for g, seqID in ingroupsAndOriginalIDs:
        seqPath = job.fileStore.getLocalTempFile() + '.fa'
        job.fileStore.readGlobalFile(seqID, seqPath)
        eventToSequence[g] = seqPath
    cactusWorkflowArguments.totalSequenceSize = sum(
        os.stat(x).st_size for x in eventToSequence.values())
    # need to have outgroups in there just for id naming (don't need their sequence)
    for g in exp.getOutgroupGenomes():
        eventToSequence[g] = None
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # id_map collects original contig name -> uniquified name; reused for cigars and beds
    id_map = {}
    eventToUnique = prependUniqueIDs(eventToSequence,
                                     renamedInputSeqDir,
                                     idMap=id_map,
                                     eventNameAsID=eventNameAsID)
    # Set the uniquified IDs for the ingroups and outgroups
    for event, uniqueFa in eventToUnique.items():
        uniqueFaID = job.fileStore.writeGlobalFile(uniqueFa, cleanup=True)
        cactusWorkflowArguments.experimentWrapper.setSequenceID(
            event, uniqueFaID)
    # if we're not taking cactus-[blast|refmap] input, then we have to apply to the cigar files too
    if renameCigars:
        alignments = job.fileStore.readGlobalFile(
            cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments],
                                               renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(
            renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(
                cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments],
                                                       renamedInputSeqDir,
                                                       id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(
                renamed_sec_alignments[0], cleanup=True)
        if cactusWorkflowArguments.outgroupFragmentIDs:
            og_alignments = job.fileStore.readGlobalFile(
                cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments,
                                                      renamedInputSeqDir,
                                                      id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [
                job.fileStore.writeGlobalFile(rga, cleanup=True)
                for rga in renamed_og_alignments
            ]
    # if we have some masking beds (maps event name to bed file id), then apply the naming there too
    mask_bed_id = None
    if mask_beds:
        mask_bed_path = job.fileStore.getLocalTempFile()
        with open(mask_bed_path, 'w') as mask_bed_file:
            for g, bed_id in mask_beds.items():
                bed_path = job.fileStore.readGlobalFile(bed_id)
                with open(bed_path, 'r') as bed_file:
                    for line in bed_file:
                        toks = line.split('\t')
                        if len(toks) > 2:
                            if toks[0] not in id_map:
                                # Fix: this message previously referenced the undefined
                                # name `idMap`, turning the intended RuntimeError into
                                # a NameError whenever a contig was missing.
                                raise RuntimeError(
                                    'bed id {} not found in id-map {}'.format(
                                        toks[0], str(id_map)[:1000]))
                            toks[0] = id_map[toks[0]]
                            mask_bed_file.write('\t'.join(toks))
        mask_bed_id = job.fileStore.writeGlobalFile(mask_bed_path)
    return cactusWorkflowArguments, mask_bed_id