def build_filesystem():
    """Set up the project directory layout and publish it as module globals.

    Rebinds the globals SCRATCH, BASEDIR, REFERENCES, DATA, TRIM_DIR,
    ASSEMBLY_DIR, ANNOTATION_DIR and MAP_DIR. The scratch location is derived
    from the ``USER`` environment variable, so a ``KeyError`` is raised when
    that variable is not set.
    """
    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch area.
    user = os.environ['USER']
    SCRATCH = Dir.make(f'/scratch/{user}')

    # Project root and the reference genomes stored beneath it.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Data directory and its per-stage subdirectories, created in this order.
    DATA = BASEDIR.make_subdir("data")
    # RAW_DIR = DATA.make_subdir("00_raw")
    stage_names = ('trimmed_reads', 'assemblies', 'annotations', 'mapped_reads')
    TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR = [
        DATA.make_subdir(stage) for stage in stage_names
    ]
def _assign_taxon(df, species_threshold, genus_threshold):
    """Choose a taxon label from an ANI table sorted by descending ``ani``."""
    species_hits = df[df.ani >= species_threshold]
    if len(species_hits) > 0:
        # Rows are sorted best-first, so the first hit is the closest match.
        return species_hits.iloc[0].taxon
    if len(df[df.ani >= genus_threshold]) > 0:
        return 'NTM'
    return 'UNKNOWN'


def _remove_blast_logs():
    """Delete ``error.log`` and ``formatdb.log`` from the working directory."""
    # NOTE(review): presumably these are left behind by the BLAST tools the
    # ANI script drives -- confirm against the script.
    for log_name in ("error.log", "formatdb.log"):
        try:
            os.remove(log_name)
        except OSError:
            # Best-effort cleanup: a missing or undeletable log is not an error.
            pass


def identify(isolate, delimiter="_", species_threshold=0.97, genus_threshold=0.80):
    """Assign a taxon to ``isolate`` by comparing its assembly to every reference.

    Runs the ANI script once per reference found under ``REFERENCES``, writes
    the results (sorted by descending ANI) to ``ANI/<isolate>_ANI.csv`` next to
    the assembly, records that file as ``isolate.files.ani`` and stores the
    chosen label in ``isolate.taxon``.

    Args:
        isolate: Object providing ``name``, ``log()`` and ``files.assembly``.
        delimiter: Separator used to take the leading field of a reference id
            as its taxon (the field is then further cut at the first ``-``).
        species_threshold: Minimum ANI (as a fraction) for the best-matching
            reference's taxon to be assigned.
        genus_threshold: Minimum ANI (as a fraction) for the fallback label
            ``'NTM'``.

    Returns:
        The assigned taxon (a reference taxon, ``'NTM'`` or ``'UNKNOWN'``), or
        ``None`` when identification fails; failures are logged at WARNING
        level rather than raised.
    """
    isolate.log('Identifying Isolate')
    assembly = isolate.files.assembly
    ani_script = "/Strong/proj/.data/Morty/.config/software/ani-script/ANI.pl"
    blastall = "/software/cgeh/blast/2.2.22/bin/blastall"
    formatdb = "/software/cgeh/blast/2.2.22/bin/formatdb"

    try:
        # COMPARE TO REFERENCES
        references = Fasta.get_all(REFERENCES)
        isolate.log(
            f"{isolate}: IDENTIFYING TAXON USING {len(references)} REFERENCES",
            lvl='INFO')

        matches = []
        for reference in references:
            ref_id = reference.filename.split('.')[0]
            scratch = SCRATCH.make_subdir(
                'ani', isolate.name, f"{isolate}_vs_{ref_id}")
            # Pass an argument list (not a split string) so that paths
            # containing whitespace stay intact.
            command = [
                "perl", ani_script,
                "-bl", blastall,
                "-fd", formatdb,
                "-qr", f"{assembly}",
                "-sb", f"{reference}",
                "-od", f"{scratch}",
            ]
            output = subprocess.run(command, stdout=subprocess.PIPE).stdout
            try:
                # The script's stdout is parsed as a percentage.
                ani = float(output) / 100
            except (ValueError, TypeError):
                # Unparseable output counts as "no similarity".
                ani = 0
            matches.append({
                'ani': ani,
                'sample': isolate.name,
                'reference': ref_id,
                'taxon': ref_id.split(delimiter)[0].split('-')[0],
            })

        # WRITE CSV
        ani_csv = Dir.make(assembly.dir.join("ANI")).join(f"{isolate}_ANI.csv")
        df = pd.DataFrame.from_records(matches)
        df = df.sort_values('ani', ascending=False)
        df.to_csv(ani_csv, index=False)
        isolate.files.ani = File(ani_csv)

        # ASSIGN TAXON
        taxon = _assign_taxon(df, species_threshold, genus_threshold)
        isolate.taxon = taxon
        isolate.log(f"taxon={isolate.taxon}", lvl='INFO')
        return taxon
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        isolate.log(f"Identification failed: {e}", lvl='WARNING')
        return None
    finally:
        # Clean up on success as well as on failure.
        _remove_blast_logs()