示例#1
0
def build_filesystem():
    """Set up the pipeline's directory layout and publish it as module globals.

    Binds SCRATCH, BASEDIR, REFERENCES, DATA, TRIM_DIR, ASSEMBLY_DIR,
    ANNOTATION_DIR and MAP_DIR. The directories are obtained through the
    project's ``Dir`` helper (``Dir.make`` / ``make_subdir`` presumably create
    the directory on disk when it is missing -- confirm against ``Dir``).

    Raises:
        KeyError: if the ``USER`` environment variable is not set.
    """
    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch area, named after the USER environment variable.
    SCRATCH = Dir.make(f"/scratch/{os.environ['USER']}")

    # Project root and the reference genomes stored beneath it.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # One sub-directory of "data" per pipeline stage.
    # (A raw-reads sub-directory, "00_raw", is currently disabled.)
    DATA = BASEDIR.make_subdir("data")
    TRIM_DIR = DATA.make_subdir('trimmed_reads')
    ASSEMBLY_DIR = DATA.make_subdir('assemblies')
    ANNOTATION_DIR = DATA.make_subdir('annotations')
    MAP_DIR = DATA.make_subdir('mapped_reads')
示例#2
0
def identify(isolate,
             delimiter="_",
             species_threshold=0.97,
             genus_threshold=0.80):
    """Assign a taxon to an isolate by scoring its assembly against references.

    For every FASTA returned by ``Fasta.get_all(REFERENCES)`` the external
    ``ANI.pl`` script is run (with the legacy BLAST ``blastall``/``formatdb``
    binaries) against the isolate's assembly. The per-reference scores are
    written, best match first, to ``<assembly dir>/ANI/<isolate>_ANI.csv`` and
    that file is recorded on ``isolate.files.ani``.

    Args:
        isolate: Project isolate object. This function reads ``name`` and
            ``files.assembly``, calls ``log`` and sets ``files.ani`` and
            ``taxon``.
        delimiter: Separator used to cut the taxon out of a reference's file
            name: the taxon is the text before the first ``delimiter`` (and
            before the first ``-`` within that).
        species_threshold: Minimum score for the best match's taxon to be
            assigned directly.
        genus_threshold: Minimum score, for any reference, at which the
            isolate is labelled ``'NTM'`` instead of ``'UNKNOWN'``.

    Returns:
        The assigned taxon string (also stored on ``isolate.taxon``), or
        ``None`` when identification failed; failures are logged as a warning
        rather than raised.
    """
    isolate.log('Identifying Isolate')
    assembly = isolate.files.assembly
    ani_script = "/Strong/proj/.data/Morty/.config/software/ani-script/ANI.pl"
    blastall = "/software/cgeh/blast/2.2.22/bin/blastall"
    formatdb = "/software/cgeh/blast/2.2.22/bin/formatdb"

    # Identification is best-effort: any failure is logged and reported to the
    # caller as ``None`` instead of propagating.
    try:
        # COMPARE TO REFERENCES
        references = Fasta.get_all(REFERENCES)
        isolate.log(
            f"{isolate}: IDENTIFYING TAXON USING {len(references)} REFERENCES",
            lvl='INFO')
        if len(references) == 0:
            # Fail with a meaningful message instead of a KeyError from
            # sorting an empty, column-less DataFrame further down.
            raise ValueError(f"no reference genomes found in {REFERENCES}")

        matches = []
        for reference in references:
            ref_id = reference.filename.split('.')[0]
            scratch = SCRATCH.make_subdir('ani', isolate.name,
                                          f"{isolate}_vs_{ref_id}")
            # Argument list (not a split string) so paths containing
            # whitespace reach the script intact.
            command = [
                "perl", ani_script,
                "-bl", blastall,
                "-fd", formatdb,
                "-qr", str(assembly),
                "-sb", str(reference),
                "-od", str(scratch),
            ]
            output = subprocess.run(command, stdout=subprocess.PIPE).stdout

            # The script's stdout is expected to be a single number
            # (presumably a percentage, hence the /100); anything else
            # (e.g. empty output) is recorded as a score of 0.
            try:
                ani = float(output) / 100
            except (ValueError, TypeError):
                ani = 0

            matches.append({
                'ani': ani,
                'sample': isolate.name,
                'reference': ref_id,
                'taxon': ref_id.split(delimiter)[0].split('-')[0],
            })

        # WRITE CSV
        ani_csv = Dir.make(assembly.dir.join("ANI")).join(f"{isolate}_ANI.csv")
        df = pd.DataFrame.from_records(matches)
        df = df.sort_values('ani', ascending=False)
        df.to_csv(ani_csv, index=False)
        isolate.files.ani = File(ani_csv)

        # ASSIGN TAXON
        taxon = 'UNKNOWN'
        possible_species = df[df['ani'] >= species_threshold]
        if len(possible_species) > 0:
            # Rows are sorted by score, so the first one is the best match.
            taxon = possible_species.iloc[0]['taxon']
        elif (df['ani'] >= genus_threshold).any():
            taxon = 'NTM'

        isolate.taxon = taxon
        isolate.log(f"taxon={isolate.taxon}", lvl='INFO')
        return taxon

    except Exception as e:
        isolate.log(f"Identification failed: {e}", lvl='WARNING')
        # Remove log files left in the working directory; -f keeps rm quiet
        # when they were never created.
        # NOTE(review): cleanup only runs on failure, as before -- confirm
        # whether these logs should also be removed after a successful run.
        subprocess.call(["rm", "-f", "error.log", "formatdb.log"])
        return None