예제 #1
0
def _concatenate_fastqs(logger, first, second, destination):
    """Concatenate two FASTQ files into ``destination`` and log the outcome.

    Runs ``cat first second`` with stdout redirected to ``destination`` and
    blocks until it finishes, so "...PASS" is only logged once the
    concatenation has actually succeeded.  Any failure (including a non-zero
    exit status of ``cat``) is logged as a warning and swallowed so that the
    remaining files are still processed.
    """
    try:
        logger.info(
            f"Concatenating {first.filename} and {second.filename} as {destination}"
        )
        # An argv list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.  Assumes str() of a Fastq yields its path,
        # which the previous shell command line relied on as well.
        with open(str(destination), 'wb') as merged:
            subprocess.run(['cat', str(first), str(second)],
                           stdout=merged,
                           check=True)
    except Exception as e:
        logger.warning(f"...FAIL: {e}")
    else:
        logger.info("...PASS")


def combine_lanes(directory=os.getcwd()):
    """Concatenate lane L001 and lane L002 FASTQ files of every read pair.

    For each pair in ``directory`` whose read 1 is from lane "L001" and for
    which a sibling file with "L002" in place of "L001" exists, the two lanes
    are concatenated (read 1 and read 2 separately) into files named with
    "LCAT" in place of "L001", written to a ``tmp/<directory name>``
    sub-directory of BASEDIR.  Progress is logged to ``combine_lanes.log`` in
    that same sub-directory.

    Args:
        directory: Directory holding the FASTQ pairs.  NOTE: the default is
            evaluated once, when the module is imported, not on each call.

    Unimplemented notes carried over from the original author:
        REPORT: L001_R1 counts, L002_R1 counts
        CF RULES 1e9 in one lane
    """

    directory = Dir(directory)
    temp = BASEDIR.make_subdir('tmp', directory.dirname)
    logger = generic_logger(temp.join('combine_lanes.log'))
    pairs = Fastq.get_pairs(directory)

    logger.info(f"Combining lanes of {len(pairs)} pairs in {directory}")
    for pair in pairs:
        try:
            read1 = pair.pair1
            if read1.lane != "L001":
                continue
            if not os.path.isfile(read1.path.replace("L001", "L002")):
                continue

            # Resolve every input/output before touching the disk so that a
            # problem with read 2 does not leave a lone combined read 1.
            jobs = []
            for lane1 in (read1, pair.pair2):
                lane2 = Fastq(lane1.path.replace("L001", "L002"))
                combined = temp.join(lane1.filename.replace("L001", "LCAT"))
                jobs.append((lane1, lane2, combined))

            for lane1, lane2, combined in jobs:
                _concatenate_fastqs(logger, lane1, lane2, combined)
        except Exception as e:
            logger.warning(f'Combining Failed: {e}')
예제 #2
0
def build_filesystem():
    """Set up the pipeline's directory layout and publish it as globals.

    Binds the module-level names SCRATCH, BASEDIR, REFERENCES, DATA,
    TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR and MAP_DIR.  Directories are
    obtained through ``Dir.make`` / ``make_subdir`` (presumably creating them
    when missing -- confirm against the ``Dir`` implementation).

    Raises:
        KeyError: if the ``USER`` environment variable is not set.
    """
    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch area.
    user = os.environ['USER']
    SCRATCH = Dir.make('/scratch/' + user)

    # Project root and the reference genome library beneath it.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Data tree; a raw-reads sub-directory ("00_raw") is deliberately not
    # set up here (it was disabled in the original code).
    DATA = BASEDIR.make_subdir("data")
    TRIM_DIR = DATA.make_subdir('trimmed_reads')
    ASSEMBLY_DIR = DATA.make_subdir('assemblies')
    ANNOTATION_DIR = DATA.make_subdir('annotations')
    MAP_DIR = DATA.make_subdir('mapped_reads')
예제 #3
0
class Config:
    """Static pipeline configuration: directories, names and references.

    All values are class attributes, so the directory objects below are
    built (via ``Dir`` / ``make_subdir``) as soon as the class body is
    executed, i.e. when the module is imported.
    """

    # DIRECTORY STRUCTURE
    BASE_DIR = Dir('/Strong/proj/.data/Project_NTM')
    REFERENCES = Dir(BASE_DIR.join("lib", "reference_genomes"))

    DATA = BASE_DIR.make_subdir("data")
    TRIM_DIR = DATA.make_subdir("trimmed_reads")
    ASSEMBLY_DIR = DATA.make_subdir("assemblies")
    ANNOTATION_DIR = DATA.make_subdir("annotations")
    MAP_DIR = DATA.make_subdir("mapped_reads")

    # NAMING CONVENTIONS
    REPORTS = "reports"
    ARCHIVE = "archive"
    FAILED = "failed"
    LOGS = "logs"

    # REPORTS
    TRIM_STATS = "trim_stats.csv"
    ASSEMBLY_STATS = "assembly_stats.csv"

    # LOGGING
    LOG_LEVEL = logging.DEBUG
    LOG_FILE = BASE_DIR.join('log', 'pipeline.log')

    # REFERENCES
    # Maps a taxon code to the FASTA file name of its reference genome.
    # Some codes intentionally share a file (MBOL -> MAB, MINT -> MCHIM).
    REFERENCE_GENOMES = {
        # Clinical
        'MAB': 'MAB.ATCC19977.fasta',
        'MBOL': 'MAB.ATCC19977.fasta',
        'MAV': 'MAV.HOM.H87.fasta',
        'MMAS': 'MMAS.BRAPA42FWDG01.fasta',
        'MCHIM': 'MCHIM.CDC2015-22-71.fasta',
        'MINT': 'MCHIM.CDC2015-22-71.fasta',
        'MCHE': 'MCHE.ATCC19237.fasta',
        'MTB': 'MTB.H37RV.fasta',
        # Environmental
        'MAROS': 'MAROS.DSM45069.fasta',
        'MASIA': 'MASIA.DSM44297.fasta',
        'MBOUCH': 'MBOUCH.DSM45439.fasta',
        'MBOV': 'MBOV.AF2122.fasta',
        'MCANE': 'MCANE.CIPT140070017.fasta',
        'MCHUB': 'MCHUB.NBB4.fasta',
        'MCOLOM': 'MCOLOM.CECT3035.fasta',
        'MELE': 'MELE.DSM44368.fasta',
        'MFORT': 'MFORT.CT6.fasta',
        'MFRANK': 'MFRANK.DSM45524.fasta',
        'MGILV': 'MGILV.SPYR1.fasta',
        'MGORD': 'MGORD.DSM44160.fasta',
        'MHAEM': 'MHAEM.DSM44634.fasta',
        'MIMMU': 'MIMMU.CCUG47286T.fasta',
        'MINDP': 'MINDP.MTCC9506.fasta',
        'MIRAN': 'MIRAN.DSM45541.fasta',
        'MKAN': 'MKAN.ATCC12478.fasta',
        'MKUB': 'MKUB.CIP106428.fasta',
        'MLENT': 'MLENT.CSURP1491.fasta',
        'MLEPR': 'MLEPR.TN.fasta',
        'MLIFL': 'MLIFL.128FXT.fasta',
        'MMANT': 'MMANT.DSM45255.fasta',
        'MMARI': 'MMARI.M.fasta',
        'MMARS': 'MMARS.DSM45437.fasta',
        'MMUCO': 'MMUCO.CSURP2099.fasta',
        'MNEOA': 'MNEOA.VKMAC-1815D.fasta',
        'MPORC': 'MPORC.CSURP1564.fasta',
        'MRHOD': 'MRHOD.NBB3.fasta',
        'MSALM': 'MSALM.D16Q15.fasta',
        'MSENE': 'MSENE.NCTC4524.fasta',
        'MSIMI': 'MSIMI.ATCC25275.fasta',
        'MSMEG': 'MSMEG.MC2155.fasta',
        'MTERR': 'MTERR.NCTC10856.fasta',
        'MTIM': 'MTIM.CCUG56329.fasta',
        'MTRIP': 'MTRIP.DSM44626.fasta',
        'MULCE': 'MULCE.AGY99.fasta',
        'MVANB': 'MVANB.PYR-1.fasta',
        'MVUL': 'MVUL.DSM45247T.fasta',
        'MXENO': 'MXENO.RIVM700367.fasta',
        'MYONG': 'MYONG.05-1390.fasta',
        'NFARC': 'NFARC.NCTC3000.fasta'
    }

    # Maps a group code to the taxon codes that belong to it.
    SPECIES_GROUPS = {'MAC': ['MAV', 'MCHIM', 'MINT']}

    @classmethod
    def declare_globals(cls):
        """Publish the core settings as module-level globals.

        Binds BASEDIR, REFERENCES, REFERENCE_GENOMES and SPECIES_GROUPS in
        the module namespace to the corresponding class attributes.  A bare
        ``global`` statement only declares scope and binds nothing, so the
        explicit assignments are what make the names available.
        NOTE(review): previously this method was a no-op; confirm no caller
        depends on these globals keeping values set elsewhere.
        """
        global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS
        # The global is spelled BASEDIR while the class attribute is BASE_DIR.
        BASEDIR = cls.BASE_DIR
        REFERENCES = cls.REFERENCES
        REFERENCE_GENOMES = cls.REFERENCE_GENOMES
        SPECIES_GROUPS = cls.SPECIES_GROUPS
예제 #4
0
 def __init__(self):
     """Look up the FASTA files under BASEDIR/lib/reference_genomes.

     The results are only bound to local names in the code visible here.
     """
     reference_dir = Dir(BASEDIR.join("lib", "reference_genomes"))
     reference_fastas = Fasta.get_all(reference_dir)
예제 #5
0
def identify(isolate,
             delimiter="_",
             species_threshold=0.97,
             genus_threshold=0.80):
    """Assign a taxon to an isolate by ANI against every reference genome.

    Runs the external ANI perl script (BLAST based) for the isolate's
    assembly against each FASTA returned by ``Fasta.get_all(REFERENCES)``,
    writes the per-reference results to ``<assembly dir>/ANI/<isolate>_ANI.csv``
    (best match first), stores that file on ``isolate.files.ani`` and sets
    ``isolate.taxon``.

    Args:
        isolate: Isolate object providing ``log``, ``name``, ``files`` and a
            writable ``taxon`` attribute.
        delimiter: Currently unused; kept so existing callers keep working.
        species_threshold: Minimum ANI (as a fraction) for the best-matching
            reference's taxon to be assigned.
        genus_threshold: Minimum ANI (as a fraction) for the generic 'NTM'
            label when no reference reaches ``species_threshold``.

    Returns:
        The assigned taxon ('UNKNOWN' when nothing reaches either
        threshold), or None when identification failed; failures are logged
        as a warning rather than raised.
    """
    isolate.log('Identifying Isolate')
    assembly = isolate.files.assembly
    ani_script = "/Strong/proj/.data/Morty/.config/software/ani-script/ANI.pl"
    blastall = "/software/cgeh/blast/2.2.22/bin/blastall"
    formatdb = "/software/cgeh/blast/2.2.22/bin/formatdb"

    # COMPARE TO REFERENCES
    try:
        matches = []
        references = Fasta.get_all(REFERENCES)
        isolate.log(
            f"{isolate}: IDENTIFYING TAXON USING {len(references)} REFERENCES",
            lvl='INFO')
        for reference in references:
            ref_id = reference.filename.split('.')[0]
            scratch = SCRATCH.make_subdir('ani', isolate.name,
                                          f"{isolate}_vs_{ref_id}")
            # Build argv directly instead of splitting a formatted string so
            # that paths containing whitespace stay single arguments.
            # Assumes str() of these objects yields their path, exactly as
            # the previous f-string interpolation did.
            command = [
                "perl", ani_script,
                "-bl", blastall,
                "-fd", formatdb,
                "-qr", str(assembly),
                "-sb", str(reference),
                "-od", str(scratch),
            ]
            # stderr is not captured, so the second item is always None.
            output, _ = subprocess.Popen(
                command, stdout=subprocess.PIPE).communicate()

            # The script's stdout is parsed as a percentage; anything that
            # does not parse as a number counts as "no similarity".
            try:
                ani = float(output) / 100
            except (ValueError, TypeError):
                ani = 0

            matches.append({
                'ani': ani,
                'sample': isolate.name,
                'reference': ref_id,
                'taxon': ref_id.split('_')[0].split('-')[0]
            })

        # WRITE CSV
        ani_csv = Dir.make(assembly.dir.join("ANI")).join(f"{isolate}_ANI.csv")
        df = pd.DataFrame.from_records(matches)
        df = df.sort_values('ani', ascending=False)
        df.to_csv(ani_csv, index=False)
        isolate.files.ani = File(ani_csv)

        # ASSIGN TAXON
        taxon = 'UNKNOWN'
        possible_species = df[df['ani'] >= species_threshold]
        if not possible_species.empty:
            # Rows are sorted best-first, so row 0 is the closest reference.
            taxon = possible_species.iloc[0]['taxon']
        elif (df['ani'] >= genus_threshold).any():
            taxon = 'NTM'

        isolate.taxon = taxon
        isolate.log(f"taxon={isolate.taxon}", lvl='INFO')
        return taxon

    except Exception as e:
        isolate.log(f"Identification failed: {e}", lvl='WARNING')
        # Remove log files left in the working directory (cleanup is only
        # performed on the failure path, as before).
        subprocess.call(["rm", "error.log", "formatdb.log"])
        return None
예제 #6
0
def process_directory(directory=os.getcwd()):
    """Hand one ``pipeline`` command per FASTQ pair in ``directory`` to LSF.

    For every pair returned by ``Fastq.get_pairs`` a command line invoking
    this script (``sys.argv[0]``) with the ``pipeline`` sub-command and the
    two reads is passed to ``LSF.bsub``.
    """
    fastq_pairs = Fastq.get_pairs(Dir(directory))
    for fastq_pair in fastq_pairs:
        LSF.bsub(
            f"{sys.argv[0]} pipeline {fastq_pair.pair1} {fastq_pair.pair2}")
예제 #7
0
def configure():
    """Bind the project's base paths and reference tables as globals.

    Sets BASEDIR, REFERENCES, REFERENCE_GENOMES (taxon code -> reference
    FASTA file name) and SPECIES_GROUPS (group code -> member taxon codes).

    NOTE(review): the base path here is '.../ProjectNTM', whereas other code
    in this file uses '.../Project_NTM' -- confirm which one is intended.
    """
    global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS

    BASEDIR = Dir('/Strong/proj/.data/ProjectNTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Some clinical codes share a reference file (MBOL -> MAB, MINT -> MCHIM).
    clinical = {
        'MAB': 'MAB.ATCC19977.fasta',
        'MBOL': 'MAB.ATCC19977.fasta',
        'MAV': 'MAV.HOM.H87.fasta',
        'MMAS': 'MMAS.BRAPA42FWDG01.fasta',
        'MCHIM': 'MCHIM.CDC2015-22-71.fasta',
        'MINT': 'MCHIM.CDC2015-22-71.fasta',
        'MCHE': 'MCHE.ATCC19237.fasta',
        'MTB': 'MTB.H37RV.fasta',
    }
    environmental = {
        'MAROS': 'MAROS.DSM45069.fasta',
        'MASIA': 'MASIA.DSM44297.fasta',
        'MBOUCH': 'MBOUCH.DSM45439.fasta',
        'MBOV': 'MBOV.AF2122.fasta',
        'MCANE': 'MCANE.CIPT140070017.fasta',
        'MCHUB': 'MCHUB.NBB4.fasta',
        'MCOLOM': 'MCOLOM.CECT3035.fasta',
        'MELE': 'MELE.DSM44368.fasta',
        'MFORT': 'MFORT.CT6.fasta',
        'MFRANK': 'MFRANK.DSM45524.fasta',
        'MGILV': 'MGILV.SPYR1.fasta',
        'MGORD': 'MGORD.DSM44160.fasta',
        'MHAEM': 'MHAEM.DSM44634.fasta',
        'MIMMU': 'MIMMU.CCUG47286T.fasta',
        'MINDP': 'MINDP.MTCC9506.fasta',
        'MIRAN': 'MIRAN.DSM45541.fasta',
        'MKAN': 'MKAN.ATCC12478.fasta',
        'MKUB': 'MKUB.CIP106428.fasta',
        'MLENT': 'MLENT.CSURP1491.fasta',
        'MLEPR': 'MLEPR.TN.fasta',
        'MLIFL': 'MLIFL.128FXT.fasta',
        'MMANT': 'MMANT.DSM45255.fasta',
        'MMARI': 'MMARI.M.fasta',
        'MMARS': 'MMARS.DSM45437.fasta',
        'MMUCO': 'MMUCO.CSURP2099.fasta',
        'MNEOA': 'MNEOA.VKMAC-1815D.fasta',
        'MPORC': 'MPORC.CSURP1564.fasta',
        'MRHOD': 'MRHOD.NBB3.fasta',
        'MSALM': 'MSALM.D16Q15.fasta',
        'MSENE': 'MSENE.NCTC4524.fasta',
        'MSIMI': 'MSIMI.ATCC25275.fasta',
        'MSMEG': 'MSMEG.MC2155.fasta',
        'MTERR': 'MTERR.NCTC10856.fasta',
        'MTIM': 'MTIM.CCUG56329.fasta',
        'MTRIP': 'MTRIP.DSM44626.fasta',
        'MULCE': 'MULCE.AGY99.fasta',
        'MVANB': 'MVANB.PYR-1.fasta',
        'MVUL': 'MVUL.DSM45247T.fasta',
        'MXENO': 'MXENO.RIVM700367.fasta',
        'MYONG': 'MYONG.05-1390.fasta',
        'NFARC': 'NFARC.NCTC3000.fasta',
    }
    # Clinical entries first, then environmental: same key order as before.
    REFERENCE_GENOMES = {**clinical, **environmental}

    mac_members = ['MAV', 'MCHIM', 'MINT', 'MTIM', 'MBOUCH', 'MMARS']
    mab_members = ['MAB', 'MBOL', 'MMAS']
    SPECIES_GROUPS = {'MAC': mac_members, 'MAB': mab_members}
예제 #8
0
def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Re-derive each sample's taxon from its ANI CSV and rename its files.

    The run name is taken from ``Dir().dirname``.  For every ``.csv`` in
    ``<BASEDIR>/data/assemblies/<run>/ANI`` the taxon is recomputed with the
    given thresholds, and the sample's two trimmed read files and its
    assembly are renamed to carry that taxon.  Every rename is logged; a
    failure for one sample is logged as a warning and does not stop the
    remaining samples.

    Args:
        species_threshold: Minimum ANI (as a fraction) for the best-matching
            reference's taxon to be assigned.
        genus_threshold: Minimum ANI (as a fraction) for the generic 'NTM'
            label when no reference reaches ``species_threshold``.
    """
    logger = generic_logger('update_taxa.csv')

    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))

    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)
    ani = ani_dir.files(endswith='.csv')

    for ani_file in ani:
        try:
            # Best match first, so that row 0 of any thresholded subset is
            # the closest reference.
            df = pd.read_csv(ani_file.path).sort_values('ani',
                                                        ascending=False)
            # Use item access: attribute access (`.sample`) resolves to the
            # pandas Series.sample method, not the "sample" column.
            sample_name = str(df.iloc[0]['sample'])

            # Literal (non-regex) matching: sample names may contain regex
            # metacharacters such as '.' or '+'.
            of_sample = trimmed_reads.filename.str.contains(sample_name,
                                                            regex=False)
            is_read1 = trimmed_reads.filename.str.contains('_R1',
                                                           regex=False)
            is_read2 = trimmed_reads.filename.str.contains('_R2',
                                                           regex=False)
            trim1 = File(trimmed_reads[of_sample & is_read1].iloc[0].path)
            trim2 = File(trimmed_reads[of_sample & is_read2].iloc[0].path)
            assembly = File(assemblies[assemblies.filename.str.contains(
                sample_name, regex=False)].iloc[0].path)

            # ASSIGN TAXON
            taxon = 'UNKNOWN'
            possible_species = df[df['ani'] >= species_threshold]
            if not possible_species.empty:
                taxon = possible_species.iloc[0]['taxon']
            elif (df['ani'] >= genus_threshold).any():
                taxon = 'NTM'

            trim1_filename = trim1.filename
            trim2_filename = trim2.filename
            assembly_filename = assembly.filename

            # Each read gets its own target name; previously both reads were
            # renamed to the same file name and therefore collided.
            # NOTE(review): confirm the _R1/_R2 suffix placement matches the
            # project's naming convention for trimmed reads.
            trim1.rename(f'{sample_name}_{taxon}_R1.fq.gz')
            trim2.rename(f'{sample_name}_{taxon}_R2.fq.gz')
            assembly.rename(f'{sample_name}_{taxon}_000.fna')

            logger.info(f'renamed {trim1_filename} to {trim1.filename}')
            logger.info(f'renamed {trim2_filename} to {trim2.filename}')
            logger.info(f'renamed {assembly_filename} to {assembly.filename}')

        except Exception as e:
            logger.warning(e)
예제 #9
0
"""
beagles[compbio] alma: grep , E-ALMA1.MAV_vs_NJH87.cf | more

CP018363.1	83972	C	T,G	0	34	34	SNP
CP018363.1	148672	A	G,C	0	14	14	SNP
CP018363.1	1381815	G	A,C	0	8	8	SNP
CP018363.1	2334023	G	A,T	0	30	30	SNP
CP018363.1	2948951	T	C,G	0	6	6	SNP
CP018363.1	3621547	T	C,G	0	35	35	SNP
CP018363.1	4243092	A	G,C	0	33	33	SNP
CP018363.1	4562156	T	C,G	0	15	15	SNP
"""


# Input directory that is scanned for '.cf' files below.
VCF_DIR = Dir("/Strong/proj/.data/alma")
# All outputs are placed in a "fasta" sub-directory of the input directory.
FASTA_DIR = VCF_DIR.make_subdir("fasta")
MATRIX = FASTA_DIR.join("ALMA_matrix_N.fna")
CSV_OUT = FASTA_DIR.join("ALMA_stats.csv")
CSV_MUTATIONS = FASTA_DIR.join("ALMA_mutations.csv")
# NOTE(review): presumably the number of positions in the reference
# sequence -- confirm.
POSITIONS = 5626623

files = VCF_DIR.files(endswith='.cf')

# Accumulators for the loop over the input files below.
records = []
multi_alleles = {}

print(f"Building Matrix: {MATRIX} from {len(files)} files.")
for i, file in enumerate(files):
    print(f"\t{i:02d} | {file.filename}")
    isolate = file.filename.split("_vs_")[0]