Exemplo n.º 1
0
def detect_prophage(prefix_name, faa_file, base_dir='.', timing_log=None, threads=0):
    """
        Run element_finder.search_prophage on a .faa file
        :param prefix_name: sample prefix used to name the output folder and files
        :param faa_file: input .faa file; a name ending in '.gz' is unpacked first
        :param base_dir: working directory
        :param timing_log: accepted but not used by this function
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: path to the prophage result table
    """
    # TODO: include overwrite mode
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    work_dir = os.path.join(base_dir, 'element_finder_' + prefix_name)
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # An existing result table is reused as-is.
    result_table = os.path.join(work_dir, prefix_name + '_prophage.tsv')
    if os.path.isfile(result_table):
        return result_table

    # Compressed input is unpacked next to the results.
    scratch_faa = os.path.join(work_dir, prefix_name + '.faa')
    if faa_file.endswith('.gz'):
        run_command('gunzip -c {} > {}'.format(faa_file, scratch_faa))
        search_input = scratch_faa
    else:
        search_input = faa_file

    element_finder.search_prophage(sample=search_input,
                                   output=result_table,
                                   threads=n_threads)

    # Drop the unpacked copy once the search is done.
    if os.path.exists(scratch_faa):
        os.remove(scratch_faa)

    return result_table
Exemplo n.º 2
0
def detect_integron(prefix_name, assembly, base_dir='.', timing_log=None, threads=0):
    """
        Run element_finder.search_integrall on an assembly
        :param prefix_name: sample prefix used to name the output folder and files
        :param assembly: input sequence file; a name ending in '.gz' is unpacked first
        :param base_dir: working directory
        :param timing_log: accepted but not used by this function
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: path to the integron result table
    """
    # TODO: include overwrite mode
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    work_dir = os.path.join(base_dir, prefix_name + '_integrall')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # An existing result table is reused as-is.
    result_table = os.path.join(work_dir, prefix_name + '_integron.tsv')
    if os.path.isfile(result_table):
        return result_table

    scratch_fasta = os.path.join(work_dir, prefix_name + '.fasta')
    if assembly.endswith('.gz'):
        # FIXME: this step may not be necessary
        run_command('gunzip -c {} > {}'.format(assembly, scratch_fasta))
        search_input = scratch_fasta
    else:
        search_input = assembly

    element_finder.search_integrall(sample=search_input,
                                    output=result_table,
                                    threads=n_threads)

    # Drop the unpacked copy once the search is done.
    if os.path.exists(scratch_fasta):
        os.remove(scratch_fasta)
    return result_table
Exemplo n.º 3
0
def run_roary(gff_folder,
              overwrite=False,
              threads=0,
              base_dir='.',
              timing_log=None):
    """
        Run roary on every file found in gff_folder
        :param gff_folder: folder with the inputs; files ending in '.gz' are
            unpacked into a temporary folder, all other files are passed as-is
        :param overwrite: rerun even if summary_statistics.txt already exists
        :param threads: value passed to roary's -p option
        :param base_dir: working directory
        :param timing_log: passed to run_command for the roary call
        :return: path to the roary output folder
    """
    pangenome_out = os.path.join(base_dir, 'pangenome/roary')
    scratch_dir = os.path.join(base_dir, 'pangenome/temp_roary')
    summary_file = os.path.join(pangenome_out, 'summary_statistics.txt')

    if not overwrite and os.path.isfile(summary_file):
        logger.info(
            'roary has run and the input has not changed, skip roarying')
        return pangenome_out

    if not os.path.isdir(scratch_dir):
        os.makedirs(scratch_dir)

    roary_inputs = []
    for entry in os.listdir(gff_folder):
        source_path = os.path.join(gff_folder, entry)
        if not entry.endswith('.gz'):
            roary_inputs.append(source_path)
            continue
        unpacked = os.path.join(scratch_dir,
                                entry.replace('.gff.gz', '') + '.gff')
        if run_command('gunzip -c {} > {}'.format(source_path,
                                                  unpacked)) != 0:
            raise Exception('Cannot get {}'.format(source_path))
        roary_inputs.append(unpacked)

    # Make sure the directory is not there or roary will add timestamp
    if os.path.isfile(pangenome_out):
        os.remove(pangenome_out)
    if os.path.exists(pangenome_out):
        shutil.rmtree(pangenome_out)

    roary_cmd = 'roary -p {} -f {} -v '.format(
        threads, pangenome_out) + ' '.join(roary_inputs)
    if run_command(roary_cmd, timing_log) != 0:
        raise Exception('roary fail to run!')

    gzip_cmd = 'gzip ' + os.path.join(pangenome_out,
                                      'gene_presence_absence.csv')
    if run_command(gzip_cmd) != 0:
        raise Exception('Error running {}'.format(gzip_cmd))

    shutil.rmtree(scratch_dir)

    return pangenome_out
Exemplo n.º 4
0
def qc_reads(prefix_name,
             reads,
             base_dir='.',
             threads=0,
             timing_log=None,
             **kargs):
    """
        Run fastqc on the supplied reads, then multiqc on the fastqc folder
        :param prefix_name: sample prefix used to name the output folders
        :param reads: mapping looked up for 'pe1' and 'pe2', else 'se',
            else 'long-read'; fastqc is skipped when none is present
        :param base_dir: working directory
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :param timing_log: passed to run_command
        :return: path to multiqc_fastqc.txt, or None if a command fails
    """
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    fastqc_dir = os.path.join(base_dir, prefix_name + '_fastqc')
    if not os.path.exists(fastqc_dir):
        os.makedirs(fastqc_dir)

    # Pick the read files in the same priority order: paired, single, long.
    if 'pe1' in reads and 'pe2' in reads:
        fastqc_inputs = '{pe1} {pe2}'.format(pe1=reads['pe1'],
                                             pe2=reads['pe2'])
    elif 'se' in reads:
        fastqc_inputs = '{reads}'.format(reads=reads['se'])
    elif 'long-read' in reads:
        fastqc_inputs = '{reads}'.format(reads=reads['long-read'])
    else:
        fastqc_inputs = None

    if fastqc_inputs is not None:
        fastqc_cmd = 'fastqc -t {threads} -o {outdir} {reads}'.format(
            threads=n_threads, outdir=fastqc_dir, reads=fastqc_inputs)
        if run_command(fastqc_cmd, timing_log) != 0:
            return None

    multiqc_dir = os.path.join(base_dir, prefix_name + '_multiqc')
    if not os.path.exists(multiqc_dir):
        os.makedirs(multiqc_dir)

    multiqc_cmd = 'multiqc -o {outdir} {indir}'.format(outdir=multiqc_dir,
                                                       indir=fastqc_dir)
    if run_command(multiqc_cmd, timing_log) != 0:
        return None

    return os.path.join(multiqc_dir, 'multiqc_data', 'multiqc_fastqc.txt')
Exemplo n.º 5
0
def detect_virulome(prefix_name,assembly, base_dir='.', threads=0, timing_log=None):
    """
    Run element_finder.search_virulome on an assembly

    Parameters
    ----------
    prefix_name:
        name to attach to output
    assembly: str
        input sequence; a name ending in '.gz' is unpacked before searching
    base_dir: str
        working directory in which the output folder is created
    threads: int
        number of threads to use, 0 selects NUM_CORES_DEFAULT
    timing_log: str
        log file (accepted but not used by this function)
    Returns
    -------
        path to virulent gene file
    """

    #TODO: to include overwrite
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    path_out = os.path.join(base_dir,  prefix_name+'_element_finder' )
    if not os.path.exists(path_out):
        os.makedirs(path_out)

    # An existing result file is reused as-is.
    vir_out = os.path.join(path_out, prefix_name + '_virulome.tsv')
    if os.path.isfile(vir_out):
        return vir_out

    temp_fasta = os.path.join(path_out, prefix_name + '.fasta')
    gunzip_fna = assembly
    if assembly.endswith('.gz'):
        #FIXME: review this step
        gunzip_fna = temp_fasta
        cmd = 'gunzip -c {} > {}'.format(assembly, gunzip_fna)
        run_command(cmd)
    element_finder.search_virulome(sample=gunzip_fna,output=vir_out,threads=threads)

    # Remove the unpacked copy only when it is actually there. The previous
    # inverted test tried to delete a missing file (FileNotFoundError) and
    # left the temporary file behind for gzipped input.
    if os.path.exists(temp_fasta):
        os.remove(temp_fasta)
    return vir_out
Exemplo n.º 6
0
def detect_insertion_sequence(prefix_name, assembly, base_dir='.', threads=0):
    """
        Run isescan.py on an assembly
        :param prefix_name: sample prefix used to name the output folder and files
        :param assembly: input sequence file; a name ending in '.gz' is unpacked first
        :param base_dir: working directory
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: path to an existing '<prefix>_is.tsv' if one is found;
            otherwise the last '.raw' file found under the output folder
            (None if there is none, or if isescan.py fails)
    """
    # TODO: include overwrite mode
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    work_dir = os.path.join(base_dir, prefix_name + "_isescan")
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    cached_table = os.path.join(work_dir, prefix_name + '_is.tsv')
    if os.path.isfile(cached_table):
        return cached_table

    scratch_fasta = os.path.join(work_dir, prefix_name + '.fasta')
    if assembly.endswith('.gz'):
        # FIXME: this step is not needed
        run_command('gunzip -c {} > {}'.format(assembly, scratch_fasta))
        seq_input = scratch_fasta
    else:
        seq_input = assembly

    isescan_cmd = 'isescan.py --nthread {threads} --seqfile {asm} --output {output}  '.format(
        threads=n_threads, asm=seq_input, output=work_dir)
    if run_command(isescan_cmd) != 0:
        return None

    # Collect every .raw file bottom-up; the last one visited is reported.
    raw_files = [
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(work_dir, topdown=False)
        for fname in fnames
        if fname.endswith('.raw')
    ]
    isout = raw_files[-1] if raw_files else None

    if os.path.exists(scratch_fasta):
        os.remove(scratch_fasta)
    return isout
Exemplo n.º 7
0
def assembly_eval(prefix_name,
                  assembly,
                  base_dir='.',
                  threads=0,
                  timing_log=None,
                  **kargs):
    """
        Run quast.py on an assembly
        :param prefix_name: sample prefix used to name the output folder
        :param assembly: path to the assembly passed to quast.py
        :param base_dir: working directory
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :param timing_log: passed to run_command
        :return: path to report.tsv, or None if quast.py fails
    """
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    quast_dir = os.path.join(base_dir, prefix_name + '_quast')
    if not os.path.exists(quast_dir):
        os.makedirs(quast_dir)

    quast_cmd = 'quast.py -t {threads} -o {outdir} {input}'.format(
        threads=n_threads, outdir=quast_dir, input=assembly)
    if run_command(quast_cmd, timing_log) != 0:
        return None

    return os.path.join(quast_dir, 'report.tsv')
Exemplo n.º 8
0
def species_identification_kraken(prefix_name,
                                  assembly,
                                  db='db/kraken2/k2std',
                                  base_dir='.',
                                  timing_log=None,
                                  threads=0):
    """
        Run kraken2 on an assembly and write its report
        :param prefix_name: sample prefix used to name the output folder and report
        :param assembly: path to the sequence file passed to kraken2
        :param db: value passed to kraken2's --db option
        :param base_dir: working directory
        :param timing_log: passed to run_command
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: path to the kraken2 report, or None if the command fails
    """
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    kraken_dir = os.path.join(base_dir, prefix_name + '_kraken2')
    report_file = os.path.join(kraken_dir, prefix_name + '_kraken2.tsv')
    if not os.path.exists(kraken_dir):
        os.makedirs(kraken_dir)

    kraken_cmd = 'kraken2 --db {db} --use-names --threads {threads} --report {report} {asm}'.format(
        db=db, threads=n_threads, report=report_file, asm=assembly)
    # The command is wrapped so that it is executed by bash.
    wrapped_cmd = "bash -c '{}'".format(kraken_cmd)

    if run_command(wrapped_cmd, timing_log) != 0:
        return None
    return report_file
Exemplo n.º 9
0
def run_protein_alignment(roary_folder, collection_dir, threads=8, overwrite=False, timing_log=None):
    """
    Align the protein sequences of each gene cluster with mafft

    One mafft command per eligible gene is written to a command file, and the
    file is then executed with `parallel`.

    Parameters
    ----------
    roary_folder: str
        folder containing gene_presence_absence.Rtab
    collection_dir: str
        working directory of the collection
    threads: int
        number of jobs `parallel` runs at once (each mafft uses 1 thread)
    overwrite: bool
        whether to redo alignments whose output file already exists
    timing_log: str
        file to log timing
    Returns
    -------
        path to the alignments directory
    """
    alignment_dir = os.path.join(collection_dir, 'alignments')
    # The command file is written inside alignment_dir, so it must exist.
    os.makedirs(alignment_dir, exist_ok=True)

    gene_cluster_file =roary_folder + '/gene_presence_absence.Rtab'
    gene_df = pd.read_csv(gene_cluster_file, sep='\t', index_col='Gene')
    gene_df.fillna('', inplace=True)

    cmds_file = os.path.join(alignment_dir,"align_cmds")
    with open(cmds_file,'w') as cmds:
        for gene_id, row in gene_df.iterrows():
            # Only align if there are at least 2 sequences
            if row.sum() < 2:
                continue

            # Strip non-word characters so the id is usable as a file name.
            gene_id = re.sub(r'\W+', '', gene_id)
            gene_dir = os.path.join(alignment_dir, gene_id)

            # check if done before
            gene_aln_file = os.path.join(gene_dir, gene_id + '.faa.aln.gz')
            if (not overwrite) and os.path.isfile(gene_aln_file):
                continue

            gene_seq_file = os.path.join(gene_dir, gene_id + '.faa')
            if not os.path.isfile(gene_seq_file):
                # Report the file that is actually missing (the input
                # sequences, not the alignment that is about to be made).
                logger.info('{} does not exist'.format(gene_seq_file))
                continue

            cmd = f"mafft --auto --quiet --thread 1 {gene_seq_file} | gzip > {gene_aln_file}"
            cmds.write(cmd + '\n')

    cmd = f"parallel --bar -j {threads} -a {cmds_file}"
    ret = run_command(cmd, timing_log)
    if ret != 0:
        # Best effort is kept (no exception), but the failure is made visible.
        logger.warning('{} returned non-zero ({})'.format(cmd, ret))
    return alignment_dir
Exemplo n.º 10
0
def run_species_phylogeny_iqtree(roary_folder,
                                 collection_dir,
                                 threads=8,
                                 overwrite=False,
                                 timing_log=None):
    """
    Run iqtree to create phylogeny tree from core gene alignment. An existing
    tree file is kept unless overwrite is set to True
    Parameters
    ----------
    roary_folder: str
        roary output folder; its core_gene_alignment.aln.gz is used when the
        phylogeny folder does not already contain one
    collection_dir: str
        working directory of the collection
    threads: int
        number of threads to use
    overwrite: bool
        whether to overwrite existing result
    timing_log: str
        file to log timing
    Returns
    -------
        path to the phylogeny folder
    Raises
    ------
    Exception
        if iqtree returns a non-zero exit code
    """
    phylogeny_folder = os.path.join(collection_dir, 'phylogeny')
    if not os.path.exists(phylogeny_folder):
        os.makedirs(phylogeny_folder)

    phylogeny_file = os.path.join(phylogeny_folder,
                                  'core_gene_alignment.treefile')
    if os.path.isfile(phylogeny_file) and (not overwrite):
        logger.info(
            'phylogeny tree exists and input has not changed, skip phylogeny analysis'
        )
        return phylogeny_folder

    aln_file = os.path.join(phylogeny_folder, 'core_gene_alignment.aln.gz')
    if not os.path.isfile(aln_file):
        # Fall back to the alignment in the roary folder. The previous code
        # read it from an undefined `report` object and raised NameError.
        aln_file = os.path.join(roary_folder, 'core_gene_alignment.aln.gz')
    cmd = 'iqtree -s {alignment} --prefix {prefix} -B 1000 -T {threads} -czb -keep-ident'.format(
        alignment=aln_file,
        prefix=phylogeny_folder + '/core_gene_alignment',
        threads=threads)
    ret = run_command(cmd, timing_log)
    if ret != 0:
        raise Exception(
            'iqtree fail to create phylogeny tree from core gene alignment!')

    return phylogeny_folder
Exemplo n.º 11
0
def detect_amr_abricate(prefix_name, assembly, base_dir='.', threads=8, overwrite=False, timing_log=None):
    """
    Run abricate against several databases and merge the hits into one table

    Parameters
    ----------
    prefix_name: str
        name to attach to the output folder and files
    assembly: str
        input sequence file passed to abricate
    base_dir: str
        working directory in which the output folder is created
    threads: int
        number of threads to use
    overwrite:bool
        whether to overwrite the existing result
    timing_log: str
        log file
    Returns
    -------
        path to resistant gene file
    Raises
    ------
    Exception
        if abricate fails for every database
    """
    out_dir = os.path.join(base_dir, prefix_name + '_abricate')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # TODO: replace by consensus db later
    amr_out = os.path.join(out_dir, prefix_name + '_resistome.tsv')
    if not overwrite and os.path.isfile(amr_out):
        logger.info('Resistome for {} exists, skip analysis'.format(prefix_name))
        return amr_out

    # Keep only the tables of the databases for which abricate succeeded.
    per_db_tables = []
    for db_name in ('ncbi', 'megares', 'ecoh', 'argannot', 'card', 'resfinder'):
        db_table = os.path.join(out_dir, prefix_name + '_' + db_name + '.tsv')
        abricate_cmd = 'abricate --quiet --threads {threads} --nopath --db {db} {infile} > {outfile}'.format(
            threads=threads,
            db=db_name,
            infile=assembly,
            outfile=db_table)
        if run_command(abricate_cmd, timing_log) == 0:
            per_db_tables.append(db_table)

    # No table at all means every database run failed.
    if not per_db_tables:
        raise Exception('Error running amr')

    merged = pd.concat([pd.read_csv(table, sep='\t') for table in per_db_tables])
    merged = merged.sort_values(['SEQUENCE', 'START'], ascending=[True, True])
    merged.to_csv(amr_out, index=False, sep='\t', encoding='utf-8-sig')
    return amr_out
Exemplo n.º 12
0
def detect_mlst(prefix_name,
                assembly,
                base_dir='.',
                timing_log=None,
                threads=0):
    """
        Run mlst.find_mlst on an assembly and write the result as one TSV line
        :param prefix_name: sample prefix used to name the output folder and files
        :param assembly: input sequence file; a name ending in '.gz' is unpacked first
        :param base_dir: working directory
        :param timing_log: accepted but not used by this function
        :param threads: 0 is replaced by NUM_CORES_DEFAULT; the value is not
            passed on to mlst.find_mlst
        :return: path to the MLST result file
    """
    # TODO: include overwrite
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    work_dir = os.path.join(base_dir, prefix_name + '_mlst')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    result_file = os.path.join(work_dir, prefix_name + '_mlst.tsv')
    if os.path.isfile(result_file):
        return result_file

    scratch_fasta = os.path.join(work_dir, prefix_name + '.fasta')
    if assembly.endswith('.gz'):
        # FIXME: check if this is needed
        run_command('gunzip -c {} > {}'.format(assembly, scratch_fasta))
        query = scratch_fasta
    else:
        query = assembly

    typing = mlst.find_mlst(query)

    # Single line: file, scheme, sequence type, then one column per profile entry.
    with open(result_file, 'w') as handle:
        record = "%s\t%s\t%s" % (typing['file'], typing['scheme'], typing['st'])
        record += ''.join("\t%s" % gene for gene in typing['profile'])
        handle.write(record + "\n")

    if os.path.exists(scratch_fasta):
        os.remove(scratch_fasta)

    return result_file
Exemplo n.º 13
0
def detect_pmlst(prefix_name, assembly, base_dir='.', threads=0):
    """
        Run mlst.find_mlst with the pmlst databases and write one TSV line
        :param prefix_name: sample prefix used to name the output folder and files
        :param assembly: input sequence file; a name ending in '.gz' is unpacked first
        :param base_dir: working directory
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: path to the pMLST result file
    """
    n_threads = NUM_CORES_DEFAULT if threads == 0 else threads

    work_dir = os.path.join(base_dir, prefix_name + '_pmlst')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    result_file = os.path.join(work_dir, prefix_name + '_pmlst.tsv')
    if os.path.isfile(result_file):
        return result_file

    scratch_fasta = os.path.join(work_dir, prefix_name + '.fasta')
    if assembly.endswith('.gz'):
        # FIXME: review this
        run_command('gunzip -c {} > {}'.format(assembly, scratch_fasta))
        query = scratch_fasta
    else:
        query = assembly

    typing = mlst.find_mlst(query_file=query,
                            blastdb='db/pmlst/blast/pmlst.fa',
                            mlstdb='db/pmlst/pubmlst',
                            num_threads=n_threads)

    # Single line: file, scheme, sequence type, then one column per profile entry.
    with open(result_file, 'w') as handle:
        record = "%s\t%s\t%s" % (typing['file'], typing['scheme'], typing['st'])
        record += ''.join("\t%s" % gene for gene in typing['profile'])
        handle.write(record + "\n")

    if os.path.exists(scratch_fasta):
        os.remove(scratch_fasta)
    return result_file
Exemplo n.º 14
0
def map_reads_to_assembly_bwamem(prefix_name,
                                 assembly,
                                 reads,
                                 base_dir='.',
                                 threads=0,
                                 memory=50,
                                 timing_log=None,
                                 **kargs):
    """
        Map reads to an assembly with bwa mem and produce a sorted, indexed BAM
        :param prefix_name: sample prefix used to name the SAM/BAM files
        :param assembly: path to the assembly; it is indexed with 'bwa index'
            when '<assembly>.sa' does not exist
        :param reads: mapping looked up for 'pe1' and 'pe2', else 'se'
        :param base_dir: accepted but not used; output goes next to the assembly
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :param memory: accepted but not used by this function
        :param timing_log: passed to run_command
        :return: path to the BAM file, or None if no supported read entry is
            present or any command fails
    """
    # Same convention as the other steps: 0 means "use the default".
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    if not os.path.isfile(assembly + '.sa'):
        if run_command('bwa index ' + assembly, timing_log) != 0:
            return None

    if 'pe1' in reads and 'pe2' in reads:
        suffix = '_pe'
        read_files = reads['pe1'] + ' ' + reads['pe2']
    elif 'se' in reads:
        suffix = '_se'
        read_files = reads['se']
    else:
        return None

    path_out = os.path.dirname(assembly)
    sam_file = os.path.join(path_out, prefix_name + suffix + '.sam')
    bam_file = os.path.join(path_out, prefix_name + suffix + '.bam')

    cmd_bwa = 'bwa mem -t {threads} {index} {reads} > {sam}'.format(
        threads=threads, index=assembly, reads=read_files, sam=sam_file)
    if run_command(cmd_bwa, timing_log) != 0:
        return None

    # '&&' instead of ';' so that a failed sort is not hidden by the exit
    # status of the following 'samtools index'.
    cmd_samtools = 'samtools view -u {sam} | samtools sort -@{threads} -o {bam} - && samtools index {bam}'.format(
        sam=sam_file, threads=threads, bam=bam_file)
    if run_command(cmd_samtools, timing_log) != 0:
        return None

    return bam_file
Exemplo n.º 15
0
def detect_amr_amrfinder(prefix_name,faa_file,fna_file,gff_file,genus=None,species=None,  base_dir='.', db='db/amrfinderplus/data/latest', timing_log=None, threads=0):
    """
        Run amrfinder and split its report into resistome, point-mutation and
        virulome tables.
        :param prefix_name: sample prefix used to name the output folder and files
        :param faa_file: .faa input; a name ending in '.gz' is unpacked first
        :param fna_file: .fna input; a name ending in '.gz' is unpacked first
        :param gff_file: gzip-compressed GFF, or None to skip GFF preparation
        :param genus: genus name, used for the -O option when supported
        :param species: species name; when non-empty it replaces the genus
            as the organism candidate (spaces become underscores)
        :param base_dir: working directory
        :param db: value passed to amrfinder's -d option
        :param timing_log: passed to run_command for the amrfinder call
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: tuple (amr_out, point_out, virulen_out) of output paths,
            or None if amrfinder fails
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    path_out = os.path.join(base_dir,   prefix_name+'_amrfinder')
    if not os.path.exists(path_out):
        os.makedirs(path_out)
    ret_out = os.path.join(path_out, prefix_name + '_amr.tsv')
    amr_out = os.path.join(path_out, prefix_name+ '_resistome.tsv')
    virulen_out = os.path.join(path_out, prefix_name + '_virulome.tsv')
    point_out = os.path.join(path_out, prefix_name + '_point.tsv')

    # Inputs are prepared in a scratch folder that is removed afterwards.
    temp_dir = os.path.join(base_dir, 'amr_temp')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    # Add Name property to column 9 of the gff file (AMRfinder needs it) and
    # drop the ##FASTA section.
    temp_gff_file = os.path.join(temp_dir, prefix_name + '.gff')
    if gff_file is not None:
        with gzip.open(gff_file, 'rt') as source, \
                open(temp_gff_file, 'w') as destination:
            for line in source:
                if line.startswith('##FASTA'):
                    break
                if line.startswith('##'):
                    destination.write(line)
                else:
                    destination.write(line.replace('ID=', 'Name='))

    gunzip_faa = faa_file
    if faa_file.endswith('.gz'):
        gunzip_faa = os.path.join(temp_dir, prefix_name + '.faa')
        run_command('gunzip -c {} > {}'.format(faa_file, gunzip_faa))
    gunzip_fna = fna_file
    if fna_file.endswith('.gz'):
        gunzip_fna = os.path.join(temp_dir, prefix_name + '.fna')
        run_command('gunzip -c {} > {}'.format(fna_file, gunzip_fna))

    # The -O option is only added for organisms in this list.
    # NOTE(review): when species is given, the genus is no longer considered,
    # so genus-level entries such as 'Escherichia' never match - confirm this
    # is intended.
    organisms = ['Campylobacter', 'Enterococcus_faecalis', 'Enterococcus_faecium', 'Escherichia', 'Klebsiella', 'Salmonella', 'Staphylococcus_aureus', 'Staphylococcus_pseudintermedius', 'Vibrio_cholerae']
    organism_option = ''
    if genus is not None:
        organism = genus.capitalize()
        if species is not None and species != '':
            organism = species.replace(' ', '_')
        if organism in organisms:
            organism_option = ' -O {}'.format(organism)

    cmd = 'amrfinder -d {database} -p {faa_file}{organism_option} -n {fna_file} -g {gff_file} --plus --threads {threads} -o {outfile}'\
    .format(
        database=db,
        faa_file=gunzip_faa,
        organism_option=organism_option,
        fna_file=gunzip_fna,
        gff_file=temp_gff_file,
        threads=threads,
        outfile=ret_out
    )
    cmd = "bash -c '{}'".format(cmd)
    ret = run_command(cmd, timing_log)

    # Clean up the scratch folder whether or not amrfinder succeeded.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    if ret != 0:
        return None

    # Split the report by element type.
    virulen = []
    amr = []
    point = []
    with open(ret_out) as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        # Take the header from the file itself so that the output tables
        # keep their column names even when the report has no rows.
        header = reader.fieldnames or []
        for row in reader:
            if row['Element type'] == 'VIRULENCE':
                virulen.append(row)
            elif row['Element subtype'] == 'POINT':
                point.append(row)
            else:
                amr.append(row)

    for out_path, rows in ((amr_out, amr), (point_out, point), (virulen_out, virulen)):
        with open(out_path, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=header, delimiter='\t')
            writer.writeheader()
            writer.writerows(rows)

    # The combined report is no longer needed once it has been split.
    if os.path.exists(ret_out):
        os.remove(ret_out)

    return amr_out,point_out,virulen_out
Exemplo n.º 16
0
def annotate_prokka(prefix_name,
                    assembly,
                    genus=None,
                    species=None,
                    strain=None,
                    gram=None,
                    base_dir='.',
                    overwrite=False,
                    timing_log=None,
                    threads=0):
    """
        Run annotation process using prokka
        :param prefix_name: sample prefix, used as prokka --prefix and --locus
        :param assembly: input sequence file; a name ending in '.gz' is unpacked first
        :param genus: passed as --genus when non-empty
        :param species: passed as --species when non-empty (spaces become underscores)
        :param strain: passed as --strain when non-empty
        :param gram: passed as --gram when non-empty
        :param base_dir: working directory
        :param overwrite: rerun even if the gff and gbk outputs already exist
        :param timing_log: passed to run_command for the prokka call
        :param threads: number of threads, 0 selects NUM_CORES_DEFAULT
        :return: tuple of paths (gff, faa, ffn, fna, gbk), all '.gz'
        :raises Exception: if prokka returns a non-zero exit code
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    path_out = os.path.join(base_dir, prefix_name + '_prokka')
    if not os.path.exists(path_out):
        os.makedirs(path_out)

    annotation_gbk = os.path.join(path_out, prefix_name + '.gbk.gz')
    annotation_gff = path_out + '/' + str(prefix_name) + '.gff.gz'
    annotation_faa = path_out + '/' + str(prefix_name) + '.faa.gz'
    annotation_ffn = path_out + '/' + str(prefix_name) + '.ffn.gz'
    annotation_fna = path_out + '/' + str(prefix_name) + '.fna.gz'
    if os.path.isfile(annotation_gff) and os.path.isfile(annotation_gbk) and (
            not overwrite):
        # Dont run again if gff/gbk file exists
        logger.info('GFF and GBK files found, skip annotating')
        return annotation_gff, annotation_faa, annotation_ffn, annotation_fna, annotation_gbk
    gunzip_fasta = assembly
    if assembly.endswith('.gz'):
        # Unpacked with a '.fin' suffix so the cleanup below removes it.
        gunzip_fasta = os.path.join(path_out, prefix_name + '.fin')
        cmd = 'gunzip -c {} > {}'.format(assembly, gunzip_fasta)
        run_command(cmd)
    cmd = 'prokka --force --cpus {threads} --addgenes --mincontiglen 200'.format(
        threads=threads)
    cmd += ' --prefix {sample_id} --locus {sample_id} --outdir {path} '.format(
        sample_id=prefix_name, path=path_out)
    if genus:
        cmd += ' --genus ' + genus
    if species:
        species = species.replace(' ', '_')
        cmd += ' --species ' + species
    if strain:
        cmd += ' --strain ' + strain
    if gram:
        cmd += ' --gram ' + gram
    cmd += ' ' + gunzip_fasta
    cmd = "bash -c '{}'".format(cmd)
    ret = run_command(cmd, timing_log)
    if ret != 0:
        # The return code now has its own placeholder in the message.
        raise Exception('Command {} returns non-zero ({})!'.format(cmd, ret))

    # Keep only the five annotation files, gzip-compressed. Everything else
    # (including stale '.gz' files from an earlier run) is removed first, so
    # that gzip never meets an existing target and a freshly compressed file
    # is never deleted, whatever order glob returns the entries in.
    kept_ext = ('gff', 'gbk', 'ffn', 'faa', 'fna')
    produced = glob.glob(os.path.join(path_out, '*'))
    to_compress = [f for f in produced if f[-3:] in kept_ext]
    for file_name in produced:
        if file_name[-3:] not in kept_ext:
            os.remove(file_name)
    for file_name in to_compress:
        run_command('gzip {}'.format(file_name))

    return annotation_gff, annotation_faa, annotation_ffn, annotation_fna, annotation_gbk
Exemplo n.º 17
0
def run_gene_phylogeny_iqtree(roary_folder,
                              collection_dir,
                              threads=8,
                              overwrite=False,
                              timing_log=None):
    """
    Build one phylogenetic tree per gene cluster with iqtree.

    One shell command per gene cluster is written to
    ``<collection_dir>/alignments/phylo_cmds`` and the whole file is then
    executed with GNU ``parallel``. A cluster is skipped when its row in the
    presence/absence table sums to less than 3, when its ``.treefile``
    already exists (unless ``overwrite`` is True), or when no alignment file
    can be found for it.

    Parameters
    ----------
    roary_folder: str
        folder holding ``gene_presence_absence.Rtab`` and the
        ``pan_genome_sequences`` alignments
    collection_dir: str
        working directory of the collection
    threads: int
        number of jobs ``parallel`` runs at the same time (each iqtree job
        itself is started with ``-T 1``)
    overwrite: bool
        whether to rebuild trees whose ``.treefile`` already exists
    timing_log: str
        file to log timing
    Returns
    -------
    str
        path to the alignments directory
    """
    alignment_dir = os.path.join(collection_dir, 'alignments')
    # The command file is written inside this directory, so it has to exist
    # before it is opened below.
    os.makedirs(alignment_dir, exist_ok=True)

    gene_cluster_file = os.path.join(roary_folder,
                                     'gene_presence_absence.Rtab')
    gene_df = pd.read_csv(gene_cluster_file, sep='\t', index_col='Gene')
    # Treat a missing cell as "absent"; filling with '' would make the
    # row sum below fail on mixed int/str values.
    gene_df = gene_df.fillna(0)

    cmds_file = os.path.join(alignment_dir, "phylo_cmds")
    with open(cmds_file, 'w') as cmds:
        for gene_id, row in gene_df.iterrows():
            # Only analyse if there are at least 3 genes
            if row.sum() < 3:
                continue

            # Strip every non-word character so the id is safe to use as a
            # directory/file name and inside the shell command line.
            gene_id = re.sub(r'\W+', '', str(gene_id))
            gene_dir = os.path.join(alignment_dir, gene_id)
            os.makedirs(gene_dir, exist_ok=True)

            # check if done before
            iqtree_output = os.path.join(gene_dir, gene_id + '.treefile')
            if (not overwrite) and os.path.isfile(iqtree_output):
                continue

            gene_aln_file_roary = os.path.join(roary_folder,
                                               'pan_genome_sequences',
                                               gene_id + '.fa.aln')
            gene_aln_file = os.path.join(gene_dir, gene_id + '.fna.aln.gz')
            # NOTE(review): the roary alignment is moved, not compressed,
            # although the target name ends in '.gz' - confirm that this is
            # what downstream steps expect.
            if os.path.isfile(gene_aln_file_roary):
                shutil.move(gene_aln_file_roary, gene_aln_file)
            if not os.path.isfile(gene_aln_file):
                logger.info('{} does not exist'.format(gene_aln_file))
                continue

            prefix = gene_dir + '/' + gene_id
            # First try with 1000 bootstrap replicates (-B 1000); if that
            # run fails, fall back to a plain run without bootstrap.
            cmd = f"iqtree -s {gene_aln_file} --prefix {prefix} -m GTR -quiet -T 1 -B 1000 2> /dev/null"
            cmd += f" || iqtree -s {gene_aln_file} --prefix {prefix} -m GTR -quiet -T 1"
            cmds.write(cmd + '\n')

    cmd = f"parallel --bar -j {threads} -a {cmds_file}"
    ret = run_command(cmd, timing_log)
    if ret != 0:
        # Best effort: keep whatever trees were produced, but do not hide
        # the failure from the operator.
        logger.warning(
            'Command {} returned non-zero ({}); some gene trees may be missing'
            .format(cmd, ret))

    return alignment_dir
Exemplo n.º 18
0
def run_phylogeny_parsnp(base_dir, ref_genome, genome_dir='.', threads=0,
                         timing_log=None):
    """
        Run parsnp to create phylogeny tree
        :param base_dir: working directory; results are written to
            <base_dir>/pangenome/phylogeny
        :param ref_genome: path to reference genome, if equal None, the first
            genome (in sorted file name order) in genome directory is chosen
            to be reference.
        :param genome_dir: directory holding the genome files; files ending
            in '.gz' are decompressed, all others are copied as they are
        :param threads: number of core CPU, passed to parsnp as -p
        :param timing_log: file to log timing of the parsnp run
        :return: path to the phylogeny folder
        :raises Exception: if no genome is available or parsnp returns
            non-zero
    """
    phylogeny_folder = os.path.join(base_dir, 'pangenome/phylogeny')
    if os.path.exists(phylogeny_folder):
        # An existing folder is treated as the result of an earlier run.
        # NOTE(review): a run that failed half-way also leaves this folder
        # behind, so later calls return it without re-running parsnp.
        return phylogeny_folder
    os.makedirs(phylogeny_folder)

    temp_folder = os.path.join(phylogeny_folder, 'temp_phylo')
    if not os.path.isdir(temp_folder):
        os.makedirs(temp_folder)

    # Stage one uncompressed copy of every genome in the temp folder.
    sample_list = []
    # Sorted so that the automatically picked reference is reproducible.
    for f in sorted(os.listdir(genome_dir)):
        genome_file = os.path.join(genome_dir, f)
        if not os.path.isfile(genome_file):
            continue
        if f.endswith('.gz'):
            # Drop the '.gz' suffix: the staged file is plain text.
            fasta_file = os.path.join(temp_folder, f[:-3])
            cmd = 'gunzip -c {} > {}'.format(genome_file, fasta_file)
            run_command(cmd)
        else:
            fasta_file = os.path.join(temp_folder, f)
            shutil.copyfile(genome_file, fasta_file)

        if ref_genome is None:
            # No reference given: the first genome becomes the reference
            # and is therefore not part of the sample list.
            ref_genome = fasta_file
        else:
            sample_list.append(fasta_file)

    if ref_genome is None:
        raise Exception(
            'Error running parsnp: no genome found in {}'.format(genome_dir))

    myCmd = 'parsnp -r {} -d {} -o {} -p {}'.format(ref_genome,
                                                    ' '.join(sample_list),
                                                    phylogeny_folder, threads)
    print(myCmd)
    ret = run_command(myCmd, timing_log)
    if ret != 0:
        raise Exception('Error running parsnp')
    run_command('gzip {}'.format(os.path.join(phylogeny_folder,
                                              'parsnp.xmfa')))
    run_command('gzip {}'.format(os.path.join(phylogeny_folder, 'parsnp.ggr')))
    shutil.rmtree(temp_folder)

    return phylogeny_folder
Exemplo n.º 19
0
def run_alignment_by_parsnp(roary_folder,ffn_dir,base_dir, overwrite=False,  timing_log=None,threads=0):
    """
        Run aligment process to create both multi-alignment  and phylogeny tree for each gene in gene clusters
        :param roary_folder: folder holding gene_presence_absence.csv.gz
        :param ffn_dir: path to folder of .ffn (output of prokka); it is
            searched recursively for '.ffn.gz' files
        :param base_dir: working directory; results go to <base_dir>/alignments
        :param overwrite: if False and the alignments folder already exists,
            it is returned without doing any work
        :param timing_log: accepted for interface compatibility; not used here
        :param threads: number of core CPU, passed to parsnp as -p
        :return: path to the alignments folder
    """
    alignment_dir = os.path.join(base_dir, 'alignments')
    # Check for an earlier result first, so that a cache hit does not pay
    # for parsing every .ffn.gz file.
    if (not overwrite) and os.path.exists(alignment_dir):
        return alignment_dir
    if not os.path.exists(alignment_dir):
        os.makedirs(alignment_dir)

    # Index every CDS record of every sample by its sequence id.
    dict_cds = {}
    for root, _dirs, files in os.walk(ffn_dir):
        for _file in files:
            if _file.endswith('.ffn.gz'):
                with gzip.open(os.path.join(root, _file), 'rt') as fn:
                    for seq in SeqIO.parse(fn, 'fasta'):
                        dict_cds[seq.id] = seq

    gene_cluster_file = roary_folder + '/gene_presence_absence.csv.gz'
    gene_df = pd.read_csv(gene_cluster_file, dtype=str)
    gene_df.fillna('', inplace=True)

    # Columns from index 14 onwards are read as one column per sample.
    sample_columns = list(gene_df.columns)[14:]
    for _, row in gene_df.iterrows():
        gene_id = row['Gene']
        gene_list = []
        for sample_column in sample_columns:
            if row[sample_column]:
                # roary can pool together genes from the same sample and tab-separate them
                gene_list.extend(row[sample_column].split('\t'))
                # TODO: make sure all samples in this gene have not updated
        gene_list.sort()

        # Only analyse if there are at least 3 genes
        if len(gene_list) < 3:
            logger.info('There are too few genes for {} skipping'.format(gene_id))
            continue

        gene_dir = os.path.join(alignment_dir, gene_id)
        # gene_list.json records the gene list of the last successful run;
        # an identical list means there is nothing new to compute.
        gene_list_json = os.path.join(gene_dir, 'gene_list.json')
        if os.path.isfile(gene_list_json):
            with open(gene_list_json) as fn:
                existing_gene_list = json.load(fn)
            if gene_list == existing_gene_list:
                logger.info('Phylogeny for gene {} done, skipping'.format(gene_id))
                continue  # for _, row

        # Write each member sequence to its own fasta file for parsnp.
        gene_file_dir = os.path.join(gene_dir, 'files')
        if not os.path.exists(gene_file_dir):
            os.makedirs(gene_file_dir)

        gene_files = []
        for sample_gene in gene_list:
            gene_file = os.path.join(gene_file_dir, sample_gene + '.fasta')
            SeqIO.write(dict_cds[sample_gene], gene_file, 'fasta')
            gene_files.append(gene_file)

        # Use the first gene as the reference
        cmd = 'parsnp -d {} -r {} -o {} -p {}'.format(
            ' '.join(gene_files[1:]), gene_files[0], gene_dir, threads)
        ret = run_command(cmd)
        if ret == 0:
            with open(gene_list_json, 'w') as fn:
                json.dump(gene_list, fn)
        else:
            # Best effort: carry on with the other genes, but do not mark
            # this one as done so that a later run tries it again.
            logger.warning('parsnp returned non-zero ({}) for gene {}'.format(
                ret, gene_id))

        if os.path.exists(gene_file_dir):
            shutil.rmtree(gene_file_dir)
        #clean up
        run_command('rm -f ' + os.path.join(gene_dir, '*.ini ') + os.path.join(gene_dir, '*block* '))
        shutil.rmtree(os.path.join(gene_dir, 'blocks'), True)
        shutil.rmtree(os.path.join(gene_dir, 'tmp'), True)

    return alignment_dir