Example #1
def trimmo_module(files, path_name, sample_name, threads, Debug, trimmomatic_adapters):
	## 
	## This function generates a Trimmomatic call using the java and trimmomatic executables
	## from the system, requiring a minimum version (specified in config.py).
	## Checks if adapter file exists
	## Returns code from trimmo_call: OK/FAIL
	##
	
	## get exe
	trimmomatic_jar = set_config.get_exe('trimmomatic')
	java_path = set_config.get_exe('java')

	## check if it exists
	if os.path.isfile(trimmomatic_adapters):
		## debug message
		if (Debug):
			print (colored("**DEBUG: trimmomatic_adapters file exists **", 'yellow'))
			print (trimmomatic_adapters)
	else:
		## raise error & exit
		print (colored("***ERROR: Trimmomatic adapters file does not exist: " + trimmomatic_adapters,'red'))
		exit()
	
	## call
	return(trimmo_call(java_path, path_name, sample_name, files, trimmomatic_jar, threads, trimmomatic_adapters, Debug))
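
## A minimal sketch (not the actual trimmo_call implementation) of how the Trimmomatic
## paired-end command can be assembled; output file names and the ILLUMINACLIP/MINLEN
## settings below are assumptions for illustration only.
def sketch_trimmomatic_pe_cmd(java_path, trimmomatic_jar, threads, adapters, r1, r2, out_prefix):
	## Trimmomatic PE expects: R1 R2 paired1 unpaired1 paired2 unpaired2 + trimming steps
	outs = [out_prefix + tag for tag in ('_1P.fastq.gz', '_1U.fastq.gz', '_2P.fastq.gz', '_2U.fastq.gz')]
	return ('%s -jar %s PE -threads %s %s %s %s ILLUMINACLIP:%s:2:30:10 LEADING:3 TRAILING:3 MINLEN:36'
		% (java_path, trimmomatic_jar, threads, r1, r2, " ".join(outs), adapters))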
Example #2
def snippy_core_call(list_folder, options, name, output_dir, output_format, Debug):
	"""
	Create core alignment for samples align to the same reference
	
	ATTENTION: Requires sample names to be different within the first 10 characters.
	
	:param list_folder: List of snippy result folders (one per sample) to include in the core alignment.
	:param options: Additional options for the snippy-core call.
	:param name: Name/prefix for the output files.
	:param output_dir: Absolute path to the output folder.
	:param output_format: Alignment format passed to snippy-core.
	:param Debug: True/False for debugging messages.
	
	:type list_folder: list
	:type options: string
	:type name: string
	:type output_dir: string
	:type output_format: string
	:type Debug: bool
	 
	"""
	
	## create snippy-core call
	snippy_core_exe = set_config.get_exe('snippy_core', Debug)
	
	## start snippy_cmd 
	list_folder_string = " ".join(list_folder)
	log_file = os.path.join(output_dir, "snippy_cmd.log")
	name_outdir = os.path.join(output_dir, name)
	
	## use one reference: must be the same for all comparisons
	reference_fasta = os.path.join(list_folder[1], "ref.fa")
	snippy_core_cmd = '%s -aformat %s --ref %s --prefix %s %s 2> %s' %(snippy_core_exe, output_format, reference_fasta, name_outdir, list_folder_string, log_file)
	
	return (HCGB_sys.system_call(snippy_core_cmd))
Example #3
def main():
	## this code runs when called as a single script

	## control if options provided or help
	if len(sys.argv) > 1:
		print ("")
	else:
		help_options()
		exit()    	
	
	## arguments
	name = sys.argv[1]
	fasta = os.path.abspath(sys.argv[2])
	folder = os.path.abspath(sys.argv[3])
	sample_name = sys.argv[4]
	threads = sys.argv[5]

	files = []
	for i, arg in enumerate(sys.argv):
		if i > 5:
			files.append(os.path.abspath(arg))

	## other
	cutoff=80
	kma_bin = set_config.get_exe("kma")
	out_file = sample_name + ".out_kma-search.txt"

	## check if database is indexed
	if not check_db_indexed(name, folder):
		index_database(fasta, kma_bin, name, 'new', folder, '')
		print ("\n+ Database indexed")
	

	## search files
	kma_ident_call(out_file, files, sample_name, os.path.join(folder, name), kma_bin, '', threads)
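
## A minimal sketch of what an index check such as check_db_indexed() might look for:
## 'kma index -o <prefix>' writes several companion files next to the database prefix
## (the exact suffixes below are an assumption based on common KMA output).
import os

def sketch_check_kma_index(name, folder):
	prefix = os.path.join(folder, name)
	expected = [prefix + suffix for suffix in ('.comp.b', '.length.b', '.name', '.seq.b')]
	return all(os.path.isfile(f) for f in expected)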
Example #4
def ml_tree(folder, name, threads, output, Debug):
    """
    Create a Maximum Likelihood tree reconstruction.
    
    We use IQ-TREE for its versatility and its ability to set parameters automatically. 
    
    :param folder: Snippy-core folder containing results.
    :param name: Name of the analysis.
    :param threads: Number of CPUs to use.
    :param output: Absolute path to the output folder.
    :param Debug: True/False for debugging messages.
    
    :type folder: string 
    :type name: string
    :type threads: integer
    :type output: string
    :type Debug: bool 
    """
    iqtree_exe = set_config.get_exe('iqtree', Debug)
    bootstrap_number = '1000'
    aln_file = os.path.join(folder, name + '.aln')
    output_log = os.path.join(output, 'iqtree.error.log')
    output_files = os.path.join(output, 'iqtree_' + name)

    iqtree_cmd = '%s -s %s -redo --threads-max %s --prefix %s -B %s 2> %s' % (
        iqtree_exe, aln_file, threads, output_files, bootstrap_number,
        output_log)
    code = HCGB_sys.system_call(iqtree_cmd)

    if code == 'OK':
        return ()
    else:
        print("** ERROR: The IQ-TREE call failed; check the log file: %s" % output_log)
        return ()
Example #5
def run_module_SPADES_old(name, folder, file1, file2, threads):

	print ("+ Calling spades assembly for sample...", name)	

	## folder create
	HCGB_files.create_folder(folder)
	
	## get configuration
	SPADES_bin = set_config.get_exe('spades')
	
	## assembly main 
	path_to_contigs = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads)

	## assembly plasmids
	path_to_plasmids = run_SPADES_plasmid_assembly(folder, file1, file2, name, SPADES_bin, threads)
	
	## discard plasmids from main
	(tmp_contigs, tmp_plasmids) = discardPlasmids(path_to_contigs, path_to_plasmids, folder, name)
	
	## rename fasta sequences
	new_contigs = tmp_contigs.split(".fna.tmp")[0] + '.fna'	
	rename_contigs(tmp_contigs, "scaffolds_chr", new_contigs)
	
	new_plasmids=""
	if os.path.isfile(tmp_plasmids):
		new_plasmids = tmp_plasmids.split(".fna.tmp")[0] + '.fna'	
		rename_contigs(tmp_plasmids, "scaffolds_plasmids", new_plasmids)
	
	## contig stats
	stats(new_contigs, new_plasmids)
	
	## success stamps
	filename_stamp = os.path.join(folder, '.success')
	stamp = HCGB_time.print_time_stamp(filename_stamp)
Example #6
def generate_seq_search_call(db,
                             query,
                             outfile,
                             revcomp,
                             start=0,
                             end=-1,
                             format='fasta'):

    ## Sequence Range
    ##   -seq_start     First sequence position to retrieve
    ##   -seq_stop      Last sequence position to retrieve
    ##   -strand        1 = forward DNA strand, 2 = reverse complement
    ##   -revcomp       Shortcut for strand 2

    efetch_bin = set_config.get_exe("efetch")
    cmd = ("%s -db %s -id %s -seq_start %s -seq_stop %s -format %s" %
           (efetch_bin, db, query, start, end, format))

    ## add reverse complement
    if (revcomp):
        cmd = cmd + ' -revcomp'

    ## add output file
    cmd = cmd + ' > %s' % outfile

    return (HCGB_sys.system_call(cmd))
Example #7
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """
	Function that checks and generates annotation.
	
	- It uses Prokka_ via :func:`BacterialTyper.scripts.annotation.prokka_call`.
	
	- It checks if results were previously generated and finished successfully. 
	
	- Once finished, it prints a timestamp. 
	
	:param sequence_fasta: Assembled sequences in fasta file format. 
	:param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
	:param genus: Available genus options for Prokka software. See details above.
	:param path: Absolute path to the output folder to include results.
	:param name: Sample name and tag to include in the annotation report and files.
	:param threads: Number of CPUs to use.
	  
	:type sequence_fasta: string
	:type kingdom: string
	:type genus: string 
	:type path: string 
	:type name: string 
	:type threads: integer 
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.set_config.get_exe`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`HCGB.functions.time_functions.print_time_stamp`
				
		- :func:`BacterialTyper.scripts.annotation.prokka_call`	

	.. include:: ../../links.inc	 	
	"""

    ## check if previously assembled and succeeded
    filename_stamp = path + '/.success'

    if os.path.isdir(path):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(
                colored(
                    "\tA previous command generated results on: %s [%s]" %
                    (stamp, name), 'yellow'))
            return ()

    ## call prokka
    prokka_bin = set_config.get_exe('prokka')
    dirname = prokka_call(prokka_bin, sequence_fasta, kingdom, genus, path,
                          name, threads)

    ## success stamps
    filename_stamp = path + '/.success'
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (dirname)
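
## A minimal, self-contained sketch of the ".success" stamp pattern used above; the real
## HCGB_time.print_time_stamp/read_time_stamp helpers are assumed to behave similarly.
import time

def sketch_print_time_stamp(stamp_file):
    ## write a human-readable timestamp into the hidden stamp file
    stamp = time.strftime("%c")
    with open(stamp_file, 'w') as out_fh:
        out_fh.write(stamp)
    return stamp

def sketch_read_time_stamp(stamp_file):
    ## read the timestamp back to report when a previous run finished
    with open(stamp_file, 'r') as in_fh:
        return in_fh.read().strip()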
Example #8
def BUSCO_plot(outfolder):
	busco_plot_bin = set_config.get_exe('generate_plot')
	
	os.chdir(outfolder)
	#logFile = dataset_name + '.log'
	cmd = '%s -wd %s' %(busco_plot_bin, outfolder)
	HCGB_sys.system_call(cmd)
	return()
Example #9
def run_module_assembly(name, folder, file1, file2, threads, debug=False):
	"""Assembly main module call.
	
	It calls assembly function to process data provided and returns genome statistics. Steps: 
	
	- Retrieves the SPADES_ executable (see details in :func:`BacterialTyper.scripts.set_config.get_exe`) using the minimum version required (see :func:`BacterialTyper.scripts.set_config.min_version_programs` for details).
	
	- It generates a call to the SPADES_ assembler (see :func:`BacterialTyper.scripts.spades_assembler.run_SPADES_assembly`). 
		
	- If the assembly succeeds and a fasta file is generated under the directory provided, contig statistics are generated (:func:`BacterialTyper.scripts.spades_assembler.contig_stats`).
	
	:param name: Sample name or tag to identify sample
	:param folder: Absolute path to folder.
	:param file1: Absolute path to fastq reads (R1).
	:param file2: Absolute path to fastq reads (R2).
	:param threads: Number of CPUs to use.
	:type name: string
	:type folder: string
	:type file1: string
	:type file2: string
	:type threads: integer
	:return: Contig statistics and path to the assembly statistics file.
	:rtype: list : [statistics dictionary, path to the assembly statistics file]
	:warnings: Returns **FAIL** if the assembly process stopped.
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.spades_assembler.run_SPADES_assembly`
	
		- :func:`BacterialTyper.scripts.set_config.min_version_programs`
	
		- :func:`BacterialTyper.scripts.spades_assembler.contig_stats`
	
	.. include:: ../../links.inc	 	
	"""
	
	print ("+ Calling spades assembly for sample...", name)	
	
	## get configuration
	SPADES_bin = set_config.get_exe('spades')
	
	## assembly main 
	path_to_contigs = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads, debug)
	
	if path_to_contigs == 'FAIL':
		return ('FAIL')
	else:
		## contig stats
		#print ('+ Get assembly statistics:...\n')
		(stats_dict, excel_file) = contig_stats(path_to_contigs, debug)
	
		## check statistics in file
		print ("+ Check statistics for sample %s in file:\n%s" %(name, excel_file))
		return([stats_dict, excel_file])
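
## A minimal sketch of the kind of summary contig_stats() reports (total length and N50
## from a fasta file, computed here with Biopython); the real implementation and its
## Excel output are not shown in this listing.
from Bio import SeqIO

def sketch_contig_stats(fasta_file):
	lengths = sorted((len(record.seq) for record in SeqIO.parse(fasta_file, "fasta")), reverse=True)
	total = sum(lengths)
	running = 0
	n50 = 0
	for contig_len in lengths:
		running += contig_len
		if running >= total / 2:
			n50 = contig_len
			break
	return {'contigs': len(lengths), 'total_bp': total, 'N50': n50}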
Example #10
def blastn(outFile, DBname, fasta, threads):
    # blastn plasmids vs contigs
    blastnexe = set_config.get_exe('blastn')
    cmd_blastn = "%s -db %s -query %s -out %s -evalue 1e-20 -outfmt \'6 std qlen slen\' -num_threads %s" % (
        blastnexe, DBname, fasta, outFile, threads)
    codeBlastn = system_call(cmd_blastn)

    if (codeBlastn == 'FAIL'):
        print(
            colored('****ERROR: Some error happened during the blastn command',
                    'red'))
        print(cmd_blastn)
        exit()
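
## A minimal sketch for loading the tabular report written above; the column order follows
## "-outfmt '6 std qlen slen'" (the 12 standard BLAST columns plus qlen and slen).
import pandas as pd

BLASTN_COLUMNS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']

def sketch_load_blastn_table(outFile):
    return pd.read_csv(outFile, sep='\t', header=None, names=BLASTN_COLUMNS)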
Example #11
def makeblastdb(DBname, fasta):
    ## generate blastdb for genome
    makeblastDBexe = set_config.get_exe('makeblastdb')

    if (os.path.isfile(DBname + '.nhr')):
        print("+ BLAST database is already generated...")
    else:
        cmd_makeblast = "%s -in %s -input_type fasta -dbtype %s -out %s" % (
            makeblastDBexe, fasta, 'nucl', DBname)
        code = system_call(cmd_makeblast)

        if (code == 'FAIL'):
            print(
                colored(
                    '****ERROR: Some error happened during the makeblastDB command',
                    'red'))
            print(cmd_makeblast)
            exit()
Example #12
def get_external_kma(kma_external_files, Debug):
    print('\t- Get additional kma databases:')
    ## external sequences provided are indexed and generated in the same folder provided

    option_db = ""
    if (kma_external_files):
        kma_external_files = set(kma_external_files)
        kma_external_files = [os.path.abspath(f) for f in kma_external_files]

        ## check if indexed and/or index if necessary
        external_kma_dbs_list = []

        ## set defaults
        kma_bin = set_config.get_exe("kma")
        for f in kma_external_files:
            file_name = os.path.basename(f)
            fold_name = os.path.dirname(f)
            print(colored('\t\t+ %s' % file_name, 'green'))
            print()

            ## generate db
            databaseKMA = species_identification_KMA.generate_db(
                [f], file_name, fold_name, 'new', 'single', Debug, kma_bin)
            if not databaseKMA:
                print(
                    colored(
                        "***ERROR: Database provided is not indexed: %s\n" %
                        f, 'red'))
            else:
                external_kma_dbs_list.append(databaseKMA)

        external_kma_dbs_string = ','.join(external_kma_dbs_list)
        option_db = "kma_external:" + external_kma_dbs_string

    else:
        ## raise error & exit
        print(
            colored(
                "***ERROR: No database provided via --kma_external_file option.\n",
                'red'))
        exit()

    return (option_db)
Example #13
def print_list_prokka():
    """
	Prints the Prokka_ databases installed and available for use. It is the output of the call: 
	
	.. code-block:: sh

		prokka --listdb
	
	.. include:: ../../devel/results/print_list_prokka.txt
		:literal:
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.set_config.get_exe`
			
	.. include:: ../../links.inc	 	
	"""
    prokka_bin = set_config.get_exe('prokka')
    cmd = prokka_bin + " --listdb"
    HCGB_sys.system_call(cmd)
Example #14
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):

	my_out_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)
	## timestamp
	filename_stamp =  my_out_folder + '/.success'

	print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))
		
	## check previous run
	if os.path.isfile(filename_stamp):
		timestamp = HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tSuccessfully run on date: %s"  %timestamp, 'green'))
	else:
	
		busco_bin = set_config.get_exe('busco')
		os.chdir(output_name)
		
		## init cmd configuration
		cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' %(busco_bin, fasta, threads, mode, busco_db)
		
		## options if autolineage or given dataset
		if "auto-lineage" == dataset_name:
			logFile = 'auto_lineage.log'
			cmd = cmd + '--auto-lineage -o %s > %s' %(dataset_name, logFile)
		else:
			logFile = dataset_name + '.log'
			cmd = cmd + '-l %s -o %s > %s' %(dataset_name, dataset_name, logFile)
		
		## system call
		HCGB_sys.system_call(cmd)
		
		if os.path.isfile(my_out_folder + '/short_summary.txt'):
			## timestamp
			HCGB_time.print_time_stamp(filename_stamp)
		else:
			print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
			return ('FAIL')

	return()
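
## A minimal sketch of pulling the completeness line ("C:...%[S:...%,D:...%],F:...%,M:...%,n:...")
## out of a BUSCO short_summary.txt; the exact layout can vary between BUSCO versions,
## so treat this parsing as an assumption.
import re

def sketch_parse_busco_summary(short_summary_file):
	pattern = r'C:([\d.]+)%\[S:([\d.]+)%,D:([\d.]+)%\],F:([\d.]+)%,M:([\d.]+)%,n:(\d+)'
	with open(short_summary_file) as summary_fh:
		for line in summary_fh:
			match = re.search(pattern, line)
			if match:
				keys = ('Complete', 'Single', 'Duplicated', 'Fragmented', 'Missing', 'Total')
				return dict(zip(keys, match.groups()))
	return {}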
Example #15
def print_available_BUSCO():
	HCGB_aes.print_sepLine("-", 100, False)
	busco_bin = set_config.get_exe('busco')
	
	## get datasets
	busco_bin_call = busco_bin + ' --list-datasets > tmp'
	HCGB_sys.system_call(busco_bin_call, message=False)
	
	## dump in screen
	with open("./tmp", 'r') as f:
		print(f.read())	
	
	## clean
	list_files = HCGB_main.get_fullpath_list("./busco_downloads", False)
	list_files += ['./tmp']
	for i in list_files:
		os.remove(i)
	os.rmdir("./busco_downloads/information")		
	os.rmdir("./busco_downloads/")
		
	HCGB_aes.print_sepLine("-", 100, False)
	print ("\n")
Example #16
def print_dependencies():
    """

	"""

    progs = {}
    dependencies_pd = read_dependencies()
    for prog in dependencies_pd:
        #print (prog)
        prog_exe = set_config.get_exe(prog)
        #print (prog + '\t' + prog_exe)
        prog_ver = get_version(prog, prog_exe)
        progs[prog] = [prog_exe, prog_ver]

    df_programs = pd.DataFrame.from_dict(progs,
                                         orient='index',
                                         columns=('Executable path',
                                                  'Version'))
    df_programs = df_programs.stack().str.lstrip().unstack()
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_columns', None)
    print(df_programs)
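
## get_version() is not shown in this listing; a minimal sketch of one way to probe a
## tool's version string with subprocess. The "--version" flag is an assumption that
## holds for many, but not all, command-line tools.
import subprocess

def sketch_probe_version(prog_exe):
    try:
        proc = subprocess.run([prog_exe, '--version'], capture_output=True, text=True, timeout=30)
        return (proc.stdout or proc.stderr).splitlines()[0].strip()
    except (OSError, subprocess.TimeoutExpired, IndexError):
        return 'n.a.'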
Example #17
def install_R_packages(package, source, install_path, extra):
	
	(install_R, install_github_package) = get_install_R_files()
	
	HCGB_files.create_folder(install_path)
	Rscript_exe = set_config.get_exe('Rscript')
	print("+ Installing %s package..." %package)
	install_file = install_R
	if (source == 'github'):
		install_file = install_github_package
		package = extra + '/' + package
	
	cmd_R = '%s %s -l %s -p %s' %(Rscript_exe, install_file, package, install_path)
	HCGB_sys.system_call(cmd_R)
	
	## check if exists or try to install
	MLSTar_package = os.path.join(install_path, 'MLSTar')
	if os.path.exists(MLSTar_package):
		RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'R', 'R_package.info.txt')
		HCGB_main.printList2file(RDir_package, [install_path])
	else:
		print_error_message(package, "No R package found", 'package')
		print ('Please install manually to proceed...')
Example #18
def install_git_repo(git_repo, folder_sofware, install_path, option, Debug):
	""" """
	
	## current path
	current_path = os.getcwd()
	os.chdir(install_path)
	
	## git clone repo
	print ('+ Using git to get code...')
	git_exe = set_config.get_exe('git', Debug)
	
	if os.path.exists(folder_sofware):
		print ('+ Update repository...')
		## pull
		os.chdir(folder_sofware)
		cmd = git_exe + ' pull'
	else:
		print ('+ Clone repository...')
		## clone
		cmd = git_exe + ' clone ' + git_repo 
	
	## call git
	HCGB_sys.system_call(cmd)

	## compile if necessary
	if (option == 'make'):
		## Compile
		print ('+ Compile software...')
		## make
		os.chdir(folder_sofware)
		HCGB_sys.system_call('make')
	
	## chdir to previous path
	os.chdir(current_path)
	
	return(True)	
Example #19
def get_MLSTar_package_installed(debug=False):

    install_path = set_config.R_package_path_installed()
    (check_install_system, check_install_path) = set_config.get_check_R_files()
    R_script_exe = set_config.get_exe('Rscript')

    if debug:
        print('\n+ Check package: MLSTar')

    ## ATTENTION: optparse library missing, no installation within conda

    ## first try to check if package available in system
    cmd_check = R_script_exe + ' ' + check_install_system + ' -l MLSTar'
    code = functions.system_call(cmd_check, message=False, returned=False)
    if (code == 'OK'):
        return ('system')
    else:
        ## check if installed in path
        cmd_check_path = R_script_exe + ' ' + check_install_path + ' -l MLSTar -p ' + install_path
        code2 = functions.system_call(cmd_check_path,
                                      message=False,
                                      returned=False)

        if (code2 == 'OK'):
            return (install_path)
        else:
            (install_R, install_github_package
             ) = install_dependencies.get_install_R_files()
            cmd_R = '%s %s -l iferres/MLSTar -p %s' % (
                R_script_exe, install_github_package, install_path)
            code3 = functions.system_call(cmd_R, message=False, returned=False)
            if (code3 == 'OK'):
                return (install_path)
            else:
                print('ERROR')
                exit()
Example #20
def snippy_call(reference_fasta, list_files, threads, outdir, name, contig_option, other_options, Debug):
	"""
	Creates variant calling for a sample vs. a reference.
	
	By default, it uses the ``--rgid`` option with the ``name`` argument provided. The ``list_files`` argument
	contains the files to map, which can be a single-end file, two paired-end fastq files or a contig file. If 
	a fasta contig file is provided, set ``contig_option`` to True. 
	
	All output files within the ``outdir`` folder will contain the tag ``snps``.
	
	:param reference_fasta: Absolute path to reference fasta file.
	:param list_files: List of absolute path to fastq files (.fq / .fq.gz / fastq / fastq.gz)
	:param threads: Number of CPU cores to use.
	:param outdir: Output folder.
	:param name: Name of the sample
	:param contig_option: True/false to map contigs provided instead of files. Contigs provided via list_files.
	:param other_options: String of options to include in snippy call
	:param Debug: True/false for debugging messages
	
	:type reference_fasta: string
	:type list_files: list
	:type threads: int
	:type outdir: string 
	:type name: string 
	:type contig_option: bool
	:type other_options: string
	:type Debug: bool
	"""
	
	## create snippy call
	snippy_exe = set_config.get_exe('snippy', Debug)
	
	## start snippy_cmd 
	log_file = os.path.join(outdir, "snippy_cmd.log")
	snippy_cmd = '%s --cpus %s --reference %s --force --unmapped --outdir %s --rgid %s' %(
		snippy_exe, threads, reference_fasta, outdir, name)
	
	## force option: prevent finish early if folder exists
	## unmapped option: keep unmapped reads
	
	## add files to map
	if contig_option:
		snippy_cmd = snippy_cmd + ' --ctgs ' + list_files[0]
	else:
		if (len(list_files) == 1):
			snippy_cmd = snippy_cmd + ' --se ' + list_files[0]
		elif (len(list_files) == 2):
			snippy_cmd = snippy_cmd + ' --pe1 ' + list_files[0] + ' --pe2 ' + list_files[1]
		else:
			print(colored("** ERROR: No reads or contigs provided...", "red"))
			return(False)		
	
	## add log
	snippy_cmd = snippy_cmd + ' 2> ' + log_file
	
	## debug message
	if (Debug):
		print (colored("**DEBUG: snippy_cmd **", 'yellow'))	
		print (snippy_cmd)
	
	## create system call
	return(HCGB_sys.system_call(snippy_cmd, returned=False, message=True))
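
## The call above interpolates paths straight into a shell string; a minimal alternative
## sketch using shlex.quote() so paths with spaces or shell metacharacters stay safe.
## Function and variable names here are illustrative only, not part of BacterialTyper.
import shlex

def sketch_quoted_snippy_cmd(snippy_exe, threads, reference_fasta, outdir, name, r1, r2, log_file):
	parts = [snippy_exe, '--cpus', str(threads), '--reference', reference_fasta,
		'--force', '--unmapped', '--outdir', outdir, '--rgid', name,
		'--pe1', r1, '--pe2', r2]
	return ' '.join(shlex.quote(part) for part in parts) + ' 2> ' + shlex.quote(log_file)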
Example #21
def GI_module(genbank_file, name, outdir, Debug, cutoff_dinuc_bias=8, min_length=1000):
    """Identify genomic islands (GI) within the genbank file provided. They are calculated
    based on gene annotation and dinucleotide bias region using the software `IslandPath-DIMOB`_.
    
    :param genbank_file: Absolute path to annotation file in Genbank format.
    :param name: Sample identifier. 
    :param outdir: Absolute path to output folder.
    :param cutoff_dinuc_bias: Dinucleotide bias cutoff.
    :param min_length: Minimum length for the regions to be reported.

    :type name: string
    :type genbank_file: string
    :type outdir: string
    :type cutoff_dinuc_bias: int
    :type min_length: int

    The Dimob.pl perl script has two mandatory arguments: the input :file:`genbank_file` and an output name.
    
    .. code-block:: sh

        Usage:
        perl Dimob.pl <genome.gbk> <output_name> [cutoff_dinuc_bias] [min_length]
        
        Default values:
            cutoff_dinuc_bias = 8
            min_length = 8000
        
        Example:
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs 6 10000
            perl Dimob.pl example/NC_000913.embl NC_000913_GIs 6 10000

    
    During the development of BacterialTyper, we generated a modification of the original `IslandPath-DIMOB`_ to analyze 
    contig sequence data and produce a different output format for better clarification and interpretation of results. 
    We forked the original code into a new git repository and updated the code accordingly. See details here: https://github.com/JFsanchezherrero/islandpath.
    
     .. include:: ../../links.inc
    
    """
    
    ## filename stamp of the process
    filename_stamp = outdir + '/.Dimob'

    # check if previously done
    if os.path.isfile(filename_stamp):
        stamp = functions.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s -- Dimob]" %(stamp, name), 'yellow'))
    else:    
        ## debug message
        if (Debug):
            print (colored("**DEBUG: Call Dimob for sample %s " %name + "**", 'yellow'))
            print ("genbank_file", genbank_file)
            print ("outdir: ", outdir)
           
        ## Call IslandPath Dimob executable perl file.
        dimob_pl = set_config.get_exe("dimob", Debug)
        perl_exe = set_config.get_exe("perl", Debug)
        
        ## command
        outdir_sample = os.path.join(outdir, name)
        log_file = outdir_sample + '.log'
        perl_cmd = '%s %s %s %s %s %s > %s' %(perl_exe, dimob_pl, genbank_file, outdir_sample, cutoff_dinuc_bias, min_length, log_file)
    
        code = functions.system_call(perl_cmd)
        ##
        if code == 'OK':
            ## when finished print time stamp in  output + '/.Dimob'
            stamp = functions.print_time_stamp(filename_stamp)
        else:
            return False
    
    return (outdir)
Example #22
def generate_xtract_call(docsum_file, pattern, element, outfile):
    xtract_bin = set_config.get_exe("xtract")
    return (xtract_call(docsum_file, pattern, element, outfile, xtract_bin))
Example #23
def generate_docsum_call(db, query, outfile):
    esearch_bin = set_config.get_exe("esearch")
    efetch_bin = set_config.get_exe("efetch")
    return (docsum_call(db, query, outfile, esearch_bin, efetch_bin))
Example #24
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""
    
    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')
    
    ## system call
    cmd_call = "%s -i %s -m -f >  %s 2> %s " %(agrvate_bin, 
                                               assembly_file,
                                               log_call, err_call) ## use mummer (-m) and force results folder (-f)
    status = HCGB_sys.system_call(cmd_call)
    
    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()
    
    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]    
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')
    
    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")
        
        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'), os.path.join(results_folder, 'error_report.tab'))
        
        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') ## open excel handle
    
        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)
    
        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab =  HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample
        
        ## columns
        #agr_group: gp1/gp2/gp3/gp4. 'u' means unknown. 
        ##           If multiple agr groups were found (col 5 = m), 
        ##           the displayed agr group is the majority/highest confidence. 
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown. 
        ##              Multiple groups are likely found due to multiple S. aureus isolates in the sequence
        # frameshifts: Number found in CDS of extracted agr operon ('u' if agr operon not extracted)
        
        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel
        ## tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary') ## write excel handle

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab =  HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample
            
            ## columns
            ## Assembly Contig ID
            ## ID of matched agr group kmer
            ## evalue
            ## Percentage identity of match
            ## Start position of kmer alignment on input sequence
            ## End position of kmer alignment on input sequence
    
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)
            
            ## save agr_gp_tab file into excel
            ## tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon') ## write excel handle

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)
            
            results['operon_fna'] = agr_operon_fna_file
        except IndexError:
            results['operon_fna'] = ''

        ## error report tab
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report =  HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)
            
        ## save error_report file into excel
        ## tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps') ## write excel handle
        
        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save() ## close excel handle
    
        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)
        
    return (results)
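
## The Excel handling above opens a writer and calls save() explicitly; a minimal
## equivalent sketch using the pandas context manager, which closes the workbook even
## if writing one of the sheets fails (sheet names and contents are illustrative only).
import pandas as pd

def sketch_write_excel_report(file_name_Excel, sheets_dict):
    ## sheets_dict maps sheet names to pandas dataframes, e.g. {'summary': df1, 'operon': df2}
    with pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') as writer:
        for sheet_name, dataframe in sheets_dict.items():
            dataframe.to_excel(writer, sheet_name=sheet_name)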
Example #25
def run(options):
    """
	This is the main function of the module ``config``. It basically checks 
	whether the different requirements (python, perl packages and third-party software) are
	fulfilled. 

	If any requirement is not available, this module tries to install it or reports to the user that it
	must be installed manually.

	:param option: State whether to check or install missing modules, packages and third party software. Provide: check/install
	:param install_path: Absolute path to install modules or packages missing. Default: ``BacterialTyper`` environment path.
	:param IslandPath: True/False for checking additional perl and software required by this option analysis.
	:param debug: True/false for debugging messages.
	
	:type option: string 
	:type IslandPath: boolean
	:type install_path: string 
	:type debug: boolean	

	.. seealso:: This function depends on several ``BacterialTyper`` functions:

		- :func:`BacterialTyper.config.set_config.check_python_packages`

		- :func:`BacterialTyper.config.set_config.check_perl_packages`

		- :func:`BacterialTyper.config.extern_progs.return_min_version_soft`

		- :func:`BacterialTyper.config.extern_progs.print_dependencies`

	"""

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Pipeline Configuration")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    if (options.install_path):
        if os.path.isdir(options.install_path):
            if (Debug):
                print(
                    "Installation path provided for missing modules, packages, dependencies..."
                )
                print("Path: " + options.install_path)
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()
    else:
        ## get python environment path
        env_bin_directory = os.path.dirname(os.environ['_'])

        ##os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'templates'))
        options.install_path = os.path.abspath(
            os.path.join(env_bin_directory, '../software'))

        if (Debug):
            print("Retrieve environment path as installation path:")
            print("Path: " + options.install_path)

        HCGB_files.create_folder(options.install_path)

    #######################
    ## install or only check
    #######################
    option_install = False
    if (options.option == 'install'):
        print("\n+ Check dependencies")
        print(
            "+ Try to install all missing dependencies, modules or third party software..."
        )
        option_install = True

        ## check if access and permission
        if os.path.isdir(options.install_path):
            if (set_config.access_check(options.install_path, mode=os.F_OK)):
                print(
                    "Installation path is accessible and has permission for installation if necessary"
                )
            else:
                print(colored("\n*** ERROR ****", 'red'))
                print(
                    colored(
                        "No access/permission for this path: %s" %
                        options.install_path, 'red'))
                print(
                    colored(
                        "Please provide a valid path with access/permission to install any missing dependencies.",
                        'red'))
                exit()
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()

    elif (options.option == 'only_check'):
        print(
            "\nCheck dependencies, modules or third party software and print report..."
        )

    #######################
    ## python version
    #######################
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python:')
    HCGB_aes.print_sepLine("+", 20, False)

    this_python_version = str(sys.version)
    python_min_version = extern_progs.return_min_version_soft('python')
    if LooseVersion(this_python_version) >= LooseVersion(python_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (python_min_version, this_python_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (python_min_version, this_python_version), 'red'))
        exit()

    #######################
    ## perl_version
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Perl:')
    HCGB_aes.print_sepLine("+", 50, False)

    perl_min_version = extern_progs.return_min_version_soft('perl')
    this_perl_path = set_config.get_exe("perl", Debug)
    this_perl_version = set_config.get_version("perl", this_perl_path, Debug)
    if LooseVersion(this_perl_version) >= LooseVersion(perl_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (perl_min_version, this_perl_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (perl_min_version, this_perl_version), 'red'))
        exit()

    #######################
    ## third-party software
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('External dependencies:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_dependencies(option_install, options.install_path, Debug)
    print('\n')

    #######################
    ## python packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_python_packages(Debug, option_install,
                                     options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## perl packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Perl packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_perl_packages("perl_dependencies", Debug, option_install,
                                   options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## IslandPath dependencies
    #######################
    if (options.IslandPath):
        print('\n')
        HCGB_aes.print_sepLine("+", 20, False)
        print('IslandPath packages and software required:')
        HCGB_aes.print_sepLine("+", 20, False)

        set_config.check_IslandPath(Debug, option_install,
                                    options.install_path)
        HCGB_aes.print_sepLine("+", 20, False)
        print('\n')

    #######################
    ## R packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('R packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_R_packages(option_install, options.install_path, Debug)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')
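
## LooseVersion comes from distutils, which is deprecated in recent Python releases; a
## minimal sketch of the same check with packaging.version (assuming the 'packaging'
## module is available, as it is in most environments that ship pandas/setuptools).
from packaging import version

def sketch_version_satisfied(installed, minimum):
    ## expects plain version strings such as "5.26.2" or "3.9.1"
    return version.parse(installed) >= version.parse(minimum)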
Example #26
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()

    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path,
                      'yellow'))

    ##########
    ## NCBI	##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)
                ## get file information
                print("\t+ Obtaining information from file: %s" %
                      abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:

        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        #functions.print_sepLine("*",50, False)

        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/

    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
Example #27
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect,
               retrieve_databases):
    """Generate MLST profile identification
	
	This function uses the `MLSTar software`_ to retrieve Multi Locus Sequence Typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates MLST profiling for each sample. 
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`.
	:param retrieve_databases: pandas dataframe with the databases retrieved (including 'db' and 'path' columns).
	
	:type options: options object (argparse.Namespace)
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type dataFrame_edirect: pandas.DataFrame()
	:type retrieve_databases: pandas.DataFrame()
	
	:return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information.
	:rtype: Dictionary

	
	See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:
	
	.. include:: ../../devel/results/doMLST_result_example.csv
		:literal:
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
		
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`
		
		- :func:`HCGB.sampleParser.files.get_files`
		
		- :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species`
		
	.. include:: ../../links.inc	
	"""
    ## set config
    rscript = set_config.get_exe("Rscript")

    ## TODO: Samples might not be assembled...to take into account and return 0

    ## TODO: Fix and install MLSTar during installation
    print(MLSTar.get_MLSTar_package_installed())
    exit()

    ########################################################################################

    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## MLST call
    HCGB_aes.boxymcboxface("MLST typing")
    print(
        "+ Create classical MLST typification of each sample according to species retrieved by kmer..."
    )

    ## get assembly files
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    # init
    MLST_results = {}

    ## get MLST_profile: default or provided
    mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] ==
                                               'PubMLST']['path'].tolist()

    if (Debug):
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)

        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## Generate MLST call according to species identified for each sample
    for index, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'],
                                                     row['species'])

        if (MLSTar_taxa_name == 'NaN'):
            print(
                colored(
                    "\t- Not available PubMLST profile for sample [%s] identified as %s %s"
                    % (row['sample'], row['genus'], row['species']), 'yellow'))

        else:
            for mlst_profile in mlst_profile_list:

                ## species folder
                #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder)
                species_mlst = mlst_profile.split(',')[0]
                species_mlst_folder = mlst_profile.split(',')[1]

                ## output file
                output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
                filename_stamp = species_mlst_folder + '/.success_scheme'

                ##
                if MLSTar_taxa_name == species_mlst:
                    if os.path.isfile(filename_stamp):
                        stamp = HCGB_time.read_time_stamp(filename_stamp)
                        print(
                            colored(
                                "\tA previous command generated results on: %s"
                                % stamp, 'yellow'))
                    else:
                        ### get scheme available
                        MLSTar.getPUBMLST(MLSTar_taxa_name, rscript,
                                          output_file)
                        stamp = HCGB_time.print_time_stamp(filename_stamp)

                    ## parse and get scheme for classical MLST
                    schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

                    ##
                    for item, cluster in schemes_MLST.iterrows():
                        if cluster['len'] < 10:
                            scheme2use = int(cluster['scheme'])
                            continue
                    ###
                    sample = row['sample']
                    MLSTar_folder = HCGB_files.create_subfolder(
                        'MLST', outdir_dict[sample])
                    genome_file = assembly_samples_retrieved.loc[
                        assembly_samples_retrieved['name'] ==
                        sample]['sample'].values[0]

                    ## call MLST
                    (results, profile_folder) = MLSTar.run_MLSTar(
                        species_mlst_folder, rscript, MLSTar_taxa_name,
                        scheme2use, sample, MLSTar_folder, genome_file,
                        options.threads)
                    MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
Example #28
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
	Executes KMA identification jobs
	
	This function automates the process of checking if any previous run succeeded or
	runs the appropriate identification process for the sample and database provided.
	
	:param outdir_file: Output folder for the sample.
	:param list_files: List of fastq files to use for the identification.
	:param name: Sample name.
	:param database: KMA database to use.
	:param threads: Number of CPUs to use.
	:param Debug: True/False for debugging messages.
	
	:type outdir_file: string
	:type list_files: list
	:type name: string
	:type database: string
	:type threads: integer
	:type Debug: bool
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`
	
		- :func:`BacterialTyper.module.ident.get_outfile`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
		
		
	"""

    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile
    outfile = get_outfile(outdir_dict_kma, name, database)

    ## check if previously run and succeeded
    basename_tag = os.path.basename(outfile)
    filename_stamp = outdir_dict_kma + '/.success_' + basename_tag

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:
        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: species_identification_KMA.kma_ident_module call**",
                    'yellow'))
            print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
            print("outfile: ", outfile)
            print(
                "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
            )
            print("species_identification_KMA.kma_ident_module" + "\t" +
                  outfile + "\t" + str(list_files) + "\t" + name + "\t" +
                  database + "\t" + str(threads) + "\n")

        ## Sparse or not
        #if any(name in basename_tag for name in ['userData_KMA', 'genbank_KMA']):
        #if (basename_tag == 'userData_KMA'):
        #    option = ''
        #else:
        #    option = '-Sparse '

        ## Add option to retrieve database from memory
        option = '-shm 1'

        # Call KMA
        species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                                  database, kma_bin, option,
                                                  threads)
        stamp = HCGB_time.print_time_stamp(filename_stamp)
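
A hedged sketch of how send_kma_job might be invoked on its own for a single sample; the paths, sample name and database prefix below are hypothetical placeholders, and a KMA-indexed database plus the imports used in the example above are assumed to be available.

import os

## hypothetical inputs (placeholders, not taken from the original code)
sample_name = "sample_01"
outdir_file = os.path.abspath("analysis/sample_01")             ## per-sample output folder
list_files = [os.path.abspath("reads/sample_01_R1.fastq.gz"),
              os.path.abspath("reads/sample_01_R2.fastq.gz")]
database = os.path.abspath("db/bacteria/bacteria_KMA")          ## prefix of a KMA-indexed database

## runs KMA for this sample unless a previous .success_ stamp is found
send_kma_job(outdir_file, list_files, sample_name, database, 2, True)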
Example #29
0
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
              time_partial):
    """Kmer identification using software KMA_.
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param pd_samples_retrieved: pandas dataframe for samples to process.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param retrieve_databases: 
	:param time_partial: timestamp of start time of the process.
	
	:type options: 
	:type pd_samples_retrieved: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type retrieve_databases: pandas.DataFrame()
	:type time_partial: 
	
	:return: Information of the identification. See example below.
	:rtype: pandas.DataFrame()
	
	See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:
	
	.. include:: ../../devel/results/KMA_ident_example.csv
		:literal:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.modules.ident.send_kma_job`
		
		- :func:`BacterialTyper.modules.ident.get_outfile`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`
	
		
	.. include:: ../../links.inc	
	
	"""

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")
    Debug = options.debug  ## debug flag is assumed to live on the options object, as in other modules

    ## check status
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])

            index_status = species_identification_KMA.check_db_indexed(
                db2use['path'], fold_name)
            if (index_status == True):
                print(
                    colored(
                        "\t+ Database %s seems to be fine...\n\n" %
                        db2use['db'], 'green'))
                databases2use.append(db2use['path'])
            else:
                #databases2use.remove(db2use)
                print(
                    colored(
                        "\t**Database %s is not correctly indexed. Not using it...\n"
                        % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
                'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)
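    ## Illustrative (assumed) arithmetic: with options.threads = 8 and 4 samples,
    ## optimize_threads() would be expected to give each job ~2 threads, so
    ## max_workers_int = int(8 / 2) = 4 identification jobs can run in parallel.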

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(
                kma_bin, db2use)
            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(
                kma_bin, db2use)

            if (return_code_rm == 'FAIL'):
                print(
                    colored(
                        "***ERROR: Removing database %s from memory failed. Please remove it manually!"
                        % db2use, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")
    results_summary = pd.DataFrame()
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided

        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder(
                "kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Default: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            if (results.index.size > 1):
                if (basename_db == "plasmids.T" or basename_db == "viral.TG"):
                    ## let it be several entries
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)
                else:
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(
                        colored("Sample %s contains multiple strains." % name,
                                'yellow'))
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(colored(results, 'yellow'))
                    print('\n\n')

                    ## add both strains if detected
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)

                    ## TODO: add multi-isolate flag

            elif (results.index.size == 1):  ## 1 clear reference
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

            else:
                print(
                    colored(
                        '\tNo clear strain from database %s has been assigned to sample %s'
                        % (basename_db, name), 'yellow'))
                ## add empty line if no available
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)
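
A brief, hypothetical example of consuming the dataframe returned by KMA_ident; it assumes the argument objects (options, sample table, output folders, database table) have been built as in the ident module, and it relies only on the 'Sample' and 'Database' columns set in the loop above.

## hypothetical downstream use of the returned dataframe (arguments assumed to exist)
results_summary = KMA_ident(options, pd_samples_retrieved, outdir_dict,
                            retrieve_databases, time_partial)

## drop plasmid/viral assignments and count how many hits remain per sample
bacteria_hits = results_summary[~results_summary['Database'].isin(['plasmids.T', 'viral.TG'])]
print(bacteria_hits.groupby('Sample').size())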