Example #1
def ariba_run_caller(db2use, db_name, list_files, folder_out, threads, cutoff):
    ## check if already is done
    # generate a stamp when finish parsing each file

    ## make stamp time
    filename_stamp = os.path.join(folder_out, '.success_' + db_name)
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        files_names = [os.path.basename(s) for s in list_files]
        print(
            colored(
                "\tA previous command generated results on: %s [Files: %s]" %
                (stamp, files_names), 'yellow'))

    else:
        if os.path.exists(folder_out):
            shutil.rmtree(
                folder_out)  ## delete folder if exists but failed before

        ## call
        code = ariba_caller.ariba_run(db2use, list_files, folder_out, threads,
                                      cutoff)
        if code == 'FAIL':
            print("*** ERROR: System call failed for ", folder_out)
        else:
            ## print success timestamp only when the call succeeded
            HCGB_time.print_time_stamp(filename_stamp)
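A minimal usage sketch for the function above (all paths, the database name and the cutoff value are hypothetical):

ariba_run_caller('/data/ariba/card_prepareref', 'card',
                 ['/data/reads/sample1_R1.fastq', '/data/reads/sample1_R2.fastq'],
                 '/data/results/sample1_card', 2, 0.90)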
Example #2
def init_db_object(debug):
    """Instantiate the ete taxonomy object     
    Created by Joe R. J. Healey; Nick Youngblut
    Original code.
    """
    # Instantiate the ete NCBI taxa object
    print("+ ------------------------------------- +")
    print("+ Looking for NCBI taxonomy database:")
    ncbi = NCBITaxa()

    ## dbfile location
    if debug:
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message(
            'NCBI Taxonomy database is stored at {}\n'.format(ncbi.dbfile),
            "yellow")

    ## database is stored by default at: ~/.etetoolkit/taxa.sqlite
    db_folder = os.path.dirname(ncbi.dbfile)

    ## check timestamp, update if necessary
    filename_stamp_parse = db_folder + '/timestamp_db.txt'
    if os.path.isfile(filename_stamp_parse):
        stamp = time_functions.read_time_stamp(filename_stamp_parse)
        days_passed = time_functions.get_diff_time(filename_stamp_parse)

        ## debug messages
        if debug:
            debug_message('Database previously initiated', "yellow")
            debug_message('on date: {}'.format(stamp), "yellow")
            debug_message('Days passed: {}'.format(days_passed), "yellow")

        if (days_passed > 30):
            ## update_db
            update_db(ncbi, db_folder, debug)
        else:
            ## debug messages
            if debug:
                debug_message('No need to update db', "yellow")
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )

            print(
                colored(
                    "\tA previous command generated results on: %s [%s]" %
                    (stamp, 'init database'), 'yellow'))
    else:
        ## create first timestamp
        time_functions.print_time_stamp(filename_stamp_parse)

    return ncbi
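A minimal usage sketch; the returned ete3 NCBITaxa object can then be queried as usual (562 is the E. coli taxid, used here only as an example):

ncbi = init_db_object(debug=False)
lineage = ncbi.get_lineage(562)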
Example #3
def create_blast_results(sample, fasta_file, outdir, debug):
    '''Creates BLAST results for each fasta vs. itself'''
    
    #phr is the header file, pin is the index file, psq is the sequence file
    
    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
    
    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps 
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')
        
    if (not HCGB.functions.files_functions.is_non_zero_file(search_timestamp)):

        ## get binaries from configuration
        (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
        ## NOTE: the hard-coded paths below overrode the configuration lookup above;
        ## they look like leftover debugging and are kept commented out here.
        #makeblastdb_exe = "/usr/bin/makeblastdb"
        #blastp_exe = "/usr/bin/blastp"
        
        ## check if db is indexed already
        db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
        if (not HCGB.functions.files_functions.is_non_zero_file(db_timestamp)):
            ## generate blastdb for genome
            HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot') # HCGB function    
        
            ## print time stamp
            HCGB_time.print_time_stamp(db_timestamp)
        
        else:
            read_time = HCGB_time.read_time_stamp(db_timestamp)
            print (colored("\t+ BLAST database already available for sample %s [%s]" %(sample, read_time), 'green'))
            
        ## create blastp outfile
        HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1) # HCGB function

        ## print time stamp
        HCGB_time.print_time_stamp(search_timestamp)
    else:
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print (colored("\t+ Duplicate search already available for sample %s [%s]" %(sample, read_time), 'green'))
            
    return (raw_blast)
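A minimal usage sketch (paths are hypothetical; the fasta file is expected to contain protein sequences, since the database is built with 'prot'):

raw_blast = create_blast_results('sample1', '/data/proteins/sample1.faa',
                                 '/data/results/sample1/search', debug=False)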
Example #4
def run_module_SPADES_old(name, folder, file1, file2, threads):

	print ("+ Calling spades assembly for sample...", name)	

	## folder create
	HCGB_files.create_folder(folder)
	
	## get configuration
	SPADES_bin = set_config.get_exe('spades')
	
	## assembly main 
	path_to_contigs = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads)

	## assembly plasmids
	path_to_plasmids = run_SPADES_plasmid_assembly(folder, file1, file2, name, SPADES_bin, threads)
	
	## discard plasmids from main
	(tmp_contigs, tmp_plasmids) = discardPlasmids(path_to_contigs, path_to_plasmids, folder, name)
	
	## rename fasta sequences
	new_contigs = tmp_contigs.split(".fna.tmp")[0] + '.fna'	
	rename_contigs(tmp_contigs, "scaffolds_chr", new_contigs)
	
	new_plasmids=""
	if os.path.isfile(tmp_plasmids):
		new_plasmids = tmp_plasmids.split(".fna.tmp")[0] + '.fna'	
		rename_contigs(tmp_plasmids, "scaffolds_plasmids", new_plasmids)
	
	## contig stats
	stats(new_contigs, new_plasmids)
	
	## success stamps
	filename_stamp = folder + '/.success'
	stamp =	HCGB_time.print_time_stamp(filename_stamp)
Example #5
def download_VFDB_files(folder):
    ##
    ## Given a folder, check if it contains VFDB information
    ## or download it from website: http://www.mgc.ac.cn
    ##
    links = (
        "http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz",
        "http://www.mgc.ac.cn/VFs/Down/Comparative_tables_from_VFDB.tar.gz")

    ## check if data is downloaded, how old is the data and if it is necessary to download again
    ## consider >30 days long enough to be updated again

    ## time stamp
    filename_stamp = folder + '/download_timestamp.txt'
    if os.path.exists(folder):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print("+ A previous download generated results on: ", stamp)
            days_passed = HCGB_time.get_diff_time(filename_stamp)
            print("\t\t** %s days ago" % days_passed)
            if (days_passed > 30):  ## download again
                print(
                    "\t\t** Downloading information again just to be sure...")
            else:
                print("\t\t** No need to download data again.")
                return ()
    else:
        HCGB_files.create_folder(folder)

    ## Open file and readlines
    print('+ Downloading files:\n')
    for line in links:
        if not line.startswith('#'):
            HCGB_sys.wget_download(line, folder)

    ## decompress files
    print('+ Decompressing gzip files\n')
    files = os.listdir(folder)
    for item in files:
        #print (folder)
        if item.endswith('.gz'):
            HCGB_files.extract(folder + '/' + item, folder)

    ## make stamp time
    HCGB_time.print_time_stamp(filename_stamp)

    return ()
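A minimal usage sketch (the folder path is hypothetical); the function re-downloads the VFDB files only when the previous download is older than 30 days:

download_VFDB_files('/data/databases/VFDB')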
Example #6
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """
	Function that checks and generates annotation.
	
	- It uses Prokka_ via :func:`BacterialTyper.scripts.annotation.prokka_call`.
	
	- It checks if results were previously generated.
	
	- Once finished, it prints a timestamp.
	
	:param sequence_fasta: Assembled sequences in fasta file format. 
	:param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
	:param genus: Available genus options for Prokka software. See details above.
	:param path: Absolute path to the output folder to include results.
	:param name: Sample name and tag to include in the annotation report and files.
	:param threads: Number of CPUs to use.
	  
	:type sequence_fasta: string
	:type kingdom: string
	:type genus: string 
	:type path: string 
	:type name: string 
	:type threads: integer 
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.set_config.get_exe`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`HCGB.functions.time_functions.print_time_stamp`
				
		- :func:`BacterialTyper.scripts.annotation.prokka_call`

	.. include:: ../../links.inc	 	
	"""

    ## check if previously assembled and succeeded
    filename_stamp = path + '/.success'

    if os.path.isdir(path):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(
                colored(
                    "\tA previous command generated results on: %s [%s]" %
                    (stamp, name), 'yellow'))
            return ()

    ## call prokka
    prokka_bin = set_config.get_exe('prokka')
    dirname = prokka_call(prokka_bin, sequence_fasta, kingdom, genus, path,
                          name, threads)

    ## success stamps
    filename_stamp = path + '/.success'
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (dirname)
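A minimal usage sketch (paths, genus and sample tag are hypothetical; kingdom and genus follow the Prokka options described in the docstring):

annot_dir = module_call('/data/assembly/sample1_assembly.fna', 'Bacteria',
                        'Staphylococcus', '/data/annotation/sample1', 'sample1', 4)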
Example #7
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
	"""Generate SPADES system call.
	
	It calls system for SPADES and generates time stamp file in the folder provided (sample_folder + '/.success_assembly') for later analysis.
	
	Steps:
	
	- It generates system call for SPADES assembly. 
	
	- It generates timestamp file.
	
	:param sample_folder: Absolute path to store results. It must exist.
	:param file1: Absolute path to fastq reads (R1).
	:param file2: Absolute path to fastq reads (R2).
	:param name: Sample name or tag to identify sample.
	:param SPADES_bin: Binary executable for SPADES assembly software.
	:param options: Plasmid assembly is possible if specified via options (--plasmid).
	:param threads: Number of CPUs to use.
	
	:type name: string
	:type sample_folder: string
	:type file1: string
	:type file2: string
	:type SPADES_bin: string
	:type options: string
	:type threads: integer
	
	:return: Returns **OK** if assembly process succeeded and fasta file is generated.
	:rtype: string.
	:warnings: Returns **FAIL** if assembly process stopped.
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.main_functions.system_call`
	
		- :func:`HCGB.functions.time_functions.print_time_stamp`
	"""
	
	## check if previously assembled and succeeded
	filename_stamp = sample_folder + '/.success_assembly'
	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
		return('OK')

	## call system for SPADES sample given
	logFile = sample_folder + '/' + name + '.log'
	
	## command	
	cmd_SPADES = '%s %s-t %s -o %s -1 %s -2 %s > %s 2> %s' %(SPADES_bin, options, threads, sample_folder, file1, file2, logFile, logFile)
	code = HCGB_sys.system_call(cmd_SPADES)
	
	if (code == 'OK'):
		## success stamps
		filename_stamp = sample_folder + '/.success_assembly'
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
		return('OK')

	return "FAIL"
Example #8
def update_db(ncbi_db, db_folder, debug):
    """Update database
    Created by Joe R. J. Healey; Nick Youngblut
    Original code.
    """

    ## debug messages
    if debug:
        debug_message('Update database at {}\n'.format(ncbi_db.dbfile),
                      "yellow")

    print('Updating the taxonomy database. This may take several minutes...\n')
    ncbi_db.update_taxonomy_database()

    ## print timestamp
    filename_stamp_parse = os.path.abspath(db_folder + '/timestamp_db.txt')
    time_functions.print_time_stamp(filename_stamp_parse)

    return ncbi_db
Example #9
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):

	my_out_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)
	## timestamp
	filename_stamp =  my_out_folder + '/.success'

	print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))
		
	## check previous run
	if os.path.isfile(filename_stamp):
		timestamp = HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tSuccessfully run on date: %s"  %timestamp, 'green'))
	else:
	
		busco_bin = set_config.get_exe('busco')
		os.chdir(output_name)
		
		## init cmd configuration
		cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' %(busco_bin, fasta, threads, mode, busco_db)
		
		## options if autolineage or given dataset
		if "auto-lineage" == dataset_name:
			logFile = 'auto_lineage.log'
			cmd = cmd + '--auto-lineage -o %s > %s' %(dataset_name, logFile)
		else:
			logFile = dataset_name + '.log'
			cmd = cmd + '-l %s -o %s > %s' %(dataset_name, dataset_name, logFile)
		
		## system call
		HCGB_sys.system_call(cmd)
		
		if os.path.isfile(my_out_folder + '/short_summary.txt'):
			## timestamp
			HCGB_time.print_time_stamp(filename_stamp)
		else:
			print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
			return ('FAIL')

	return()
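A minimal usage sketch (paths and the lineage dataset are hypothetical; 'genome' is the BUSCO mode for assemblies):

BUSCO_run('sample1', '/data/assembly/sample1.fna', 4, '/data/busco_results',
          'bacteria_odb10', 'genome', '/data/busco_downloads')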
Example #10
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters"""
    
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()
    
    print ("+ Checking agr genes for each sample retrieved...")
    
    agrvate_results = pd.DataFrame()
    
    ## No need to optimize. There is a problem with the working dir of agrvate and we 
    ## need to change every time.
    for name, assembly_file in dict_assemblies.items():
        sample_folder = HCGB_files.create_folder(dict_folders[name])
        ## check if previously done and succeeded
        filename_stamp = sample_folder + '/.success'
        if os.path.isfile(filename_stamp):
            stamp =  HCGB_time.read_time_stamp(filename_stamp)
            print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        else:
            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')
            
            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    
    print ("+ Jobs finished%s\n+ Collecting information for all samples...")
    os.chdir(path_here)
    
    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)
    
    return(agrvate_results)
Example #11
def mapReads_caller(files, folder, name, threads, STAR_exe, genomeDir,
                    limitRAM_option, Debug):
    ## check if previously joined and succeeded
    filename_stamp = folder + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = time_functions.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s -- %s]" %
                (stamp, name, 'STAR'), 'yellow'))
    else:
        ##
        if Debug:
            print("\n** DEBUG: mapReads_caller options **\n")
            print("folder: " + folder)
            print("name: " + name)
            print("threads: " + str(threads))
            print("STAR_exe: " + STAR_exe)
            print("genomeDir: " + genomeDir)
            print("limitRAM_option: " + str(limitRAM_option))
            print("files: ")
            print(files)

        # Call STAR
        code_returned = mapReads.mapReads("LoadAndKeep", files, folder, name,
                                          STAR_exe, genomeDir, limitRAM_option,
                                          threads, Debug)

        if (code_returned):
            time_functions.print_time_stamp(filename_stamp)
        else:
            print("+ Mapping sample %s failed..." % name)

    ## return results
    bam_file = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')
    mapping_results[name] = bam_file  ## NOTE: mapping_results is assumed to be a module-level dict shared with the caller

    return ()
Example #12
def kma_ident_call(out_file, files, sample_name, index_name, kma_bin, option, threads):
	"""Create kma system call for kmer identification. 
	
	Paired-end end or single end fastq files accepted. It generates a time stamp if succeeds.

	:param out_file: Absolute path and basename for the output files generated with results.
	:param files: List of absolute paths for fastq files to search against the database.
	:param sample_name: Sample name or tag to identify the sample.
	:param index_name: Database name
	:param kma_bin: Binary executable for KMA software.
	:param option: Additional options to pass to the system call.
	:param threads: Number of CPUs to use. 

	:type out_file: string
	:type files: list
	:type sample_name: string
	:type index_name: string
	:type kma_bin: string
	:type option: string
	:type threads: integer	

	:returns: Finish status of the system call ('OK' or 'FAIL').
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.system_call`
	
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`

	"""

	###
	out_file_log = out_file + '.log'
	if len(files) == 2:
		cmd_kma_search = "%s -ipe %s %s -o %s -t_db %s -t %s %s 2> %s" %(kma_bin, files[0], files[1], out_file, index_name, threads, option, out_file_log)
	else:
		## TODO: test Single End
		cmd_kma_search = "%s -i %s -o %s -t_db %s -t %s %s 2> %s" %(kma_bin, files[0], out_file, index_name, threads, option, out_file_log)

	code = HCGB_sys.system_call(cmd_kma_search)

	if (code == 'OK'):
		## success stamps
		basename_tag = os.path.basename(out_file)
		folder = os.path.dirname(out_file)
		filename_stamp = folder + '/.success_' + basename_tag
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
		return('OK')
	else:
		return('FAIL')
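A minimal usage sketch for a paired-end sample (paths and the database index are hypothetical; '-shm 1' is the shared-memory option used elsewhere in this module):

code = kma_ident_call('/data/ident/sample1/kma/sample1_bacteria',
                      ['/data/reads/sample1_R1.fastq', '/data/reads/sample1_R2.fastq'],
                      'sample1', '/data/db/KMA/bacteria.ATG', 'kma', '-shm 1', 2)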
Example #13
def prepare_card_data(database_folder):
	
	## create CARD folder
	abs_folder = os.path.abspath(database_folder)
	CARD_folder = HCGB_files.create_subfolder('CARD', abs_folder)
	
	## make stamp time
	filename_stamp = CARD_folder + '/.success'

	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [CARD Ontology Data]" %stamp, 'yellow'))

		## check time passed
		days_passed = HCGB_time.get_diff_time(filename_stamp)
		print ("\t** %s days ago" %days_passed)		
		if (days_passed > 30): ## download again
			print ("\t ** Downloading information again just to be sure...")
			download=True
		else:
			print ("\t ** No need to download data again.")
			download=False
	else:
		download=True

	###
	if download:
		## update database in the path provided
		aro_obo_file = card_trick.ontology_functions.update_ontology(CARD_folder, False)
	
		## get ontology and save it in csv
		return_frame = card_trick.ontology_functions.parse_ontology(aro_obo_file, False)
	
		### if success return folder name
		if not return_frame.empty:
			## success stamps
			filename_stamp = CARD_folder + '/.success'
			stamp =	HCGB_time.print_time_stamp(filename_stamp)	
		else:
			return ('FAIL')

	## return folder name
	return(CARD_folder)
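A minimal usage sketch (the database folder is hypothetical); a CARD subfolder is created inside it and refreshed when the previous download is older than 30 days:

CARD_folder = prepare_card_data('/data/databases')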
Example #14
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)
       
    ## check if previously process and succeeded
    filename_stamp = subdir + '/.success'
    
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, tag), 'yellow'))
        code = 'OK'
    else:
        # Call variant calling
        code = variant_calling.snippy_call(reference, files, threads, subdir, 
                                           sample_name, contig_option, other_options, Debug)
        if code == 'OK':
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    return(code)
Example #15
def parse_featureCount(out_file, path, name, bam_file, Debug):
	"""
	Parses featureCount results for RNAbiotype analysis.
	
	:param out_file: Name provided to featureCount for output results.
	:param path: Absolute path to the folder where parsed results are stored.
	:param name: Sample name or tag to identify the sample.
	:param bam_file: Mapping file (BAM) used to generate the counts.
	:param Debug: True/False for debugging messages.
	"""

	## file names
	out_tsv_file_name = out_file + '.tsv'
	RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

	##
	filename_stamp_parse = path + '/.success_parse'
	if os.path.isfile(filename_stamp_parse):
		stamp = time_functions.read_time_stamp(filename_stamp_parse)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'parse results'), 'yellow'))
	else:
	
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("Parse results for sample: " + name)
			
		## parse results
		out_tsv_file = open(out_tsv_file_name, 'w')
		RNA_biotypes_file = open(RNA_biotypes_file_name, 'w')
		tRNA_count = 0
		
		##########################################
		### read count file
		##########################################
		count_file = open(out_file)
		count_file_text = count_file.read()
		count_file_lines = count_file_text.splitlines()	
	
		for line in count_file_lines:
			if line.startswith('#'):
				continue
			elif line.startswith('Geneid'):
				continue
			else:
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
				tRNA_search = re.search(r".*tRNA", ID)
				if tRNA_search:
					tRNA_count = int(tRNA_count) + int(count)				
				elif (count > 0):
					RNA_biotypes_file.write(string2write_raw)
		
		## count and summary tRNA
		string2write = "tRNA\t%s\n" %tRNA_count
		RNA_biotypes_file.write(string2write)
		RNA_biotypes_file.close()
				
		##########################################
		### read summary count file
		##########################################
		summary_count_file = open(out_file + '.summary')
		summary_count_file_text = summary_count_file.read()
		summary_count_file_lines = summary_count_file_text.splitlines()	
	
		for line in summary_count_file_lines:
			if line.startswith('Status'):
				continue
			elif line.startswith('Assigned'):
				continue
			else:
				## adds Unassigned_Ambiguity
				## adds Unassigned_NoFeatures
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
	
				## skip empty entries
				if count == 0:
					continue
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
		##########################################
		## get mapping statistics according to mapping software
		##########################################
		count_multi = 0
		count_unmap = 0
		mapping_folder = os.path.dirname(bam_file)
		mapping_stats = mapping_folder + '/Log.final.out'
		
		## -------------------------------- ##
		### STAR mapping		
		## -------------------------------- ##
		if files_functions.is_non_zero_file(mapping_stats):
			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("STAR mapping available for sample: " + name)
				print ("mapping_folder: " + mapping_folder)
	
			mapping_stats_file = open(mapping_stats)
			mapping_stats_file_text = mapping_stats_file.read()
			mapping_stats_file_lines = mapping_stats_file_text.splitlines()	
	
			for line in mapping_stats_file_lines:
				multi_search = re.search(r".*Number of reads mapped to", line)
				unmap_search = re.search(r".*unmapped.*", line)
				input_search = re.search(r".*input reads.*", line)
			
				if input_search:
					total_input_reads = int(line.split('\t')[-1])
	
				if multi_search:
					count_tmp = int(line.split('\t')[-1])
					count_multi = count_multi + count_tmp
	
				elif unmap_search:
					perc_tmp = line.split('\t')[-1]
					count_reads = math_functions.percentage(perc_tmp, total_input_reads)
					count_unmap = count_unmap + count_reads
		else:
	
			## -------------------------------- ##
			## tophat
			## -------------------------------- ##
	
			mapping_stats = mapping_folder + '/align_summary.txt' 
			count_map = 0
			total_input_reads = 0
			
			if files_functions.is_non_zero_file(mapping_stats):
				## debugging messages
				if Debug:
					print ("** DEBUG:")
					print ("tophat mapping available for sample: " + name)
					print ("mapping_folder: " + mapping_folder)
				
				mapping_stats_file = open(mapping_stats)
				mapping_stats_file_text = mapping_stats_file.read()
				mapping_stats_file_lines = mapping_stats_file_text.splitlines()	
	
				for line in mapping_stats_file_lines:
					map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
					input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
					if input_search2:
						total_input_reads = input_search2.group(1)
					if map_search2:
						count_map = map_search2.group(1)
		
				####
				count_unmap = int(total_input_reads) - int(count_map)
	
			else:
				## neither STAR nor tophat statistics found
				print ("Neither tophat nor STAR mapping statistics available")
	
		### print mapping stats
		string2write_unmap = "unmapped\t%s\n" %count_unmap
		out_tsv_file.write(string2write_unmap)
		
		## close files
		out_tsv_file.close()

		## print timestamp
		time_functions.print_time_stamp(filename_stamp_parse)

	return(out_tsv_file_name, RNA_biotypes_file_name)
Example #16
def NCBIdownload(acc_ID, data, data_folder):

    ## module ngd requires to download data in bacteria subfolder under genbank folder
    dir_path = os.path.join(data_folder, 'genbank', 'bacteria', acc_ID)
    ngd_download(dir_path, acc_ID, data_folder)

    ## get files download
    (genome, prot, gff, gbk) = get_files_download(dir_path)

    ## check if any plasmids downloaded
    plasmid_count = 0
    plasmid_id = []
    contig_out_file = dir_path + '/' + acc_ID + '_chromosome.fna'
    plasmid_out_file = dir_path + '/' + acc_ID + '_plasmid.fna'

    ## open
    contig_out_file_handle = open(contig_out_file, 'w')
    for seq_record in SeqIO.parse(genome, "fasta"):
        plasmid_search = re.search(r".*plasmid.*", seq_record.description)
        if plasmid_search:
            ## count and get names for plasmids
            plasmid_count += 1
            name = str(seq_record.id)
            plasmid_id.append(name)

            ### Separate plasmids from main sequence
            plasmid_out_file_handle = open(plasmid_out_file, 'a')
            plasmid_out_file_handle.write(seq_record.format("fasta"))
            plasmid_out_file_handle.write('\n')
            plasmid_out_file_handle.close()
        else:
            contig_out_file_handle.write(seq_record.format("fasta"))
            contig_out_file_handle.write('\n')

    ## close main sequence handle once all records have been written
    contig_out_file_handle.close()

    ## no plasmids found
    if plasmid_count == 0:
        plasmid_out_file = ""

    data2download = pd.DataFrame(columns=('ID', 'folder', 'genus', 'species',
                                          'name', 'genome', 'chr', 'GFF',
                                          'GBK', 'proteins', 'plasmids_number',
                                          'plasmids_ID', 'plasmids'))
    ## value order matches the columns above (GFF, GBK, proteins)
    data2download.loc[len(data2download)] = (acc_ID, dir_path,
                                             data.loc[acc_ID]['genus'],
                                             data.loc[acc_ID]['species'],
                                             data.loc[acc_ID]['name'], genome,
                                             contig_out_file, gff, gbk, prot,
                                             plasmid_count,
                                             "::".join(plasmid_id),
                                             plasmid_out_file)

    ## dump to file
    info_file = dir_path + '/info.txt'
    data2download.to_csv(info_file)

    ## timestamp
    filename_stamp = dir_path + '/.success'
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## return data
    return (info_file)
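A minimal usage sketch; the accession and paths are examples only, and data must be a DataFrame indexed by accession ID with genus, species and name columns, as used above:

import pandas as pd
data = pd.DataFrame({'genus': ['Escherichia'], 'species': ['coli'], 'name': ['E. coli K-12']},
                    index=['GCF_000005845.2'])
info_file = NCBIdownload('GCF_000005845.2', data, '/data/NCBI')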
Example #17
def check_sample_assembly(name, sample_folder, files, threads):
    """Checks if sample is assembled.
	
	It checks whether a sample is assembled or not by reading file *sample_folder/.success_all*. 
	
	If the file is not available (no previous assembly or it did not succeed), it calls :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly` to generate the assembly for the sample specified.
	
	:param name: Sample name or tag to identify sample.
	:param sample_folder: Directory to generate assembly output. It must exist.
	:param files: List containing files (fastq R1 & R2) for the sample to be assembled.
	:param threads: Number of CPUs to use
	:type name: string
	:type sample_folder: string 
	:type files: list
	:type threads: integer
	
	:return: Populates dictionary assembly_stats with assembly stats dictionary information
	:rtype: Dataframe
	
	.. seealso:: This function depends on other BacterialTyper and HCGB functions called:
	
		- :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
	
	"""
    ## check if previously assembled and succeeded
    ## NOTE: assembly_stats and Debug are assumed to be module-level names shared with the caller
    filename_stamp = sample_folder + '/.success_all'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))

        ## Get information
        stat_output = {
            'Contig Stats':
            HCGB_main.file2dictionary(
                sample_folder + '/' + name + '_assembly-contigs.csv', ','),
            'Scaffold Stats':
            HCGB_main.file2dictionary(
                sample_folder + '/' + name + '_assembly-scaffolds.csv', ',')
        }

        ## populate main dictionary
        assembly_stats[name] = [
            stat_output, sample_folder + '/' + name + '_assembly_stats.xlsx'
        ]

    else:

        ## debug message
        if (Debug):
            HCGB_aes.debug_message(
                "spades_assembler.run_module_assembly call:", "yellow")
            print("spades_assembler.run_module_assembly " + name + "\t" +
                  sample_folder + "\t" + files[0] + "\t" + files[1] + "\t" +
                  str(threads) + "\n")

        # Call spades_assembler
        code = spades_assembler.run_module_assembly(name, sample_folder,
                                                    files[0], files[1],
                                                    threads)

        if (code != 'FAIL'):
            ## success stamps
            filename_stamp = sample_folder + '/.success_all'
            stamp = HCGB_time.print_time_stamp(filename_stamp)
            assembly_stats[
                name] = code  # list containing dictionary of data and excel
        else:
            print(
                "Some error occurred for sample %s while generating the assembly. "
                % name)
Example #18
def run_search(arg_dict):
    """Main function of the search module in BacDup package.
    
    This module searches and create gene duplication analysis. 
    
    It allows the user to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy or user
    annotation data) or a single or multiple samples.    
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: **** 
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search",
        arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse",
        arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample],
                                       '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample],
                                      'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(
                annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample],
                                            '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(
                    filter_timestamp)):
                #save results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample],
                                 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(
                    colored(
                        "\t+ Filter results already available for sample %s [%s]"
                        % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample],
                                         'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(
                colored(
                    "\t+ Duplicate annotation already available for sample %s [%s]"
                    % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',',
                                              "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',',
                                             "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data
        #data2add_entry = data2add_entry.reset_index()
        data2add = pd.concat([data2add, data2add_entry], ignore_index=False)  ## DataFrame.append is deprecated in recent pandas

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder(
        "report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder(
        "dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
Example #19
def ariba_getref(database, outdir, Debug, threads):
	######################################################################################
	## usage: ariba getref [options] <db> <outprefix>
	######################################################################################
	## Download reference data from one of a few supported public resources
	## positional arguments:
	##	DB name            Database to download. Must be one of: argannot card megares plasmidfinder resfinder srst2_argannot vfdb_core vfdb_full virulencefinder
	##  outprefix          Prefix of output filenames
	######################################################################################

	## where database is one of: 
	##	argannot, card, megares, plasmidfinder, resfinder,
	##	srst2_argannot, vfdb_core, vfdb_full, virulencefinder.

	## folders
	outdir_name = outdir + '/' + database
	outdir_prepare_ref = outdir + '_prepareref'

	## download information in database folder provided by config
	print ("\t+ Retrieve information from database: " + database)

	## check if previously downloaded and succeeded
	filename_stamp = outdir + '/.success'

	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, database), 'yellow'))
		download_ariba_cmd = 'OK'
	else:
		cmd_getref = 'ariba getref %s %s' %(database, outdir_name)
		download_ariba_cmd = HCGB_sys.system_call(cmd_getref)
	
	if (download_ariba_cmd == 'OK'):
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
		## debug message
		if (Debug):
			print (colored("**DEBUG: ariba getref %s succeed " %database + "**", 'yellow'))

	else: 
		## raise error & exit
		print (colored("***ERROR: ariba getref %s failed " %database + " **",'red'))
		return('FAIL')

	## debug message
	if (Debug):
		print (colored("**DEBUG: Run ariba prepareref %s " %database + "**", 'yellow'))

	## check if previously prepareref and succeeded
	filename_stamp_prepare = outdir_prepare_ref + '/.success'
	if os.path.isfile(filename_stamp_prepare):
		stamp =	HCGB_time.read_time_stamp(filename_stamp_prepare)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, database), 'yellow'))
	
	else:
		## get information
		list_files = os.listdir(outdir)
		fasta = ""
		metadata = ""
		for f in list_files:
			if f.endswith('tsv'):
				metadata = outdir + '/' + f
			elif f.endswith('fa'):
				fasta = outdir + '/' + f
	
		code = ariba_prepareref(fasta, metadata, outdir_prepare_ref, threads)
		
		## print success timestamp only when prepareref succeeded
		if (code == 'OK'):
			HCGB_time.print_time_stamp(filename_stamp_prepare)

	return()		
Example #20
def results_parser(database, folderResults, sampleName, outfolder,
                   assembly_cutoff, card_trick_info):
    """Parse ARIBA results
	
	This function extracts files and generates additional information for later
	parsing according to the type of database provided.
	
	.. seealso:: Additional information to ARIBA results generated.
	
		- :ref:`ARIBA-explained`
	"""
    if not os.path.exists(folderResults):
        print(
            "+ Finish parsing information for sample [%s]. Results folder does not exist."
            % sampleName)
        return ('NaN', 'NaN')

    ## get files
    list_files = os.listdir(folderResults)

    ## init
    assemblies = ""
    assemled_genes = ""
    fileResults = ""

    print("\n+ Parsing result file for sample: ", sampleName)

    ## extract files
    print("\n+ Extracting files if necessary:")
    for f in list_files:
        filePath = os.path.join(folderResults, f)
        if f.endswith('.gz'):
            HCGB_files.extract(filePath, folderResults)
        if (f == 'report.tsv'):
            fileResults = filePath
        elif (f == 'assemblies.fa.gz'):
            assemblies = os.path.join(folderResults, 'assemblies.fa')
        elif (f == 'assembled_genes.fa.gz'):
            assembled_genes = os.path.join(folderResults, 'assembled_genes.fa')
    print("\n")

    ## no results generated
    if not HCGB_files.is_non_zero_file(fileResults):
        print('+ No results generated for sample: ', sampleName)
        return ('', '')

    ### expand flags
    flagResults = folderResults + '/flags_explain.tsv'
    fileFlags = ariba_caller.ariba_expandflag(fileResults, flagResults)

    ######################
    ## generate summary
    ######################
    ##
    ## ariba has function that generates a summary for samples
    ##
    summary_results_tmp = folderResults + '/report_summary_tmp'
    summary_results = folderResults + '/report_summary.csv'
    options = "--no_tree"
    ## Info
    ## https://github.com/sanger-pathogens/ariba/wiki/The-assembled-column-from-ariba-summary

    ariba_caller.ariba_summary(summary_results_tmp, [fileResults], options)

    ## fix names: just for aesthetics
    fake_dict = {sampleName: fileResults}
    ariba_caller.fix_ariba_summary(summary_results_tmp + '.csv',
                                   summary_results, fake_dict)
    os.remove(summary_results_tmp + '.csv')

    ############################################
    ### check results according to database
    ############################################
    if (database == 'vfdb_full'):
        (name_excel, name_csv) = parse_vfdb(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results,
                                            assembly_cutoff)
    elif (database == 'card'):
        (name_excel, name_csv) = parse_card(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results,
                                            assembly_cutoff, card_trick_info)
    else:
        ## [TODO] check results according to databases different than CARD/VFDB
        (name_excel, name_csv) = parse_results(outfolder, sampleName,
                                               fileResults, fileFlags,
                                               summary_results)

    print('\tCheck additional information on ', name_excel)

    ## print success timestamp
    filename_stamp = outfolder + '/.success_' + database
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (name_excel, name_csv)
Example #21
def parse_information(arg_dict, df_accID, outdir):

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input",
                                                   arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse",
                                                   arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample],
                          'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'],
                          'yellow')
            debug_message('genome: ' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        if (not HCGB_files.is_non_zero_file(parse_timestamp)
                and not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):

                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(
                    df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(
                    folder_input, 'info.csv'),
                                              index=True,
                                              header=True)

                ## print time stamp
                HCGB_time.print_time_stamp(input_timestamp)

                ## print time stamp
                HCGB_time.print_time_stamp(parse_timestamp)
            else:
                print(
                    colored(
                        "\t+ Some error occurred for sample %s while parsing input options"
                        % sample, 'red'))

                ## print time stamp
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))

                ## print time stamp
                HCGB_time.print_time_stamp(
                    os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(
                colored(
                    "\t+ Input parsing already available for sample %s [%s]" %
                    (sample, read_time), 'green'))
            print()
Example #22
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
	Executes KMA identification jobs
	
	This function automates the process of checking whether any previous run succeeded or
	runs the appropriate identification process for the sample and database provided.
	
	:param outdir_file: Output folder for the sample results.
	:param list_files: List of fastq files (single or paired-end) for the sample.
	:param name: Sample name or tag to identify the sample.
	:param database: Database to search against.
	:param threads: Number of CPUs to use.
	:param Debug: True/False for debugging messages.
	
	:type outdir_file: string
	:type list_files: list
	:type name: string
	:type database: string
	:type threads: integer
	:type Debug: boolean
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`
	
		- :func:`BacterialTyper.module.ident.get_outfile`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
		
		
	"""

    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile
    outfile = get_outfile(outdir_dict_kma, name, database)

    ## check if previously run and succeeded
    basename_tag = os.path.basename(outfile)
    filename_stamp = outdir_dict_kma + '/.success_' + basename_tag

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:
        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: species_identification_KMA.kma_ident_module call**",
                    'yellow'))
            print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
            print("outfile: ", outfile)
            print(
                "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
            )
            print("species_identification_KMA.kma_ident_module" + "\t" +
                  outfile + "\t" + str(list_files) + "\t" + name + "\t" +
                  database + "\t" + str(threads) + "\n")

        ## Sparse or not
        #if any(name in basename_tag for name in ['userData_KMA', 'genbank_KMA']):
        #if (basename_tag == 'userData_KMA'):
        #    option = ''
        #else:
        #    option = '-Sparse '

        ## Add option to retrieve database from memory
        option = ""
        option = option + '-shm 1'

        # Call KMA
        species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                                  database, kma_bin, option,
                                                  threads)
        stamp = HCGB_time.print_time_stamp(filename_stamp)
Example #23
def download_kma_database(folder, database, debug):
	"""
	Downloads databases from KMA website.
	
	Using the latest available ftp datasets, this function downloads available datasets using
	function :func:`BacterialTyper.scripts.functions.wget_download`. 
	
	Ftp site: "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"
	
	It also downloads the md5sum for the dataset selected and compares it against the md5sum of the downloaded file.
	
	:param folder: Absolute path to folder that contains database.
	:param database: Possible options: [bacteria, archaea, protozoa, fungi, plasmids, typestrains, viral].
	:param debug: True/false for printing debugging messages.
	
	:type folder: string
	:type database: string
	:type debug: boolean
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.wget_download`

		- :func:`BacterialTyper.scripts.functions.check_md5sum`

		- :func:`BacterialTyper.scripts.functions.extract`
		
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`

		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

	"""

	## ToDo: update with latest version
	ftp_site = "http://www.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"
	
	## In v20190107 there was a plasmid database.
	#ftp_site = "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/20190107/"

	############################################################################
	## ToDo: Set automatic: download config file and look for prefix for each 
	## sample and generate a dictionary to code the prefix for each db.
	############################################################################
	
	# Database configuration file - Describes the content of the database
	# Each db consist of 5 files with the following extensions: b, comp.b, length.b, seq.b, name
	# Other important files are: .name, .kma.entries.all, .kma.entries.deleted, .kma.entries.added, .md5
	# db_prefix	name	description
	#bacteria.ATG	Bacteria Organisms	Bacteria organisms library prefix=ATG
	#plasmids.T	Bacteria Plasmids	Bacteria plasmids library prefix=T
	#typestrains.ATG	Bacteria Type Strains	Bacteria type strains library prefix=ATG
	#fungi.ATG	Fungi	Fungi library prefix=ATG
	#protozoa.ATG	Protozoa	Protozoa library prefix=ATG
	#archaea.ATG	Archaea	Archaea library prefix=ATG	
	
	HCGB_files.create_folder(folder)
	
	## debug message
	if (debug):
		print (colored("Function call: download_kma_database " + folder + ' ' + database + '\n','yellow'))

	## prefix
	if (database == 'plasmids'):
		prefix = '.T'
	elif (database == 'viral'):
		prefix = '.TG'
	else:
		prefix = '.ATG'
		
	index_name = os.path.join(folder, database + prefix)

	## check if already downloaded
	return_code_down = False
	if os.path.exists(folder):
		return_code_down = check_db_indexed(index_name, folder)
		## debug message
		if (debug and return_code_down):
			print (colored("Folder database is already available:" + folder,'yellow'))
		
	if (return_code_down == False): ## database not downloaded or indexed yet

		## Download data
		print ("\t+ Downloading data now, it may take a while....")

		## debug message
		if (debug):
			print (colored("Download files via function wget_download:",'yellow'))
		
		## connect to url
		url = ftp_site + database + '.tar.gz'
		HCGB_sys.wget_download(url, folder)

		md5_url = ftp_site + database + '.md5'
		HCGB_sys.wget_download(md5_url, folder)
		print ("\n\t+ Data downloaded.....")

		## get files
		files = os.listdir(folder)
		md5_sum = ""
		for f in files:
			if f.endswith('tar.gz'):
				tar_file = folder + '/' + f
			elif f.endswith('md5'):
				md5_sum = folder + '/' + f
		
		## check md5sum
		print ("\t+ Checking for integrity using md5sum")
		
		# get md5 sum from source
		md5_string = ""
		with open(md5_sum, 'r') as myfile:
			line = myfile.read()
		
		line = re.sub(r"\s", ',', line)
		md5_string = line.split(",")[0]
		
		## calculate md5 for file
		result_md5 = HCGB_sys.check_md5sum(md5_string, tar_file) ## FIXME: Not conda supported
		if (result_md5 == True):
		
			## debug message
			if (debug):
				print (colored("result md5sum matches code provided for file " + tar_file,'yellow'))

			# extract
			print ("\t+ Extracting database into destination folder: " + folder)
			HCGB_files.extract(tar_file, folder)	

		else:
			print (colored("*** ERROR: Some error occurred during the downloading and file is corrupted ***", 'red'))
			return ("Error")
			
		## database should be unzipped and containing files...
		return_code_extract = check_db_indexed(index_name, folder)
		
		if (return_code_extract):
			print("+ Database (%s) successfully extracted in folder: %s..." %(database, folder))
		else:
			string = "*** ERROR: Some error occurred during the extraction of the database (%s). Please check folder (%s) and downloading and file is corrupted ***" %(database, folder)
			print (colored(string, 'red'))
			return ("Error")
		
		## print timestamp
		filename_stamp = folder + '/.success'
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
Example No. 24
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):
	
	##
	filename_stamp_plot = folder + '/.success_plot'
	if os.path.isfile(filename_stamp_plot):
		stamp = time_functions.read_time_stamp(filename_stamp_plot)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'plot results'), 'yellow'))
	else:
	
		# PLOT and SHOW results
		RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')
	
		# create plot
		plt.figure(figsize=(16,8))
		df_genetype_2 = pd.DataFrame({'Type':RNAbiotypes_stats[0], 
									'Count':RNAbiotypes_stats[1]}).sort_values(by=['Count'])
	
		## get total count
		df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()
	
		## filter out types below 1% of the total count
		minimum = df_genetype_ReadCount_sum * 0.01
		df_genetype_filter_greater = df_genetype_2[ df_genetype_2['Count'] >= minimum ]
		df_genetype_filter_smaller = df_genetype_2[ df_genetype_2['Count'] < minimum ]
	
		## create %values
		df_genetype_2['Percentage'] = (df_genetype_2['Count']/df_genetype_ReadCount_sum*100).round(3)
		
		## merge and generate Other class
		df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum() ## total of filtered-out types
		## DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
		df_genetype_filter_greater2 = pd.concat([
			df_genetype_filter_greater,
			pd.DataFrame([{'Count': df_genetype_filter_smaller_sum, 'Type': 'Other'}])],
			ignore_index=True)
	
		## Create Pie Plot
		ax1 = plt.subplot(121, aspect='equal')
		df_genetype_filter_greater2.plot.pie(
			y = 'Count', 
			ax=ax1, 
			autopct='%1.2f%%', 
			shadow=False, 
			labels=df_genetype_filter_greater2['Type'], 
			legend = False)
	
		# plot table
		ax2 = plt.subplot(122)
		plt.axis('off')
		tbl = ax2.table(
			cellText=df_genetype_2.values, 
			colLabels=df_genetype_2.columns,
			loc='center', rowLoc='left', cellLoc='center', 
			)
		tbl.auto_set_font_size(True)
		#tbl.set_fontsize(12)
		tbl.scale(1.1,1.1)
	
		## set PDF name
		name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')
	
		## generate image
		plt.savefig(name_figure)
		plt.close()  ## plt.close expects a figure, not a filename

		## print time stamps
		time_functions.print_time_stamp(filename_stamp_plot)
		filename_stamp_all = folder + '/.success_all'
		time_functions.print_time_stamp(filename_stamp_all)
Example No. 25
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval
	
	This function uses the software edirect_ to connect to NCBI and retrieve information regarding samples, assemblies, publications, etc.
	
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	
	:return: Information of the identification 
	:rtype: pandas.DataFrame()
	
	See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:
	
	.. include:: ../../devel/results/edirect_download_results.csv
		:literal:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.get_info_file`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`

		- :func:`BacterialTyper.scripts.functions.optimize_threads`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
	
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.functions.is_non_zero_file`
	
		- :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`
		
		- :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`
		
	.. include:: ../../links.inc	
	"""
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species",
                                          "strain", "BioSample", "genome",
                                          "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe sample name
    sample_results = dataFrame.groupby(["Sample"])

    for name, grouped in sample_results:
        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect',
                                                     outdir_dict[name])

        ## chromosome match
        if (len(grouped.loc[grouped['Database'] == 'bacteria.ATG']
                ['#Template']) == 0):
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")

            ## initialize all fields so the dump below does not raise a NameError
            genus = ''
            species = ''
            strain = 'NaN'
            BioSample_name = ''
            AssemblyAcc = ''
            GenbankAcc = ['']
            filename_stamp = edirect_folder + '/.success_species'

        else:
            nucc_entry = grouped.loc[grouped['Database'] == 'bacteria.ATG'][
                '#Template'].values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            ##
            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'
            filename_stamp = edirect_folder + '/.success_species'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]" %
                        (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0],
                                                    out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain',
                    tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
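            ## taxa_name_tmp[0] is assumed to hold one CSV line such as
            ## (hypothetical values): Staphylococcus aureus,SAMN00000000,GCF_000000000.1,AR_0215
            ## i.e. Organism,BioSample,AssemblyAcc,Strain as requested from xtract above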
            Organism = taxa_name_tmp[0].split(',')[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = taxa_name_tmp[0].split(',')[1]  ## BioSample
            AssemblyAcc = taxa_name_tmp[0].split(',')[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(taxa_name_tmp[0].split(',')) > 3:
                strain = taxa_name_tmp[0].split(',')[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc,
                                                out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly,
                                                'DocumentSummary', 'Genbank',
                                                AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank

            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", GenbankAcc[0])

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join([i.split()[0] for i in plasmid_entries])
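        ## only the accession is kept from each entry,
        ## e.g. plasmid_entries_str = 'NZ_CP029083.1,NZ_CP029084.1'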

        ## save edirect_frame
        #("sample", "taxa", strain, genome "BioSample", "Plasmids"))
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, GenbankAcc[0],
                                                 plasmid_entries_str)

        stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
Example No. 26
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect,
               retrieve_databases):
    """Generate MLST profile identification
	
	This function uses the `MLSTar software`_ to retrieve multilocus sequence typing (MLST) profiles from PubMLST_ for the species previously identified by KMA. It generates an MLST profile for each sample. 
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`.
	:param retrieve_databases: 
	
	:type options: 
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type dataFrame_edirect: pandas.DataFrame()
	:type retrieve_databases: pandas.DataFrame()
	
	:return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information.
	:rtype: Dictionary

	
	See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:
	
	.. include:: ../../devel/results/doMLST_result_example.csv
		:literal:
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
		
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`
		
		- :func:`HCGB.sampleParser.files.get_files`
		
		- :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species`
		
	.. include:: ../../links.inc	
	"""
    ## set config
    rscript = set_config.get_exe("Rscript")

    ## TODO: Samples might not be assembled...to take into account and return 0

    ## TODO: Fix and install MLSTar during installation
    print(MLSTar.get_MLSTar_package_installed())
    exit()

    ########################################################################################

    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed

    ## debug message
    if (Debug):
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## MLST call
    HCGB_aes.boxymcboxface("MLST typing")
    print(
        "+ Creating classical MLST typing for each sample according to the species retrieved by kmer..."
    )

    ## get assembly files
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    # init
    MLST_results = {}

    ## get MLST_profile: default or provided
    mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] ==
                                               'PubMLST']['path'].tolist()
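    ## each entry is assumed to be a comma-separated string 'species,folder'
    ## (e.g. 'saureus,/path/to/PubMLST/saureus'), split below into
    ## species_mlst and species_mlst_folder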

    if (Debug):
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)

        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## Generate MLST call according to species identified for each sample
    for index, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'],
                                                     row['species'])

        if (MLSTar_taxa_name == 'NaN'):
            print(
                colored(
                    "\t- Not available PubMLST profile for sample [%s] identified as %s %s"
                    % (row['sample'], row['genus'], row['species']), 'yellow'))

        else:
            for mlst_profile in mlst_profile_list:

                ## species folder
                #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder)
                species_mlst = mlst_profile.split(',')[0]
                species_mlst_folder = mlst_profile.split(',')[1]

                ## output file
                output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
                filename_stamp = species_mlst_folder + '/.success_scheme'

                ##
                if MLSTar_taxa_name == species_mlst:
                    if os.path.isfile(filename_stamp):
                        stamp = HCGB_time.read_time_stamp(filename_stamp)
                        print(
                            colored(
                                "\tA previous command generated results on: %s"
                                % stamp, 'yellow'))
                    else:
                        ### get scheme available
                        MLSTar.getPUBMLST(MLSTar_taxa_name, rscript,
                                          output_file)
                        stamp = HCGB_time.print_time_stamp(filename_stamp)

                    ## parse and get scheme for classical MLST
                    schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

                    ##
                    for item, cluster in schemes_MLST.iterrows():
                        if cluster['len'] < 10:
                            scheme2use = int(cluster['scheme'])
                            continue
                    ###
                    sample = row['sample']
                    MLSTar_folder = HCGB_files.create_subfolder(
                        'MLST', outdir_dict[sample])
                    genome_file = assembly_samples_retrieved.loc[
                        assembly_samples_retrieved['name'] ==
                        sample]['sample'].values[0]

                    ## call MLST
                    (results, profile_folder) = MLSTar.run_MLSTar(
                        species_mlst_folder, rscript, MLSTar_taxa_name,
                        scheme2use, sample, MLSTar_folder, genome_file,
                        options.threads)
                    MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
Example No. 27
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
	
	## folder for results
	if not os.path.isdir(path):
		files_functions.create_folder(path)

	out_file = os.path.join(path, 'featureCount.out')
	logfile = os.path.join(path, name + '_RNAbiotype.log')

	filename_stamp_all = path + '/.success_all'
	if os.path.isfile(filename_stamp_all):
		stamp = time_functions.read_time_stamp(filename_stamp_all)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'RNAbiotype'), 'yellow'))
		return()

	else:
		filename_stamp_featureCounts = path + '/.success_featureCounts'
		if os.path.isfile(filename_stamp_featureCounts):
			stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
			print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'featureCounts'), 'yellow'))
		else:

			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("featureCounts system call for sample: " + name)
				print ("out_file: " + out_file)
				print ("logfile: " + logfile)
		
			## send command for feature count
			## Allow multimapping
			if allow_multimap:
				cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
			else:
				cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
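			## e.g. the assembled call (illustrative paths, stranded=0, 4 threads):
			## featureCounts -s 0 --largestOverlap -T 4 -p -t exon -g transcript_biotype \
			##	-a genes.gtf -o featureCount.out sample.bam 2> sample_RNAbiotype.log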
				
				
			## system call
			cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)
			if not cmd_featureCount_code:
				print("** ERROR: featureCount failed for sample " + name)
				exit()
				
			## print time stamp
			time_functions.print_time_stamp(filename_stamp_featureCounts)
		
		## parse results
		(extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)
		
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("extended_Stats: " + extended_Stats_file)
			print (main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
			print ("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
			print (main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

	return ()
Example No. 28
def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)
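    ## e.g. (assuming optimize_threads splits evenly): with options.threads = 8
    ## and 4 samples, threads_job = 2 and max_workers_int = 4, so four prokka
    ## jobs run in parallel with two threads each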

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print(
        "+ Detailed information for each sample can be found in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
Example No. 29
def trimmo_call(java_path, sample_folder, sample_name, files, trimmomatic_jar, threads, trimmomatic_adapters, Debug):
	##
	## Function to call trimmomatic using java. Takes single-end or paired-end files.
	## sample_folder must exist before calling this function. 
	## It can be called from main or from a module.
	## Returns 'OK' or 'FAIL' depending on whether the system call succeeded.
	## 

	#######################################
	## http://www.usadellab.org/cms/?page=trimmomatic
	#
	# ILLUMINACLIP:fasta_file.fa:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24
	#
	# This will perform the following:
	#	Remove adapters (ILLUMINACLIP:fasta_file.fa:2:30:10)
	#	Remove leading low quality or N bases (below quality 11) (LEADING:11)
	#	Remove trailing low quality or N bases (below quality 11) (TRAILING:11)
	#	Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 20 (SLIDINGWINDOW:4:20)
	#	Drop reads shorter than 24 bases (MINLEN:24)
	#######################################
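	## For a paired-end sample the command assembled below looks roughly like
	## this (hypothetical paths; conda-wrapper case, so no 'java -jar' prefix):
	# trimmomatic PE -threads 4 -trimlog out/S1_call.log \
	#	S1_R1.fastq S1_R2.fastq out/S1_trim_R1.fastq out/S1_orphan_R1.fastq \
	#	out/S1_trim_R2.fastq out/S1_orphan_R2.fastq \
	#	ILLUMINACLIP:adapters.fa:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24 2> out/S1.log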


	## debug message
	if (Debug):
		print (colored("+ Cutting adapters for sample: " + sample_name, 'yellow'))
		
	## log files
	log_file = sample_folder + '/' + sample_name + '_call.log'
	trimmo_log = sample_folder + '/' + sample_name + '.log'
	
	## init
	file_R1 = ""
	file_R2 = ""
	trim_R1 = ""
	orphan_R1 = ""
	trim_R2 = ""
	orphan_R2 = ""

	## conda installation includes a wrapper and no java jar call is required
	if trimmomatic_jar.endswith('jar'):
		cmd = "%s -jar %s"  %(java_path, trimmomatic_jar)
	else:
		cmd = "%s"  %(trimmomatic_jar)

	## Paired or single end
	## set command
	if (len(files) == 2): ## paired-end
		file_R1 = files[0]
		file_R2 = files[1]

		#print ('\t-', file_R2)
		trim_R1 = sample_folder + '/' + sample_name + '_trim_R1.fastq'
		orphan_R1 = sample_folder + '/' + sample_name + '_orphan_R1.fastq'
		trim_R2 = sample_folder + '/' + sample_name + '_trim_R2.fastq'
		orphan_R2 = sample_folder + '/' + sample_name + '_orphan_R2.fastq'

		cmd = cmd + " PE -threads %s -trimlog %s %s %s %s %s %s %s ILLUMINACLIP:%s:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24 2> %s" %(threads, log_file, file_R1, file_R2, trim_R1, orphan_R1, trim_R2, orphan_R2, trimmomatic_adapters, trimmo_log)

	else: ## single end
		file_R1 = files[0]
		trim_R1 = sample_folder + '/' + sample_name + '_trim.fastq'

		cmd = cmd + " SE -threads %s -trimlog %s %s %s ILLUMINACLIP:%s:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24 2> %s" %(threads, log_file, file_R1, trim_R1, trimmomatic_adapters, trimmo_log)

	## system call & return
	code = HCGB_sys.system_call(cmd)
	if code == 'OK':
		## success stamps
		filename_stamp = sample_folder + '/.success'
		stamp =	HCGB_time.print_time_stamp(filename_stamp)	
		return('OK')	
	else:
		return('FAIL')	
Example No. 30
def update_sample(name, cluster, own_data, user_data_db, Debug):

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: sample_frame groupby: name & cluster **",
                    'yellow'))
        print(name)
        print(cluster)

    if (name == 'report'):
        return ()

    print('\t+ Sending command for sample: ', name)

    ############################################
    #### check information for this sample
    ############################################

    ## generate sample
    dir_sample = HCGB_files.create_subfolder(name, own_data)

    if name in user_data_db.index:
        print(
            colored(
                "\t\t+ Data available in database for sample: %s. Checking integrity..."
                % name, 'yellow'))
        #functions.print_sepLine("+", 75, False)

    ## data to generate
    data2dump = pd.DataFrame(columns=('ID', 'folder', 'genus', 'species',
                                      'name', 'genome', 'GFF', 'proteins',
                                      'signature', 'profile', 'ident',
                                      'reads'))
    ## iterate over files with different tags: reads, annot, assembly, profile, ident
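    ## 'cluster' is assumed to be the per-sample dataframe slice, with a 'tag'
    ## column (reads/annot/assembly/profile/ident/mash) and a 'sample' column
    ## holding the absolute path of each file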

    ##########
    ## assembly
    ##########
    assembly_dir = HCGB_files.create_subfolder('assembly', dir_sample)
    assembly_file = cluster.loc[cluster['tag'] ==
                                'assembly']['sample'].to_list()
    if assembly_file:
        assembly_file_name = os.path.basename(assembly_file[0])
        genome = assembly_dir + '/' + assembly_file_name
        if not os.path.exists(genome):
            shutil.copy(assembly_file[0], assembly_dir)
    else:
        genome = ""

    ##########
    ## annot
    ##########
    annot_dir = HCGB_files.create_subfolder('annot', dir_sample)
    annot_files = cluster.loc[cluster['tag'] == 'annot']['sample'].to_list()
    prof = ""
    gff = ""
    if annot_files:
        for f in annot_files:
            file_name = os.path.basename(f)
            if f.endswith('faa'):
                prot = annot_dir + '/' + file_name
                if os.path.exists(prot):
                    continue
            elif f.endswith('gff'):
                gff = annot_dir + '/' + file_name
                if os.path.exists(gff):
                    continue
            shutil.copy(f, annot_dir)
    else:
        gff = ""
        prot = ""

    ##########
    ## trimm
    ##########
    trimm_dir = HCGB_files.create_subfolder('trimm', dir_sample)
    reads_files = cluster.loc[cluster['tag'] == 'reads']['sample'].to_list()
    reads = []
    if reads_files:
        for f in reads_files:
            file_name = os.path.basename(f)
            reads_name = trimm_dir + '/' + file_name
            reads.append(reads_name)
            if not os.path.exists(reads_name):
                shutil.copy(f, trimm_dir)

    ##########
    ## ident
    ##########
    ident_dir = HCGB_files.create_subfolder('ident', dir_sample)
    ident_file = cluster.loc[cluster['tag'] == 'ident']['sample'].to_list()
    if ident_file:
        file_name = os.path.basename(ident_file[0])
        ident_file_name = ident_dir + '/' + file_name
        if not os.path.exists(ident_file_name):
            shutil.copy(ident_file[0], ident_dir)
    else:
        ident_file_name = ""

    ##########
    ## profile
    ##########
    profile_dir = HCGB_files.create_subfolder('profile', dir_sample)
    profile_files = cluster.loc[cluster['tag'] ==
                                'profile']['sample'].to_list()
    profile_file = []
    if profile_files:
        for f in profile_files:
            file_name = os.path.basename(f)
            profile_file_name = profile_dir + '/' + file_name
            profile_file.append(profile_file_name)
            if not os.path.exists(profile_file_name):
                shutil.copy(f, profile_dir)

    ##########
    ## mash profile
    ##########
    mash_dir = HCGB_files.create_subfolder('mash', dir_sample)
    mash_file = cluster.loc[cluster['tag'] == 'mash']['sample'].to_list()
    if mash_file:
        file_name = os.path.basename(mash_file[0])
        sig_file = mash_dir + '/' + file_name
        if not os.path.exists(sig_file):
            shutil.copy(mash_file[0], mash_dir)
    else:
        sig_file = ""

    ############################################
    ### Dump information

    ## TODO: Add species and genus information when parsed from ident csv file
    #####
    data2dump.loc[len(data2dump)] = (name, dir_sample, 'genus', 'species',
                                     name, genome, gff, prot, sig_file,
                                     '::'.join(sorted(profile_file)),
                                     ident_file_name, '::'.join(sorted(reads)))
    #data2dump = data2dump.set_index('ID')

    ###### dump to file
    info_file = dir_sample + '/info.txt'
    data2dump.to_csv(info_file)

    ###### dump file information to file
    info_file2 = dir_sample + '/info_files.txt'
    cluster.to_csv(info_file2)

    ###### timestamp
    filename_stamp = dir_sample + '/.success'
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return ()