Example No. 1
def check_db_indexed(folder, option):
	"""Check if ARIBA_ database is indexed.
	
	It looks for the '00.info.txt' and '.success' files in the given folder.
	
	:param folder: Absolute path to the database folder (trailing slash expected).
	:param option: Whether to print additional information messages or not [YES/NO].
	
	:type folder: string 
	:type option: string
	:returns: True if the database is indexed, False otherwise.
	 
	 
	 .. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.time_functions.read_time_stamp`
	 
	.. include:: ../../links.inc
	"""
	
	## 'folder' is expected to end in '/': the database name is its last path component
	path_basename = folder.split('/')
	db_name = path_basename[-2]
	if os.path.isfile(folder + '00.info.txt'):
		if os.path.isfile(folder + '.success'):
			stamp = HCGB_time.read_time_stamp(folder + '.success')
			print (colored("\tA previous command generated results on: %s [%s]" %(stamp, db_name), 'yellow'))
			return True
	if (option == 'YES'):
		print (colored("\t- ARIBA database: " + db_name + " [ ERROR ]", 'red'))
	return False
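## Usage sketch for check_db_indexed. The path and option value below are
## hypothetical; note the trailing slash, which the path handling above relies on.
db_folder = "/data/ariba_dbs/card_prepareref/"
if check_db_indexed(db_folder, 'YES'):
    print("Database ready: " + db_folder)
else:
    print("Database missing or not indexed: " + db_folder)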
Example No. 2
def annot_caller(seq_file, sample_folder, options, name, threads):
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success'

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:

        ## debug message
        if (Debug):
            print(colored("**DEBUG: annotation.module_call call**", 'yellow'))
            print(
                " annotation.module_call (seq_file, options.kingdom, options.genera, sample_folder, name, threads)"
            )
            print(" annotation.module_call " + seq_file + "\t" +
                  options.kingdom + "\t" + options.genera + "\t" +
                  sample_folder + "\t" + name + "\t" + str(threads))

        # Call annotation
        annotation.module_call(seq_file, options.kingdom, options.genera,
                               sample_folder, name, threads)
def get_database(db_frame, Debug):
    data4db = pd.DataFrame()
    for index, row in db_frame.iterrows():
        ## information
        this_file = db_frame.loc[index]['path'] + '/info.txt'
        if os.path.isfile(this_file):
            print('+ Reading information for sample: ',
                  db_frame.loc[index]['db'])
            print(
                colored("\t+ Obtaining information from file: %s" % this_file,
                        'yellow'))
            this_db = HCGB_main.get_data(this_file, ',', 'index_col=0')
            ## DataFrame.append was removed in pandas 2.0; concat is the replacement
            data4db = pd.concat([data4db, this_db])
            timestamp = db_frame.loc[index]['path'] + '/.success'
            if os.path.isfile(timestamp):
                stamp = HCGB_time.read_time_stamp(timestamp)
                print(colored("\t+ Data generated on: %s" % stamp, 'yellow'))

            HCGB_aes.print_sepLine("*", 25, False)

    ## index by ID
    if not data4db.empty:
        data4db = data4db.set_index('ID')

    return (data4db)
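## Minimal sketch of the input get_database expects: a dataframe with 'db' and
## 'path' columns, one row per database folder. Column names are taken from the
## lookups above; the folder paths are hypothetical.
import pandas as pd

db_frame = pd.DataFrame({
    'db':   ['sample1', 'sample2'],
    'path': ['/data/database/sample1', '/data/database/sample2'],
})
data4db = get_database(db_frame, Debug=False)
print(data4db)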
Example No. 4
def ariba_run_caller(db2use, db_name, list_files, folder_out, threads, cutoff):
    ## check if already is done
    # generate a stamp when finish parsing each file

    ## make stamp time
    filename_stamp = os.path.join(folder_out, '.success_' + db_name)
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        files_names = [os.path.basename(s) for s in list_files]
        print(
            colored(
                "\tA previous command generated results on: %s [Files: %s]" %
                (stamp, files_names), 'yellow'))

    else:
        if os.path.exists(folder_out):
            shutil.rmtree(
                folder_out)  ## delete folder if exists but failed before

        ## call
        code = ariba_caller.ariba_run(db2use, list_files, folder_out, threads,
                                      cutoff)
        if code == 'FAIL':
            print("*** ERROR: System call failed for ", folder_out)

        ## print success timestamp
        HCGB_time.print_time_stamp(filename_stamp)
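## Usage sketch for ariba_run_caller (hypothetical paths and cutoff). Because
## the stamp file is suffixed with the database name ('.success_card' here),
## the same output folder can be checked independently for several ARIBA databases.
reads = ['/reads/sample1_R1.fastq.gz', '/reads/sample1_R2.fastq.gz']
ariba_run_caller('/data/ariba_dbs/card_prepareref', 'card', reads,
                 '/analysis/sample1/ariba_card', threads=4, cutoff=0.90)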
Example No. 5
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """
	Function that checks and generates annotation.
	
	- It uses Prokka_ via :func:`BacterialTyper.scripts.annotation.prokka_call`.
	
	- It checks if results were previously generated and succeeded.
	
	- Once finished, it prints a timestamp.
	
	:param sequence_fasta: Assembled sequences in fasta file format. 
	:param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
	:param genus: Available genus options for Prokka software. See details above.
	:param path: Absolute path to the output folder to include results.
	:param name: Sample name and tag to include in the annotation report and files.
	:param threads: Number of CPUs to use.
	  
	:type sequence_fasta: string
	:type kingdom: string
	:type genus: string 
	:type path: string 
	:type name: string 
	:type threads: integer 
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.set_config.get_exe`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`HCGB.functions.time_functions.print_time_stamp`
				
		- :func:`BacterialTyper.scripts.annotation.prokka_call`

	.. include:: ../../links.inc	 	
	"""

    ## check if previously assembled and succeeded
    filename_stamp = path + '/.success'

    if os.path.isdir(path):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(
                colored(
                    "\tA previous command generated results on: %s [%s]" %
                    (stamp, name), 'yellow'))
            return ()

    ## call prokka
    prokka_bin = set_config.get_exe('prokka')
    dirname = prokka_call(prokka_bin, sequence_fasta, kingdom, genus, path,
                          name, threads)

    ## success stamps
    filename_stamp = path + '/.success'
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (dirname)
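## Usage sketch for module_call (hypothetical sample); kingdom and genus values
## follow the Prokka options described in the docstring above.
annot_folder = module_call('/analysis/sample1/assembly/sample1.fna',
                           'Bacteria', 'Staphylococcus',
                           '/analysis/sample1/annot', 'sample1', 4)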
Example No. 6
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
	"""Generate SPADES system call.
	
	It calls system for SPADES and generates time stamp file in the folder provided (sample_folder + '/.success_assembly') for later analysis.
	
	Steps:
	
	- It generates system call for SPADES assembly. 
	
	- It generates timestamp file.
	
	:param sample_folder: Absolute path to store results. It must exist.
	:param file1: Absolute path to fastq reads (R1).
	:param file2: Absolute path to fastq reads (R2).
	:param name: Sample name or tag to identify sample.
	:param SPADES_bin: Binary executable for SPADES assembly software.
	:param options: Extra options for SPADES; plasmid assembly is possible if specified via options (--plasmid).
	:param threads: Number of CPUs to use.
	
	:type name: string
	:type sample_folder: string
	:type file1: string
	:type file2: string
	:type SPADES_bin: string
	:type options: string
	:type threads: integer
	
	:return: Returns **OK** if assembly process succeeded and fasta file is generated.
	:rtype: string.
	:warnings: Returns **FAIL** if assembly process stopped.
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.main_functions.system_call`
	
		- :func:`HCGB.functions.time_functions.print_time_stamp`
	"""
	
	## check if previously assembled and succeeded
	filename_stamp = sample_folder + '/.success_assembly'
	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
		return('OK')

	## call system for SPADES sample given
	logFile = sample_folder + '/' + name + '.log'
	
	## command (note: 'options' is concatenated directly before '-t', so it must be empty or end with a space)
	cmd_SPADES = '%s %s-t %s -o %s -1 %s -2 %s > %s 2> %s' %(SPADES_bin, options, threads, sample_folder, file1, file2, logFile, logFile)
	code = HCGB_sys.system_call(cmd_SPADES)
	
	if (code == 'OK'):
		## success stamps
		filename_stamp = sample_folder + '/.success_assembly'
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
		return('OK')

	return "FAIL"
def check_results(db2use, outdir_sample, assembly_cutoff, card_trick_info):
    """
	.. seealso:: Additional information to ARIBA results generated.
	
		- :ref:`ARIBA-explained`
	
	"""

    ##
    ## outdir_sample is a dataframe containing information of the output folder generated by ariba.
    ## It is index for each database and for each sample.
    ## This function iterates for each sample and generates call to specific function to parse results.
    ##

    ## iterate multi-index dataframe
    dataFrame_results = pd.DataFrame(columns=("csv", "excel", "database"))
    for sample, data in outdir_sample.groupby(level='sample'):
        for database, data2 in data.groupby(level='db'):
            if (database != db2use):
                continue

            folderResults = data2.loc[sample, db2use]['output']
            outfolder = data2.loc[sample, db2use]['dirname']
            if db2use == 'card':
                database = 'card'
                name_db = 'CARD'
            elif db2use == 'vfdb_full':
                database = 'vfdb_full'
                name_db = 'VFDB'
            else:
                database = 'other'
                name_db = 'other'

            ## might generate conflicts if several other databases provided
            ## TODO: check
            filename_stamp = outfolder + '/.success_' + database
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]" %
                        (stamp, sample), 'yellow'))
                name_excel = outfolder + '/' + sample + '_' + name_db + '_results.xlsx'
                name_csv = outfolder + '/' + sample + '_' + name_db + '_summary.csv'

            else:
                (name_excel,
                 name_csv) = results_parser(database, folderResults, sample,
                                            outfolder, assembly_cutoff,
                                            card_trick_info)

            ## store results to return
            dataFrame_results.loc[sample] = (name_csv, name_excel, name_db)

    return (dataFrame_results)
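## Sketch of the multi-index dataframe check_results iterates over: rows are
## indexed by ('sample', 'db') and carry the ARIBA output folder and its parent
## directory. Index and column names come from the lookups above; values are
## hypothetical.
import pandas as pd

idx = pd.MultiIndex.from_tuples([('sample1', 'card')], names=['sample', 'db'])
outdir_sample = pd.DataFrame({'output':  ['/analysis/sample1/ariba_card/run'],
                              'dirname': ['/analysis/sample1/ariba_card']},
                             index=idx)
results = check_results('card', outdir_sample, 0.90, card_trick_info=None)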
Example No. 8
def init_db_object(debug):
    """Instantiate the ete taxonomy object     
    Created by Joe R. J. Healey; Nick Youngblut
    Original code.
    """
    # Instantiate the ete NCBI taxa object
    print("+ ------------------------------------- +")
    print("+ Looking for NCBI taxonomy database:")
    ncbi = NCBITaxa()

    ## dbfile location
    if debug:
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message(
            'NCBI Taxonomy database is stored at {}\n'.format(ncbi.dbfile),
            "yellow")

    ## the database is downloaded to: ~/.etetoolkit/taxa.sqlite
    db_folder = os.path.dirname(ncbi.dbfile)

    ## check timestamp, update if necessary
    filename_stamp_parse = db_folder + '/timestamp_db.txt'
    if os.path.isfile(filename_stamp_parse):
        stamp = time_functions.read_time_stamp(filename_stamp_parse)
        days_passed = time_functions.get_diff_time(filename_stamp_parse)

        ## debug messages
        if debug:
            debug_message('Database previously initiated', "yellow")
            debug_message('on date: {}'.format(stamp), "yellow")
            debug_message('Days passed: {}'.format(days_passed), "yellow")

        if (days_passed > 30):
            ## update_db
            update_db(ncbi, db_folder, debug)
        else:
            ## debug messages
            if debug:
                debug_message('No need to update db', "yellow")
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )

            print(
                colored(
                    "\tA previous command generated results on: %s [%s]" %
                    (stamp, 'init database'), 'yellow'))
    else:
        ## create first timestamp
        time_functions.print_time_stamp(filename_stamp_parse)

    return ncbi
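## Usage sketch: the first call lets ete3 download ~/.etetoolkit/taxa.sqlite and
## writes the timestamp file; later calls only trigger update_db() after 30 days.
ncbi = init_db_object(debug=False)
print(ncbi.get_taxid_translator([562]))   ## {562: 'Escherichia coli'}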
Example No. 9
def trimmo_caller(list_reads, sample_folder, name, threads, Debug, adapters):
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:
        # Call trimmomatic
        trimmomatic_call.trimmo_module(list_reads, sample_folder, name,
                                       threads, Debug, adapters)
Example No. 10
def create_blast_results(sample, fasta_file, outdir, debug):
    '''Creates BLAST results for each fasta vs. itself'''
    
    #phr is the header file, pin is the index file, psq is the sequence file
    
    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
    
    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps 
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')
        
    if (not HCGB.functions.files_functions.is_non_zero_file(search_timestamp)):

        ## get binaries from the configuration
        ## (hardcoded /usr/bin paths left over from debugging removed; they defeated the lookup)
        (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
        
        ## check if db is indexed already
        db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
        if (not HCGB.functions.files_functions.is_non_zero_file(db_timestamp)):
            ## generate blastdb for genome
            HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot') # HCGB function    
        
            ## print time stamp
            HCGB_time.print_time_stamp(db_timestamp)
        
        else:
            read_time = HCGB_time.read_time_stamp(db_timestamp)
            print (colored("\t+ BLAST database already available for sample %s [%s]" %(sample, read_time), 'green'))
            
        ## create blastp outfile
        HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1) # HCGB function

        ## print time stamp
        HCGB_time.print_time_stamp(search_timestamp)
    else:
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print (colored("\t+ Duplicate search already available for sample %s [%s]" %(sample, read_time), 'green'))
            
    return (raw_blast)
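## Usage sketch for create_blast_results (hypothetical protein fasta). The two
## stamp files above make the makeblastdb and blastp steps independently
## resumable: a failed search can be retried without rebuilding the database.
raw_blast = create_blast_results('sample1',
                                 '/analysis/sample1/proteins.faa',
                                 '/analysis/sample1/blast', debug=False)
print("BLAST results written to: " + raw_blast)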
def download_VFDB_files(folder):
    ##
    ## Given a folder, check if it contains VFDB information
    ## or download it from website: http://www.mgc.ac.cn
    ##
    links = (
        "http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz",
        "http://www.mgc.ac.cn/VFs/Down/Comparative_tables_from_VFDB.tar.gz")

    ## check if data is downloaded, how old is the data and if it is necessary to download again
    ## consider >30 days long enough to be updated again

    ## time stamp
    filename_stamp = folder + '/download_timestamp.txt'
    if os.path.exists(folder):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print("+ A previous download generated results on: ", stamp)
            days_passed = HCGB_time.get_diff_time(filename_stamp)
            print("\t\t** %s days ago" % days_passed)
            if (days_passed > 30):  ## download again
                print(
                    "\t\t** Downloading information again just to be sure...")
            else:
                print("\t\t** No need to download data again.")
                return ()
    else:
        HCGB_files.create_folder(folder)

    ## iterate the links tuple and download each file
    print('+ Downloading files:\n')
    for line in links:
        if not line.startswith('#'):
            HCGB_sys.wget_download(line, folder)

    ## decompress files
    print('+ Decompressing gzip files\n')
    files = os.listdir(folder)
    for item in files:
        #print (folder)
        if item.endswith('.gz'):
            HCGB_files.extract(folder + '/' + item, folder)

    ## make stamp time
    HCGB_time.print_time_stamp(filename_stamp)

    return ()
Example No. 12
def prepare_card_data(database_folder):
	
	## create CARD folder
	abs_folder = os.path.abspath(database_folder)
	CARD_folder = HCGB_files.create_subfolder('CARD', abs_folder)
	
	## make stamp time
	filename_stamp = CARD_folder + '/.success'

	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [CARD Ontology Data]" %stamp, 'yellow'))

		## check time passed
		days_passed = HCGB_time.get_diff_time(filename_stamp)
		print ("\t** %s days ago" %days_passed)		
		if (days_passed > 30): ## download again
			print ("\t ** Downloading information again just to be sure...")
			download=True
		else:
			print ("\t ** No need to download data again.")
			download=False
	else:
		download=True

	###
	if download:
		## update database in the given path
		aro_obo_file = card_trick.ontology_functions.update_ontology(CARD_folder, False)
	
		## get ontology and save it in csv
		return_frame = card_trick.ontology_functions.parse_ontology(aro_obo_file, False)
	
		### if success return folder name
		if not return_frame.empty:
			## success stamps
			filename_stamp = CARD_folder + '/.success'
			stamp =	HCGB_time.print_time_stamp(filename_stamp)	
		else:
			return ('FAIL')

	## return folder name
	return(CARD_folder)
Example No. 13
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)
       
    ## check if previously process and succeeded
    filename_stamp = subdir + '/.success'
    
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, tag), 'yellow'))
        code = 'OK'
    else:
        # Call variant calling
        code = variant_calling.snippy_call(reference, files, threads, subdir, 
                                           sample_name, contig_option, other_options, Debug)
        if code == 'OK':
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    return(code)
Example No. 14
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters"""
    
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()
    
    print ("+ Checking agr genes for each sample retrieved...")
    
    agrvate_results = pd.DataFrame()
    
    ## No need to optimize. There is a problem with the working dir of agrvate and we 
    ## need to change every time.
    for name, assembly_file in dict_assemblies.items():
        sample_folder = HCGB_files.create_folder(dict_folders[name])
        ## check if previously done and succeeded
        filename_stamp = sample_folder + '/.success'
        if os.path.isfile(filename_stamp):
            stamp =  HCGB_time.read_time_stamp(filename_stamp)
            print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        else:
            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')
            
            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    
    print ("+ Jobs finished%s\n+ Collecting information for all samples...")
    os.chdir(path_here)
    
    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)
    
    return(agrvate_results)
Example No. 15
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):

	my_out_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)
	## timestamp
	filename_stamp =  my_out_folder + '/.success'

	print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))
		
	## check previous run
	if os.path.isfile(filename_stamp):
		timestamp = HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tSuccessfully run on date: %s"  %timestamp, 'green'))
	else:
	
		busco_bin = set_config.get_exe('busco')
		os.chdir(output_name)
		
		## init cmd configuration
		cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' %(busco_bin, fasta, threads, mode, busco_db)
		
		## options if autolineage or given dataset
		if "auto-lineage" == dataset_name:
			logFile = 'auto_lineage.log'
			cmd = cmd + '--auto-lineage -o %s > %s' %(dataset_name, logFile)
		else:
			logFile = dataset_name + '.log'
			cmd = cmd + '-l %s -o %s > %s' %(dataset_name, dataset_name, logFile)
		
		## system call
		HCGB_sys.system_call(cmd)
		
		if os.path.isfile(my_out_folder + '/short_summary.txt'):
			## timestamp
			HCGB_time.print_time_stamp(filename_stamp)
		else:
			print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
			return ('FAIL')

	return()
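## Usage sketch for BUSCO_run (hypothetical sample, folders and download path).
## Passing 'auto-lineage' as dataset_name switches to BUSCO's lineage
## auto-detection branch above instead of a fixed lineage dataset.
BUSCO_run('sample1', '/analysis/sample1/assembly/sample1.fna', 4,
          '/analysis/sample1/BUSCO', 'bacteria_odb10', 'genome',
          '/data/busco_downloads')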
Example No. 16
def mapReads_caller(files, folder, name, threads, STAR_exe, genomeDir,
                    limitRAM_option, Debug):
    ## check if previously joined and succeeded
    filename_stamp = folder + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = time_functions.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s -- %s]" %
                (stamp, name, 'STAR'), 'yellow'))
    else:
        ##
        if Debug:
            print("\n** DEBUG: mapReads_caller options **\n")
            print("folder: " + folder)
            print("name: " + name)
            print("threads: " + str(threads))
            print("STAR_exe: " + STAR_exe)
            print("genomeDir: " + genomeDir)
            print("limitRAM_option: " + str(limitRAM_option))
            print("files: ")
            print(files)

        # Call STAR
        code_returned = mapReads.mapReads("LoadAndKeep", files, folder, name,
                                          STAR_exe, genomeDir, limitRAM_option,
                                          threads, Debug)

        if (code_returned):
            time_functions.print_time_stamp(filename_stamp)
        else:
            print("+ Mapping sample %s failed..." % name)

    ## return results (mapping_results is a module-level dictionary shared across calls)
    bam_file = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')
    mapping_results[name] = bam_file

    return ()
Example No. 17
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):
	
	##
	filename_stamp_plot = folder + '/.success_plot'
	if os.path.isfile(filename_stamp_plot):
		stamp = time_functions.read_time_stamp(filename_stamp_plot)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'plot results'), 'yellow'))
	else:
	
		# PLOT and SHOW results
		RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')
	
		# create plot
		plt.figure(figsize=(16,8))
		df_genetype_2 = pd.DataFrame({'Type':RNAbiotypes_stats[0], 
									'Count':RNAbiotypes_stats[1]}).sort_values(by=['Count'])
	
		## get total count
		df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()
	
		## filter out categories below 1% of the total counts
		minimum = df_genetype_ReadCount_sum * 0.01
		df_genetype_filter_greater = df_genetype_2[ df_genetype_2['Count'] >= minimum ]
		df_genetype_filter_smaller = df_genetype_2[ df_genetype_2['Count'] < minimum ]
	
		## create %values
		df_genetype_2['Percentage'] = (df_genetype_2['Count']/df_genetype_ReadCount_sum*100).round(3)
		
		## merge small categories into a single 'Other' class
		df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum() ## total of the small categories
		## DataFrame.append was removed in pandas 2.0; concat is the replacement
		df_genetype_filter_greater2 = pd.concat([df_genetype_filter_greater,
			pd.DataFrame([{'Count':df_genetype_filter_smaller_sum, 'Type':'Other'}])],
			ignore_index=True)
	
		## Create Pie Plot
		ax1 = plt.subplot(121, aspect='equal')
		df_genetype_filter_greater2.plot.pie(
			y = 'Count', 
			ax=ax1, 
			autopct='%1.2f%%', 
			shadow=False, 
			labels=df_genetype_filter_greater2['Type'], 
			legend = False)
	
		# plot table
		ax2 = plt.subplot(122)
		plt.axis('off')
		tbl = ax2.table(
			cellText=df_genetype_2.values, 
			colLabels=df_genetype_2.columns,
			loc='center', rowLoc='left', cellLoc='center', 
			)
		tbl.auto_set_font_size(True)
		#tbl.set_fontsize(12)
		tbl.scale(1.1,1.1)
	
		## set PDF name
		name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')
	
		## generate image and close the current figure
		plt.savefig(name_figure)
		plt.close()

		## print time stamps
		time_functions.print_time_stamp(filename_stamp_plot)
		filename_stamp_all = folder + '/.success_all'
		time_functions.print_time_stamp(filename_stamp_all)
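## Sketch of the input pie_plot_results expects: a headerless two-column TSV of
## biotype and read count, as written by parse_featureCount below. The file
## content and paths shown here are hypothetical:
##
##   protein_coding	8500000
##   lincRNA	120000
##   tRNA	45000
pie_plot_results('/analysis/sample1/sample1_RNAbiotype.tsv',
                 'sample1', '/analysis/sample1', Debug=False)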
Example No. 18
def parse_featureCount(out_file, path, name, bam_file, Debug):
	"""
	Parses featureCount results for RNAbiotype analysis.
	
	:param out_file: Name provided to featureCount for output results.
	:param path:
	:param name:
	
	
	"""

	## file names
	out_tsv_file_name = out_file + '.tsv'
	RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

	##
	filename_stamp_parse = path + '/.success_parse'
	if os.path.isfile(filename_stamp_parse):
		stamp = time_functions.read_time_stamp(filename_stamp_parse)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'parse results'), 'yellow'))
	else:
	
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("Parse results for sample: " + name)
			
		## parse results
		out_tsv_file = open(out_tsv_file_name, 'w')
		RNA_biotypes_file = open(RNA_biotypes_file_name, 'w')
		tRNA_count = 0
		
		##########################################
		### read count file
		##########################################
		count_file = open(out_file)
		count_file_text = count_file.read()
		count_file_lines = count_file_text.splitlines()	
	
		for line in count_file_lines:
			if line.startswith('#'):
				continue
			elif line.startswith('Geneid'):
				continue
			else:
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
				tRNA_search = re.search(r".*tRNA", ID)
				if tRNA_search:
					tRNA_count = int(tRNA_count) + int(count)				
				elif (count > 0):
					RNA_biotypes_file.write(string2write_raw)
		
		## count and summary tRNA
		string2write = "tRNA\t%s\n" %tRNA_count
		RNA_biotypes_file.write(string2write)
		RNA_biotypes_file.close()
				
		##########################################
		### read summary count file
		##########################################
		summary_count_file = open(out_file + '.summary')
		summary_count_file_text = summary_count_file.read()
		summary_count_file_lines = summary_count_file_text.splitlines()	
	
		for line in summary_count_file_lines:
			if line.startswith('Status'):
				continue
			elif line.startswith('Assigned'):
				continue
			else:
				## adds Unassigned_Ambiguity
				## adds Unassigned_NoFeatures
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
	
				## skip empty entries
				if count == 0:
					continue
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
		##########################################
		## get mapping statistics according to mapping software
		##########################################
		count_multi = 0
		count_unmap = 0
		mapping_folder = os.path.dirname(bam_file)
		mapping_stats = mapping_folder + '/Log.final.out'
		
		## -------------------------------- ##
		### STAR mapping		
		## -------------------------------- ##
		if files_functions.is_non_zero_file(mapping_stats):
			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("STAR mapping available for sample: " + name)
				print ("mapping_folder: " + mapping_folder)
	
			mapping_stats_file = open(mapping_stats)
			mapping_stats_file_text = mapping_stats_file.read()
			mapping_stats_file_lines = mapping_stats_file_text.splitlines()	
	
			for line in mapping_stats_file_lines:
				multi_search = re.search(r".*Number of reads mapped to", line)
				unmap_search = re.search(r".*unmapped.*", line)
				input_search = re.search(r".*input reads.*", line)
			
				if input_search:
					total_input_reads = int(line.split('\t')[-1])
	
				if multi_search:
					count_tmp = int(line.split('\t')[-1])
					count_multi = count_multi + count_tmp
	
				elif unmap_search:
					perc_tmp = line.split('\t')[-1]
					count_reads = math_functions.percentage(perc_tmp, total_input_reads)
					count_unmap = count_unmap + count_reads
		else:
	
			## -------------------------------- ##
			## tophat
			## -------------------------------- ##
	
			mapping_stats = mapping_folder + '/align_summary.txt' 
			count_map = 0
			total_input_reads = 0
			
			if files_functions.is_non_zero_file(mapping_stats):
				## debugging messages
				if Debug:
					print ("** DEBUG:")
					print ("tophat mapping available for sample: " + name)
					print ("mapping_folder: " + mapping_folder)
				
				mapping_stats_file = open(mapping_stats)
				mapping_stats_file_text = mapping_stats_file.read()
				mapping_stats_file_lines = mapping_stats_file_text.splitlines()	
	
				for line in mapping_stats_file_lines:
					map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
					input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
					if input_search2:
						total_input_reads = input_search2.group(1)
					if map_search2:
						count_map = map_search2.group(1)
		
				####
				count_unmap = int(total_input_reads) - int(count_map)
	
			else:
				## other
				print ("Neither tophat or STAR..., no mapping statistics")
	
		### print mapping stats
		string2write_unmap = "unmapped\t%s\n" %count_unmap
		out_tsv_file.write(string2write_unmap)
		
		## close files
		out_tsv_file.close()

		## print timestamp
		time_functions.print_time_stamp(filename_stamp_parse)

	return(out_tsv_file_name, RNA_biotypes_file_name)
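## Usage sketch for parse_featureCount (hypothetical paths). The BAM file is
## only used to locate the mapper logs (STAR's Log.final.out or TopHat's
## align_summary.txt) in the same folder.
(counts_tsv, biotypes_tsv) = parse_featureCount(
    '/analysis/sample1/featureCount.out', '/analysis/sample1', 'sample1',
    '/analysis/sample1/map/Aligned.sortedByCoord.out.bam', Debug=False)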
Example No. 19
def check_sample_assembly(name, sample_folder, files, threads):
    """Checks if sample is assembled.
	
	It checks whether a sample is assembled or not by reading file *sample_folder/.success_all*. 
	
	If the file is not available (no previous assembly, or it did not succeed), it calls :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly` to generate the assembly for the specified sample.
	
	:param name: Sample name or tag to identify sample.
	:param sample_folder: Directory to write assembly output. It must exist.
	:param files: List containing files (fastq R1 & R2) for the sample to be assembled.
	:param threads: Number of CPUs to use
	:type name: string
	:type sample_folder: string 
	:type files: list
	:type threads: integer
	
	:return: Populates dictionary assembly_stats with assembly stats dictionary information
	:rtype: Dataframe
	
	.. seealso:: This function depends on other BacterialTyper and HCGB functions called:
	
		- :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
	
	"""
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_all'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))

        ## Get information
        stat_output = {
            'Contig Stats':
            HCGB_main.file2dictionary(
                sample_folder + '/' + name + '_assembly-contigs.csv', ','),
            'Scaffold Stats':
            HCGB_main.file2dictionary(
                sample_folder + '/' + name + '_assembly-scaffolds.csv', ',')
        }

        ## populate main dictionary
        assembly_stats[name] = [
            stat_output, sample_folder + '/' + name + '_assembly_stats.xlsx'
        ]

    else:

        ## debug message
        if (Debug):
            HCGB_aes.debug_message(
                "spades_assembler.run_module_assembly call:", "yellow")
            print("spades_assembler.run_module_assembly " + name + "\t" +
                  sample_folder + "\t" + files[0] + "\t" + files[1] + "\t" +
                  str(threads) + "\n")

        # Call spades_assembler
        code = spades_assembler.run_module_assembly(name, sample_folder,
                                                    files[0], files[1],
                                                    threads)

        if (code != 'FAIL'):
            ## success stamps
            filename_stamp = sample_folder + '/.success_all'
            stamp = HCGB_time.print_time_stamp(filename_stamp)
            assembly_stats[
                name] = code  # list containing dictionary of data and excel
        else:
            print(
                "Some error occurred for sample %s while generating the assembly. "
                % name)
Example No. 20
def ariba_getref(database, outdir, Debug, threads):
	######################################################################################
	## usage: ariba getref [options] <db> <outprefix>
	######################################################################################
	## Download reference data from one of a few supported public resources
	## positional arguments:
	##	DB name            Database to download. Must be one of: argannot card megares plasmidfinder resfinder srst2_argannot vfdb_core vfdb_full virulencefinder
	##  outprefix          Prefix of output filenames
	######################################################################################

	## where database is one of: 
	##	argannot, card, megares, plasmidfinder, resfinder,
	##	srst2_argannot, vfdb_core, vfdb_full, virulencefinder.

	## folders
	outdir_name = outdir + '/' + database
	outdir_prepare_ref = outdir + '_prepareref'

	## download information in database folder provided by config
	print ("\t+ Retrieve information from database: " + database)

	## check if previously downloaded and succeeded
	filename_stamp = outdir + '/.success'

	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
		download_ariba_cmd = 'OK'
	else:
		cmd_getref = 'ariba getref %s %s' %(database, outdir_name)
		download_ariba_cmd = HCGB_sys.system_call(cmd_getref)
	
	if (download_ariba_cmd == 'OK'):
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
		## debug message
		if (Debug):
			print (colored("**DEBUG: ariba getref %s succeed " %database + "**", 'yellow'))

	else: 
		## rise error & exit
		print (colored("***ERROR: ariba getref %s failed " %database + " **",'red'))
		return('FAIL')

	## debug message
	if (Debug):
		print (colored("**DEBUG: Run ariba prepareref %s " %database + "**", 'yellow'))

	## check if previously prepareref and succeeded
	filename_stamp_prepare = outdir_prepare_ref + '/.success'
	if os.path.isfile(filename_stamp_prepare):
		stamp =	HCGB_time.read_time_stamp(filename_stamp_prepare)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
	
	else:
		## get information
		list_files = os.listdir(outdir)
		fasta = ""
		metadata = ""
		for f in list_files:
			if f.endswith('tsv'):
				metadata = outdir + '/' + f
			elif f.endswith('fa'):
				fasta = outdir + '/' + f
	
		code = ariba_prepareref(fasta, metadata, outdir_prepare_ref, threads)
		
		if (code == 'OK'):
			## only stamp the prepareref folder on success
			HCGB_time.print_time_stamp(filename_stamp_prepare)

	return()		
Example No. 21
def download_ariba_databases(list_dbs, main_folder, Debug, threads):

	"""Download ARIBA_ databases.
	
	Using ARIBA software this function retrieves desired databases and prepare them for later analysis.
	
	:param list_dbs: List of databases to download.
	:param main_folder: Absolute path to database folder.
	:param Debug: True/False for printing developer messages.
	:param threads: Number of CPUs to use.
	
	:type list_dbs: string 
	:type main_folder: string
	:type Debug: Boolean
	:type threads: integer
	
	 .. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.file_functions.create_subfolder`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`
	
		- :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`		
		
	 
	.. include:: ../../links.inc
	"""

	print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
	ariba_folder = HCGB_files.create_subfolder("ARIBA", main_folder)

	## print ARIBA databases: 
	print ("+ Available databases:")
	dbs = get_ARIBA_dbs(list_dbs)
	
	for db_set in dbs:

		HCGB_aes.print_sepLine("-",30, False)
		print (colored("+ " + db_set,'yellow'))
		
		## prepare folders
		folder_set = HCGB_files.create_subfolder(db_set, ariba_folder)
		outdir_prepare_ref = folder_set + '_prepareref'

		## stamp time file
		filename_stamp_prepare = outdir_prepare_ref + '/.success'
	
		## check if previously done
		if os.path.isfile(filename_stamp_prepare):
			stamp =	HCGB_time.read_time_stamp(filename_stamp_prepare)
			print ("\t+ Database is downloaded in folder: ", folder_set)
			print ("\t+ Data is available and indexed in folder: ", outdir_prepare_ref)
			print (colored("\tDatabase was previously downloaded and prepared on: %s" %stamp, 'yellow'))
		
			## Check if necessary to download again after several months/days
			days_passed = HCGB_time.get_diff_time(filename_stamp_prepare)
			print ("\t\t** %s days ago" %days_passed)		
			if (days_passed > 30): ## download again
				print ("\t\t** Downloading information again just to be sure...")
				return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
			else:
				return_ariba_getref = 'OK'
		else:
			return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
		
		if (return_ariba_getref == 'OK'):
			print()
		else:
			print (colored("** ARIBA getref failed or generated a warning for " + db_set, 'red'))
Example No. 22
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval
	
	This functions uses the software edirect_ to connect to NCBI and retrieve some information regarding samples, assemblies, publications, etc.
	
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	
	:return: Information of the identification 
	:rtype: pandas.DataFrame()
	
	See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:
	
	.. include:: ../../devel/results/edirect_download_results.csv
		:literal:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.get_info_file`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`

		- :func:`BacterialTyper.scripts.functions.optimize_threads`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
	
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.functions.is_non_zero_file`
	
		- :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`
		
		- :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`
		
	.. include:: ../../links.inc	
	"""
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species",
                                          "strain", "BioSample", "genome",
                                          "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe sample name
    sample_results = dataFrame.groupby(["Sample"])

    for name, grouped in sample_results:
        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect',
                                                     outdir_dict[name])

        ## chromosome match
        if (len(grouped.loc[grouped['Database'] == 'bacteria.ATG']
                ['#Template']) == 0):
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")

            ## initialize defaults so the dataframe and stamp code below cannot
            ## hit undefined names when no chromosome match is found
            genus = ''
            species = ''
            strain = ''
            BioSample_name = ''
            AssemblyAcc = ''
            GenbankAcc = ['']
            filename_stamp = edirect_folder + '/.success_species'

        else:
            nucc_entry = grouped.loc[grouped['Database'] == 'bacteria.ATG'][
                '#Template'].values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            ##
            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'
            filename_stamp = edirect_folder + '/.success_species'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]" %
                        (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0],
                                                    out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain',
                    tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
            Organism = taxa_name_tmp[0].split(',')[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = taxa_name_tmp[0].split(',')[1]  ## BioSample
            AssemblyAcc = taxa_name_tmp[0].split(',')[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(taxa_name_tmp[0].split(',')) > 3:
                strain = taxa_name_tmp[0].split(',')[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc,
                                                out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly,
                                                'DocumentSummary', 'Genbank',
                                                AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank

            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", GenbankAcc[0])

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join([i.split()[0] for i in plasmid_entries])

        ## save into edirect_frame:
        ## ("sample", "genus", "species", "strain", "BioSample", "genome", "Plasmids")
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, GenbankAcc[0],
                                                 plasmid_entries_str)

        stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
Example No. 23
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
	
	## folder for results
	if not os.path.isdir(path):
		files_functions.create_folder(path)

	out_file = os.path.join(path, 'featureCount.out')
	logfile = os.path.join(path, name + '_RNAbiotype.log')

	filename_stamp_all = path + '/.success_all'
	if os.path.isfile(filename_stamp_all):
		stamp = time_functions.read_time_stamp(filename_stamp_all)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'RNAbiotype'), 'yellow'))
		return()

	else:
		filename_stamp_featureCounts = path + '/.success_featureCounts'
		if os.path.isfile(filename_stamp_featureCounts):
			stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
			print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'featureCounts'), 'yellow'))
		else:

			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("featureCounts system call for sample: " + name)
				print ("out_file: " + out_file)
				print ("logfile: " + logfile)
		
			## send command for feature count
			## Allow multimapping
			if allow_multimap:
				cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
			else:
				cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
				
				
			## system call
			cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)
			if not cmd_featureCount_code:
				print("** ERROR: featureCount failed for sample " + name)
				exit()
				
			## print time stamp
			time_functions.print_time_stamp(filename_stamp_featureCounts)
		
		## parse results
		(extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)
		
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("extended_Stats: " + extended_Stats_file)
			print (main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
			print ("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
			print (main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

	return ()
Example No. 24
def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print(
        "+ Detailed information for each sample can be found in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
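## Standalone sketch of the thread-splitting pattern used above: total CPUs are
## divided among samples (threads_job) and the quotient caps the number of
## concurrent workers. The HCGB helpers are replaced here by plain Python.
import concurrent.futures

def split_threads_demo(samples, total_threads=8):
    threads_job = max(1, total_threads // max(1, len(samples)))
    max_workers = int(total_threads / threads_job)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        sent = {executor.submit(print, s, "->", threads_job, "threads"): s
                for s in samples}
        for fut in concurrent.futures.as_completed(sent):
            fut.result()   ## re-raises any exception, as in the loop above

split_threads_demo(['sample1', 'sample2', 'sample3'])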
Example No. 25
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect,
               retrieve_databases):
    """Generate MLST profile identification
	
	This functions uses the `MLSTar software`_ to retrieve Multi locus sequence typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates MLST profiling for each sample. 
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`.
	:param retrieve_databases: 
	
	:type options: 
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type dataFrame_edirect: pandas.DataFrame()
	:type retrieve_databases: pandas.DataFrame()
	
	:return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information.
	:rtype: Dictionary

	
	See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:
	
	.. include:: ../../devel/results/doMLST_result_example.csv
		:literal:
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
		
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`
		
		- :func:`HCGB.sampleParser.files.get_files`
		
		- :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species`
		
	.. include:: ../../links.inc	
	"""
    ## set config
    rscript = set_config.get_exe("Rscript")

    ## TODO: Samples might not be assembled; take this into account and return 0

    ## TODO: Fix and install MLSTar during installation
    ## NOTE: development placeholder; execution currently stops here until
    ## MLSTar installation is handled during setup
    print(MLSTar.get_MLSTar_package_installed())
    exit()

    ########################################################################################

    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed

    ## debug message
    if (Debug):
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## MLST call
    HCGB_aes.boxymcboxface("MLST typing")
    print(
        "+ Generate classical MLST typing for each sample according to the species identified by k-mer (KMA)..."
    )

    ## get assembly files
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    # init
    MLST_results = {}

    ## get MLST_profile: default or provided
    mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] ==
                                               'PubMLST']['path'].tolist()
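    ## each entry in mlst_profile_list is a comma-separated string of the
    ## form "species,folder"; the split(',') calls below unpack it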

    if (Debug):
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)

        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## Generate MLST call according to species identified for each sample
    for index, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'],
                                                     row['species'])

        if (MLSTar_taxa_name == 'NaN'):
            print(
                colored(
                    "\t- No PubMLST profile available for sample [%s] identified as %s %s"
                    % (row['sample'], row['genus'], row['species']), 'yellow'))

        else:
            for mlst_profile in mlst_profile_list:

                ## species folder
                #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder)
                species_mlst = mlst_profile.split(',')[0]
                species_mlst_folder = mlst_profile.split(',')[1]

                ## output file
                output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
                filename_stamp = species_mlst_folder + '/.success_scheme'

                ##
                if MLSTar_taxa_name == species_mlst:
                    if os.path.isfile(filename_stamp):
                        stamp = HCGB_time.read_time_stamp(filename_stamp)
                        print(
                            colored(
                                "\tA previous command generated results on: %s"
                                % stamp, 'yellow'))
                    else:
                        ### get scheme available
                        MLSTar.getPUBMLST(MLSTar_taxa_name, rscript,
                                          output_file)
                        stamp = HCGB_time.print_time_stamp(filename_stamp)

                    ## parse and get scheme for classical MLST: keep the first
                    ## scheme with fewer than 10 loci (classical MLST uses ~7
                    ## house-keeping genes, unlike cg/wgMLST schemes)
                    schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

                    scheme2use = None
                    for item, cluster in schemes_MLST.iterrows():
                        if cluster['len'] < 10:
                            scheme2use = int(cluster['scheme'])
                            break

                    ## skip this profile if no classical scheme is available
                    if scheme2use is None:
                        print(
                            colored(
                                "\t- No classical MLST scheme found for %s" %
                                MLSTar_taxa_name, 'yellow'))
                        continue

                    ###
                    sample = row['sample']
                    MLSTar_folder = HCGB_files.create_subfolder(
                        'MLST', outdir_dict[sample])
                    genome_file = assembly_samples_retrieved.loc[
                        assembly_samples_retrieved['name'] ==
                        sample]['sample'].values[0]

                    ## call MLST
                    (results, profile_folder) = MLSTar.run_MLSTar(
                        species_mlst_folder, rscript, MLSTar_taxa_name,
                        scheme2use, sample, MLSTar_folder, genome_file,
                        options.threads)
                    MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
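The scheme selection inside MLST_ident keeps the first PubMLST scheme with fewer
than 10 loci, which separates classical MLST (typically 7 house-keeping genes)
from cg/wgMLST schemes with hundreds of loci. A standalone sketch of that
selection, assuming the downloaded scheme table has 'scheme' and 'len' columns
as used above:

import pandas as pd

## hypothetical scheme listing as downloaded from PubMLST
schemes_MLST = pd.DataFrame({'scheme': [1, 2], 'len': [7, 1748]})

scheme2use = None
for _, cluster in schemes_MLST.iterrows():
    if cluster['len'] < 10:            ## classical MLST schemes use ~7 loci
        scheme2use = int(cluster['scheme'])
        break                          ## keep the first match
print(scheme2use)                      ## -> 1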
Example No. 26
def check_db_indexed(index_name, folder):
	"""
	Check the index status of a KMA database
	
	:param index_name: Index name for the database
	:param folder: Absolute path of the folder containing the database.
	
	:type index_name: string
	:type folder: string
	
	:returns: True/False for the index status.
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
		
		- :func:`BacterialTyper.scripts.functions.readList_fromFile`
		
		- :func:`BacterialTyper.scripts.functions.get_number_lines`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`

		- :func:`BacterialTyper.scripts.functions.print_time_stamp`
	
	"""
	
	# Each KMA database consists of files with the following suffixes:
	# .comp.b, .length.b, .name and .seq.b (plus an optional .index.b)
	my_index_list = [".comp.b", ".index.b", ".length.b", ".name", ".seq.b"]

	print ("\t+ Checking if database has been previously indexed...")
	for suffix in my_index_list:
		my_file = index_name + suffix
		if os.path.isfile(my_file):
			print ("\t" + my_file + ' exists...')
		else:
			## the .index.b file is optional: tolerate its absence
			if (suffix == '.index.b'):
				continue
			else:
				return(False)
	
	## check if the database was previously generated and succeeded
	filename_stamp = folder + '/.success'
	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tDatabase was generated on: %s" %stamp, 'yellow'))

		## Check if necessary to download again after several months/days
		days_passed = HCGB_time.get_diff_time(filename_stamp)
		print ("\t\t** %s days ago" %days_passed)		
		## download again
		if (days_passed > 60): 
			print ("\t\t** Downloading information again just to be sure...")
			return(False)
	
	## dump in screen
	names = index_name + '.name'
	count = HCGB_main.get_number_lines(names)
	
	print ("\n\t+ Database seems OK and contains several entries (%s):\n" %count)
	if (count > 50):
		print ("\tToo many entries in the database.\n\tCheck file %s for further details." %names)
	else:
		entries = HCGB_main.readList_fromFile(names)
		print (*entries, sep='\n')

	return(True)
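Besides checking the index files, check_db_indexed forces a re-download when the
stamp is older than 60 days. A dependency-free sketch of that staleness test,
using the stamp file's modification time as an approximation of the parsed
HCGB_time stamp:

import os, time

def is_stale(stamp_file, max_days=60):
    """Return True if stamp_file is missing or older than max_days."""
    if not os.path.isfile(stamp_file):
        return True
    age_days = (time.time() - os.path.getmtime(stamp_file)) / 86400
    print("\t\t** %.0f days ago" % age_days)
    return age_days > max_days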
Example No. 27
def parse_information(arg_dict, df_accID, outdir):

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input",
                                                   arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse",
                                                   arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample],
                          'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'],
                          'yellow')
            debug_message('genome: ' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        ## re-run parsing unless both the input and parse steps succeeded before
        if (not HCGB_files.is_non_zero_file(parse_timestamp)
                or not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):

                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(
                    df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(
                    folder_input, 'info.csv'),
                                              index=True,
                                              header=True)

                ## stamp both folders as completed
                HCGB_time.print_time_stamp(input_timestamp)
                HCGB_time.print_time_stamp(parse_timestamp)
            else:
                print(
                    colored(
                        "\t+ An error occurred for sample %s while parsing the input options"
                        % sample, 'red'))

                ## stamp both folders as failed
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
                HCGB_time.print_time_stamp(
                    os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(
                colored(
                    "\t+ Input parsing already available for sample %s [%s]" %
                    (sample, read_time), 'green'))
            print()
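When parsing succeeds, parse_information links the annotation file into the
sample's input folder via HCGB_files.get_symbolic_link_file. A sketch of what
such a link-or-copy helper may look like (its actual behavior is an assumption):

import os, shutil

def link_or_copy(src, dest_folder):
    """Symlink src into dest_folder, falling back to a copy if unsupported."""
    dest = os.path.join(dest_folder, os.path.basename(src))
    if not os.path.exists(dest):
        try:
            os.symlink(os.path.abspath(src), dest)
        except OSError:
            shutil.copy(src, dest)
    return dest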
Example No. 28
def run_search(arg_dict):
    """Main function of the search module in BacDup package.
    
    This module searches and create gene duplication analysis. 
    
    It allows the user to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy or user
    annotation data) or a single or multiple samples.    
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files + reference FASTA files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: **** 
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search",
        arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse",
        arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample],
                                       '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample],
                                      'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(
                annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample],
                                            '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(
                    filter_timestamp)):
                ## save filtered results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample],
                                 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(
                    colored(
                        "\t+ Filter results already available for sample %s [%s]"
                        % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample],
                                         'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(
                colored(
                    "\t+ Duplicate annotation already available for sample %s [%s]"
                    % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',',
                                              "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',',
                                             "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data (sum of per-sequence lengths in length_df.csv)
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data (DataFrame.append was removed in pandas 2.0; use concat)
        data2add = pd.concat([data2add, data2add_entry], ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicate search")
    outdir_report = HCGB.functions.files_functions.create_subfolder(
        "report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder(
        "dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
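dup_searcher.filter_data receives identity, e-value, coverage percentage and
bit-score cutoffs. A hedged sketch of that kind of BLAST tabular filtering with
pandas (column names are assumptions; the real implementation may differ):

import pandas as pd

def filter_blast_hits(df, pident=85.0, evalue=1e-5, percentage=75.0, bitscore=50.0):
    """Keep non-self BLAST hits passing all thresholds (columns assumed)."""
    df = df[df['qseqid'] != df['sseqid']]                  ## drop self hits
    df = df[(df['pident'] >= pident) &
            (df['evalue'] <= evalue) &
            (df['bitscore'] >= bitscore)]
    ## coverage: aligned length relative to query length, in percent
    df = df[100.0 * df['length'] / df['qlen'] >= percentage]
    return df.sort_values(['qseqid', 'bitscore'], ascending=[True, False])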
Example No. 29
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
	Executes KMA identification jobs
	
	This function automates the process of checking whether any previous run succeeded
	or runs the appropriate identification process for the sample and database provided.
	
	:param outdir_file:
	:param list_files:
	:param name:
	:param database:
	:param threads:
	:param Debug:
	
	:type outdir_file:
	:type list_files:
	:type name:
	:type database:
	:type threads:
	:type Debug:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`
	
		- :func:`BacterialTyper.modules.ident.get_outfile`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
		
		
	"""

    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile
    outfile = get_outfile(outdir_dict_kma, name, database)

    ## check if previously run and succeeded
    basename_tag = os.path.basename(outfile)
    filename_stamp = outdir_dict_kma + '/.success_' + basename_tag

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:
        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: species_identification_KMA.kma_ident_module call**",
                    'yellow'))
            print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
            print("outfile: ", outfile)
            print(
                "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
            )
            print("species_identification_KMA.kma_ident_module" + "\t" +
                  outfile + "\t" + str(list_files) + "\t" + name + "\t" +
                  database + "\t" + str(threads) + "\n")

        ## Sparse or not
        #if any(name in basename_tag for name in ['userData_KMA', 'genbank_KMA']):
        #if (basename_tag == 'userData_KMA'):
        #    option = ''
        #else:
        #    option = '-Sparse '

        ## Add option to retrieve the database from shared memory
        option = '-shm 1'

        # Call KMA
        species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                                  database, kma_bin, option,
                                                  threads)
        stamp = HCGB_time.print_time_stamp(filename_stamp)
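For reference, kma_ident_call presumably assembles a KMA command line like the
sketch below (standard KMA flags: -ipe for paired-end reads, -i for single or
interleaved input, -t_db for the indexed database, -t for threads; '-shm 1'
matches the option set above and expects the database preloaded in shared
memory with 'kma shm'):

import subprocess

def kma_ident_sketch(outfile, list_files, database, kma_bin, option, threads):
    """Assemble and run a KMA call (a sketch; kma_ident_call may differ)."""
    cmd = [kma_bin]
    if len(list_files) == 2:
        cmd += ['-ipe'] + list_files       ## paired-end reads
    else:
        cmd += ['-i'] + list_files         ## single-end or interleaved
    cmd += ['-o', outfile, '-t_db', database, '-t', str(threads)]
    cmd += option.split()                  ## e.g. '-shm 1'
    return subprocess.run(cmd, check=False).returncode == 0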