def run_mpwt():
    """
    Function used with a mpwt call in the terminal.
    """
    parser = argparse.ArgumentParser(
        'mpwt',
        description='For specific help on each subcommand use: mpwt --help',
        allow_abbrev=False,
    )
    parser.add_argument(
        '-f',
        dest='input',
        required=False,
        help='Working folder containing sub-folders with Genbank/GFF/PF files.',
        metavar='INPUT_DIR')
    parser.add_argument(
        '-o',
        dest='output',
        required=False,
        help='Output folder path. Will create an output folder in this folder.',
        metavar='OUTPUT_DIR')
    parser.add_argument(
        '--patho',
        dest='patho',
        help='Will run an inference of PathoLogic on the input files.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--hf',
        dest='hf',
        help='Use with --patho. Run the Hole Filler using Blast.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--op',
        dest='op',
        help='Use with --patho. Run the Operon predictor of Pathway-Tools.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--tp',
        dest='tp',
        help='Use with --patho. Run the Transport Inference Parser of Pathway-Tools.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--nc',
        dest='nc',
        help='Use with --patho. Turn off loading of Pubmed entries.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '-p',
        dest='p',
        help='Use with --patho. Modify the PathoLogic pathway prediction score. Must be a float between 0 and 1.',
        required=False,
    )
    parser.add_argument(
        '--flat',
        dest='flat',
        help='Will create BioPAX/attribute-value flat files from PGDB.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--md',
        dest='md',
        help='Move the dat files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mx',
        dest='mx',
        help='Move the metabolic-reactions.xml file into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mo',
        dest='mo',
        help='Move owl files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mc',
        dest='mc',
        help='Move tabular files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--clean',
        dest='clean',
        help='Clean the ptools-local folder, before any other operations.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--delete',
        dest='delete',
        help="Give a PGDB name and mpwt will delete it (if multiple, separate them with a ',', example: ecolicyc,athalianacyc).",
        required=False,
    )
    parser.add_argument(
        '-r',
        dest='r',
        help="Will delete files in ptools-local and compress results files to reduce results size (use it with -o).",
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--log',
        dest='log',
        help="Create PathoLogic log files inside the given folder (use it with --patho).",
        required=False,
    )
    parser.add_argument(
        '--list',
        dest='list',
        help="List all PGDBs inside the ptools-local folder.",
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--taxon-file',
        dest='taxon_file',
        help="For the use of the taxon_id.tsv file to find the taxon ID.",
        required=False,
    )
    parser.add_argument(
        '-v',
        dest='verbose',
        help="Verbose.",
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        'topf',
        help="Will convert Genbank and/or GFF files into PathoLogic Format files.",
        nargs='?',
    )
    parser.add_argument(
        '--version',
        dest='version',
        action='version',
        default=False,
        version='%(prog)s ' + VERSION + '\n' + LICENSE)
    parser.add_argument(
        '--cpu',
        help='Number of CPUs to use for the multiprocessing [default: 1].',
        required=False,
        type=int,
        default=1)
    parser.add_argument(
        '--permission',
        dest='permission',
        help="Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user).",
        required=False,
    )

    args = parser.parse_args()

    input_folder = args.input
    output_folder = args.output
    patho_inference = args.patho
    patho_hole_filler = args.hf
    patho_operon_predictor = args.op
    patho_transporter_inference = args.tp
    no_download_articles = args.nc
    flat_creation = args.flat
    move_dat = args.md
    move_xml = args.mx
    move_owl = args.mo
    move_col = args.mc
    size_reduction = args.r
    number_cpu = args.cpu
    patho_log = args.log
    clean_arg = args.clean
    pgdb_to_deletes = args.delete
    pgdb_list = args.list
    taxon_file = args.taxon_file
    pathway_score = args.p
    verbose = args.verbose
    topf = args.topf
    version = args.version
    permission = args.permission

    # If no argument is given, print the help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    if version:
        print('Mpwt v' + VERSION + '\n' + LICENSE)
        sys.exit()

    if verbose:
        logging.getLogger('mpwt').setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)

    if pgdb_list:
        pgdbs = utils.list_pgdb()
        if pgdbs == []:
            logger.critical('No PGDB inside ptools-local.')
        else:
            logger.critical(str(len(pgdbs)) + ' PGDB inside ptools-local:\n' + '\t'.join(pgdbs))
        sys.exit()

    # Delete PGDBs if the --delete argument is used.
    # Use a set to remove redundant PGDBs.
    if pgdb_to_deletes:
        utils.remove_pgdbs(list(set(pgdb_to_deletes.split(','))), number_cpu)
        sys.exit()

    if clean_arg:
        if verbose:
            logger.info('~~~~~~~~~~Remove local PGDB~~~~~~~~~~')
        if input_folder:
            utils.cleaning_input(input_folder, verbose)
            input_pgdb_to_deletes = [
                species.lower() + 'cyc' for species in os.listdir(input_folder)
                if not species.startswith('.') and species != 'taxon_id.tsv'
            ]
            utils.remove_pgdbs(input_pgdb_to_deletes, number_cpu)
        else:
            utils.cleaning(number_cpu, verbose)
        if not patho_inference and not flat_creation and not move_dat and not output_folder:
            sys.exit()

    if topf is not None:
        if topf == 'topf':
            if input_folder and output_folder:
                to_pathologic.create_pathologic_file(input_folder, output_folder, number_cpu)
                sys.exit()
            else:
                sys.exit('topf argument needs the input_folder (-f) and output_folder (-o) options.')
        else:
            sys.exit(f'Wrong positional argument passed: {topf}, only "topf" is expected as a positional argument.')

    multiprocess_pwt(input_folder=input_folder,
                     output_folder=output_folder,
                     patho_inference=patho_inference,
                     patho_hole_filler=patho_hole_filler,
                     patho_operon_predictor=patho_operon_predictor,
                     patho_transporter_inference=patho_transporter_inference,
                     no_download_articles=no_download_articles,
                     flat_creation=flat_creation,
                     dat_extraction=move_dat,
                     xml_extraction=move_xml,
                     owl_extraction=move_owl,
                     col_extraction=move_col,
                     size_reduction=size_reduction,
                     number_cpu=number_cpu,
                     patho_log=patho_log,
                     pathway_score=pathway_score,
                     taxon_file=taxon_file,
                     verbose=verbose,
                     permission=permission)
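# A minimal command-line sketch of run_mpwt (folder names are illustrative;
# assumes Pathway Tools and blastp are installed and on the PATH, and that
# 'input_folder/' holds one sub-folder per species with a matching GenBank/GFF/PF file):
#
#     mpwt -f input_folder -o output_folder --patho --hf --flat --md --cpu 2 -v
#
# Per the help strings above, this runs PathoLogic with the Hole Filler on every
# sub-folder, creates the BioPAX/attribute-value flat files and moves the .dat
# files into output_folder, using 2 CPUs.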
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean, use_pwt_xml):
    """Run Pathway Tools on each genome of the repository

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local corresponding to the input data
        use_pwt_xml (bool): use Pathway Tools XML instead of creating them with padmet

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = os.path.join(output_dir, 'pgdb')
    log_dir = os.path.join(output_dir, 'pgdb_log')
    ncbirc_path = os.path.join(os.path.expanduser('~'), '.ncbirc')
    log_path = os.path.join(log_dir, 'log_error.txt')

    if not utils.is_valid_dir(pgdb_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    if not utils.check_program('pathway-tools'):
        logger.critical(
            'Pathway Tools is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.check_program('blastp'):
        logger.critical(
            'blastp is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.is_valid_file(ncbirc_path):
        logger.critical(
            f'No {ncbirc_path} file, please fix it before using the program'
        )
        sys.exit(1)

    genomes_pgdbs = [genome_dir.lower() + 'cyc' for genome_dir in os.listdir(genomes_dir)]
    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes and not --clean, pursue without running ptools again.
    pgdb_dirs = [species_pgdb_dir.lower() + 'cyc' for species_pgdb_dir in os.listdir(pgdb_dir)]
    if set(pgdb_dirs) == set(genomes_pgdbs):
        logger.warning("PGDBs are already created and will be used. To overwrite them, run m2m with the --clean option")
        return pgdb_dir

    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]):
        taxon_file = True

    if use_pwt_xml:
        move_dat = False
        move_xml = True
    else:
        move_dat = True
        move_xml = False

    multiprocess_pwt(genomes_dir, pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     flat_creation=True,
                     dat_extraction=move_dat,
                     xml_extraction=move_xml,
                     owl_extraction=False,
                     col_extraction=False,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    nb_genomes_dir = len([folder for folder in os.listdir(genomes_dir) if os.path.isdir(os.path.join(genomes_dir, folder))])
    if use_pwt_xml:
        nb_pgdb_dir = len([folder for folder in os.listdir(pgdb_dir) if os.path.isfile(os.path.join(pgdb_dir, folder))])
    else:
        nb_pgdb_dir = len([folder for folder in os.listdir(pgdb_dir) if os.path.isdir(os.path.join(pgdb_dir, folder))])

    if nb_pgdb_dir != nb_genomes_dir:
        if os.path.exists(log_path):
            logger.critical("Something went wrong running Pathway Tools. See the log file in " + log_path)
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return pgdb_dir
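# A minimal call sketch for the function above (hypothetical paths; assumes the
# helpers it uses, e.g. multiprocess_pwt, remove_pgdbs and cleaning_input, are
# importable, for instance from the mpwt package):
#
#     pgdb_dir = genomes_to_pgdb('genomes/', 'output/', cpu=4, clean=False,
#                                use_pwt_xml=False)
#
# With use_pwt_xml=False the PGDB .dat flat files are extracted into
# output/pgdb/; with use_pwt_xml=True the metabolic-reactions.xml files are
# extracted there instead.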
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean):
    """Run Pathway Tools on each genome of the repository

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local corresponding to the input data

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = output_dir + "/pgdb"
    log_dir = output_dir + "/pgdb_log"

    if not utils.is_valid_dir(pgdb_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    if not utils.check_program("pathway-tools"):
        logger.critical(
            "Pathway Tools is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.check_program("blastp"):
        logger.critical(
            "blastp is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.is_valid_file(os.path.expanduser("~") + "/.ncbirc"):
        logger.critical(
            "No ~/.ncbirc file, please fix it before using the program")
        sys.exit(1)

    genomes_pgdbs = [
        genome_dir.lower() + 'cyc' for genome_dir in os.listdir(genomes_dir)
    ]
    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes and not --clean, pursue without running ptools again.
    pgdb_dirs = [
        species_pgdb_dir.lower() + 'cyc' for species_pgdb_dir in os.listdir(pgdb_dir)
    ]
    if set(pgdb_dirs) == set(genomes_pgdbs):
        logger.warning(
            "PGDBs are already created and will be used. To overwrite them, run m2m with the --clean option"
        )
        return pgdb_dir

    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]):
        taxon_file = True

    multiprocess_pwt(genomes_dir, pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     dat_creation=True,
                     dat_extraction=True,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    if len(os.listdir(pgdb_dir)) != len(os.listdir(genomes_dir)):
        if os.path.exists(log_dir + "/log_error.txt"):
            logger.critical(
                "Something went wrong running Pathway Tools. See the log file in "
                + log_dir + "/log_error.txt")
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return pgdb_dir
def check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_cpu_to_use):
    """ Check input structure and data in output folder and ptools-local.

    Args:
        run_ids (list): species IDs (folder and GBK/GFF file name)
        input_folder (str): pathname to the input folder
        output_folder (str): pathname to the output folder
        number_cpu_to_use (int): number of CPUs to use for multiprocessing

    Returns:
        list: input IDs for PathoLogic and BioPAX/flat files creation
        list: input IDs for BioPAX/flat files creation
    """
    # Check if there are files/folders inside the input folder.
    # Do not use hidden folders/files (beginning with '.').
    species_folders = [
        species_folder for species_folder in os.listdir(input_folder)
        if not species_folder.startswith('.')
    ]
    if len(species_folders) == 0:
        logger.critical(
            "No folder containing a Genbank/GFF file. In {0} you must have sub-folders containing a Genbank/GFF file."
            .format(input_folder))
        return None, None

    # Remove the PathoLogic taxon ID file.
    if 'taxon_id.tsv' in species_folders:
        species_folders.remove('taxon_id.tsv')

    # Check if there is a Genbank, a GFF or a PathoLogic file inside each sub-folder.
    check_species_folders = []
    for species_folder in species_folders:
        species_input_files = []
        species_folder_path = os.path.join(input_folder, species_folder)
        for species_file in os.listdir(species_folder_path):
            species_filename, species_file_extension = os.path.splitext(species_file)
            if species_file_extension in ['.gbk', '.gbff', '.gff']:
                if species_filename == species_folder:
                    check_species_folders.append(species_folder)
                    species_input_files.append(species_file_extension)
            if any(input_extension in species_file for input_extension in ['.pf']):
                check_species_folders.append(species_folder)
                species_input_files.append(species_file_extension)

        species_input_files = list(set(species_input_files))
        if len(species_input_files) > 1:
            logger.critical(
                'Multiple input files for {0}, there must be only one type of file among: a GenBank file, a GFF file or multiple PF files.'
                .format(species_folder))
            return None, None
        elif len(species_input_files) == 0:
            logger.critical(
                'Missing input file for {0}. A GenBank file, a GFF file or multiple PF files are required.'
                .format(species_folder))
            return None, None

    check_species_folders = list(set(check_species_folders))

    missing_input_files = list(set(run_ids) - set(check_species_folders))
    if len(check_species_folders) == 0:
        logger.critical(
            'Missing Genbank/GFF/PF file for: {0} \nCheck for input files (.gbk/.gbff/.gff/.pf)'
            .format(','.join(missing_input_files)))
        return None, None

    # Check the structure of the input folder.
    invalid_characters = ['.', '/']
    for species_folder in check_species_folders:
        species_folder_path = os.path.join(input_folder, species_folder)
        if os.path.isfile(species_folder_path):
            logger.critical(
                'Error: file inside the input folder ({0}) instead of a sub-folder. Check that your input follows the structure input_folder/species_1/species_1.gbk and not input_folder/species_1.gbk.'
                .format(species_folder_path))
            return None, None
        elif os.path.isdir(species_folder_path):
            if any(char in invalid_characters for char in species_folder):
                logger.critical(
                    'Error: "." or "/" in Genbank/GFF name {0}.\nThe Genbank name is used as an ID in Pathway Tools and Pathway Tools does not create a PGDB with "." in its ID.'
                    .format(species_folder))
                return None, None

    # Take run_ids, remove folders with errors (intersection with check_species_folders)
    # and remove IDs with an already present output.
    clean_run_ids = set(run_ids).intersection(set(check_species_folders))

    if output_folder:
        if os.path.exists(output_folder):
            if os.path.isdir(output_folder):
                already_present_outputs = [
                    output_pgdb for output_pgdb in os.listdir(output_folder)
                ]
                new_run_ids = clean_run_ids - set(already_present_outputs)
                new_run_ids = list(new_run_ids)
                for pgdb in already_present_outputs:
                    if pgdb in clean_run_ids:
                        logger.warning(
                            "! PGDB {0} already in output folder {1}, no inference will be launched on this species."
                            .format(pgdb, output_folder))
                if len(new_run_ids) == 0:
                    logger.info(
                        "All PGDBs are already present in the output folder. Remove them if you want a new inference."
                    )
                    return None, None
            else:
                logger.info(output_folder + " is not a valid output folder.")
                return None, None
        else:
            new_run_ids = list(clean_run_ids)
    else:
        new_run_ids = list(clean_run_ids)

    # Check for PGDBs in ptools-local to see if some are already present but have not been exported.
    already_present_pgdbs = [
        pgdb_species_folder[:-3] for pgdb_species_folder in utils.list_pgdb()
    ]

    # Check the already finished PGDBs.
    if already_present_pgdbs:
        pathologic_builds = compare_input_ids_to_ptools_ids(
            new_run_ids, already_present_pgdbs, 'intersection')

        # Check for unfinished builds of PGDBs using their pathologic.log file.
        logger.info("Check and delete unfinished builds of Pathway Tools.")
        unfinished_builds = []
        finished_builds = []
        for pathologic_build in pathologic_builds:
            pathologic_build_lower = pathologic_build.lower()
            pathologic_file = os.path.join(*[input_folder, pathologic_build, 'pathologic.log'])
            if os.path.exists(pathologic_file):
                with open(pathologic_file, 'r') as pathologic_log:
                    pathologic_string = pathologic_log.read()
                    if 'Done' in pathologic_string:
                        finished_builds.append(pathologic_build_lower)
                    else:
                        unfinished_builds.append(pathologic_build_lower)

        # Delete the unfinished PGDBs.
        if unfinished_builds:
            utils.remove_pgdbs([
                unfinished_build + 'cyc' for unfinished_build in unfinished_builds
            ], number_cpu_to_use)
            already_present_pgdbs = list(set(already_present_pgdbs) - set(unfinished_builds))

        run_patho_flat_ids = compare_input_ids_to_ptools_ids(
            new_run_ids, already_present_pgdbs, 'difference')
        run_flat_ids = compare_input_ids_to_ptools_ids(
            new_run_ids, already_present_pgdbs, 'intersection')
        for run_flat_id in run_flat_ids:
            logger.info(
                "! PGDB {0} already in ptools-local, no PathoLogic inference will be launched on this species."
                .format(run_flat_id))
        return run_patho_flat_ids, run_flat_ids

    return new_run_ids, None
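# A minimal call sketch for the checker above (hypothetical folder names; the run
# IDs are the sub-folder names of the input folder):
#
#     run_ids = [folder for folder in os.listdir('input_folder')
#                if not folder.startswith('.') and folder != 'taxon_id.tsv']
#     patho_ids, flat_ids = check_input_and_existing_pgdb(run_ids, 'input_folder',
#                                                         'output_folder', 2)
#
# patho_ids lists species that still need a PathoLogic run; flat_ids lists species
# whose PGDB is already in ptools-local and only needs BioPAX/flat file creation.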