Exemplo n.º 1
0
def run_mpwt():
    """Command-line entry point used when ``mpwt`` is called in a terminal.

    Builds the argument parser, parses ``sys.argv`` and dispatches, in this
    order:

    1. no argument at all: print the help and exit with status 1;
    2. ``--list``: log the PGDBs returned by ``utils.list_pgdb()`` and exit;
    3. ``--delete``: call ``utils.remove_pgdbs`` on the comma-separated
       names and exit;
    4. ``--clean``: call the ``utils`` cleaning helpers, then exit unless an
       inference, flat file creation, dat move or output folder was asked;
    5. ``topf`` positional: call ``to_pathologic.create_pathologic_file``
       (needs ``-f`` and ``-o``) and exit;
    6. otherwise: call ``multiprocess_pwt`` with the parsed options.

    ``--version`` is handled by argparse itself (``action='version'``): it
    prints the version string and exits while parsing, so no explicit
    handling is needed afterwards.
    """
    parser = argparse.ArgumentParser(
        'mpwt',
        description='For specific help on each subcommand use: mpwt --help',
        allow_abbrev=False,
    )

    # Input / output folders.
    parser.add_argument(
        '-f',
        dest='input',
        required=False,
        help='Working folder containing sub-folders with Genbank/GFF/PF files.',
        metavar='INPUT_DIR')

    parser.add_argument(
        '-o',
        dest='output',
        required=False,
        help='Output folder path. Will create a output folder in this folder.',
        metavar='OUPUT_DIR')

    # PathoLogic inference and its options.
    parser.add_argument(
        '--patho',
        dest='patho',
        help='Will run an inference of Pathologic on the input files.',
        required=False,
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--hf',
        dest='hf',
        help='Use with --patho. Run the Hole Filler using Blast.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--op',
        dest='op',
        help='Use with --patho. Run the Operon predictor of Pathway-Tools.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--tp',
        dest='tp',
        help=
        'Use with --patho. Run the Transport Inference Parser of Pathway-Tools.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--nc',
        dest='nc',
        help='Use with --patho. Turn off loading of Pubmed entries.',
        required=False,
        action='store_true',
        default=False,
    )
    # No ``type=`` here: the raw string is forwarded unchanged to
    # multiprocess_pwt.
    parser.add_argument(
        '-p',
        dest='p',
        help=
        'Use with --patho. Modify PathoLogic pathway prediction score. Must be a float between 0 and 1.',
        required=False,
    )

    # Flat file creation and extraction of result files.
    parser.add_argument(
        '--flat',
        dest='flat',
        help='Will create BioPAX/attribute-value flat files from PGDB.',
        required=False,
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--md',
        dest='md',
        help='Move the dat files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mx',
        dest='mx',
        help='Move the metabolic-reactions.xml file into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mo',
        dest='mo',
        help='Move owl files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mc',
        dest='mc',
        help='Move tabular files into the output folder.',
        required=False,
        action='store_true',
        default=False,
    )

    # Maintenance of the ptools-local folder.
    parser.add_argument(
        '--clean',
        dest='clean',
        help='Clean ptools-local folder, before any other operations.',
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--delete',
        dest='delete',
        help=
        "Give a PGDB name and mpwt will delete it (if multiple separe them with a ',', example: ecolicyc,athalianacyc).",
        required=False,
    )
    parser.add_argument(
        '-r',
        dest='r',
        help=
        "Will delete files in ptools-local and compress results files to reduce results size (use it with -o).",
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--log',
        dest='log',
        help=
        "Create PathoLogic log files inside the given folder (use it with --patho).",
        required=False,
    )
    parser.add_argument(
        '--list',
        dest='list',
        help="List all PGDBs inside the ptools-local folder.",
        required=False,
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--taxon-file',
        dest='taxon_file',
        help="For the use of the taxon_id.tsv file to find the taxon ID.",
        required=False,
    )
    parser.add_argument(
        '-v',
        dest='verbose',
        help="Verbose.",
        required=False,
        action='store_true',
        default=False,
    )
    # Optional positional: only the literal value "topf" is accepted (checked
    # after parsing so that a dedicated error message can be produced).
    parser.add_argument(
        'topf',
        help=
        "Will convert Genbank and/or GFF files into PathoLogic Format file.",
        nargs='?',
    )

    # argparse prints the version and exits as soon as it meets --version.
    parser.add_argument('--version',
                        dest='version',
                        action='version',
                        default=False,
                        version='%(prog)s ' + VERSION + '\n' + LICENSE)

    parser.add_argument(
        '--cpu',
        help=
        'Number of cpu to use for the multiprocessing (default=1). [default: 1]',
        required=False,
        type=int,
        default=1)
    parser.add_argument(
        '--permission',
        dest='permission',
        help=
        "Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user).",
        required=False,
    )

    # If no argument print the help. Nothing is required by the parser, so
    # this can be decided before parsing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    input_folder = args.input
    output_folder = args.output
    patho_inference = args.patho
    flat_creation = args.flat
    move_dat = args.md
    number_cpu = args.cpu
    verbose = args.verbose
    topf = args.topf

    if verbose:
        logging.getLogger('mpwt').setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)

    # --list: report the PGDBs and stop.
    if args.list:
        pgdbs = utils.list_pgdb()
        if not pgdbs:
            logger.critical('No PGDB inside ptools-local.')
        else:
            logger.critical(
                str(len(pgdbs)) + ' PGDB inside ptools-local:\n' +
                '\t'.join(pgdbs))
        sys.exit()

    # Delete PGDB if use of --delete argument.
    # Use a set to remove redundant PGDB.
    if args.delete:
        utils.remove_pgdbs(list(set(args.delete.split(','))), number_cpu)
        sys.exit()

    if args.clean:
        if verbose:
            logger.info('~~~~~~~~~~Remove local PGDB~~~~~~~~~~')

        if input_folder:
            # Only clean what corresponds to the species of the input folder.
            utils.cleaning_input(input_folder, verbose)
            input_pgdb_to_deletes = [
                species.lower() + 'cyc' for species in os.listdir(input_folder)
                if not species.startswith('.') and species != 'taxon_id.tsv'
            ]
            utils.remove_pgdbs(input_pgdb_to_deletes, number_cpu)
        else:
            utils.cleaning(number_cpu, verbose)

        # Cleaning was the only requested operation: stop here.
        if not (patho_inference or flat_creation or move_dat
                or output_folder):
            sys.exit()

    if topf is not None:
        if topf != 'topf':
            sys.exit(
                f'Wrong positional argument passed: {topf}, only "topf" is expected as a postional argument.'
            )
        if not (input_folder and output_folder):
            sys.exit(
                'topf argument needs input_folder (-f) and output_folder options (-o).'
            )
        to_pathologic.create_pathologic_file(input_folder, output_folder,
                                             number_cpu)
        sys.exit()

    multiprocess_pwt(input_folder=input_folder,
                     output_folder=output_folder,
                     patho_inference=patho_inference,
                     patho_hole_filler=args.hf,
                     patho_operon_predictor=args.op,
                     patho_transporter_inference=args.tp,
                     no_download_articles=args.nc,
                     flat_creation=flat_creation,
                     dat_extraction=move_dat,
                     xml_extraction=args.mx,
                     owl_extraction=args.mo,
                     col_extraction=args.mc,
                     size_reduction=args.r,
                     number_cpu=number_cpu,
                     patho_log=args.log,
                     pathway_score=args.p,
                     taxon_file=args.taxon_file,
                     verbose=verbose,
                     permission=args.permission)
Exemplo n.º 2
0
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean, use_pwt_xml):
    """Run Pathway Tools on each genome of the repository.

    The function exits the process with status 1 (``sys.exit(1)``) when the
    genomes directory is missing, the output directory cannot be used,
    ``pathway-tools`` or ``blastp`` are not found, ``~/.ncbirc`` is missing,
    or the number of produced outputs differs from the number of genome
    sub-folders.

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local corresponding to the input data
        use_pwt_xml (bool): use Pathway Tools XML instead of creating them with padmet

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = os.path.join(output_dir, 'pgdb')
    log_dir = os.path.join(output_dir, 'pgdb_log')
    ncbirc_path = os.path.join(os.path.expanduser('~'), '.ncbirc')
    log_path = os.path.join(log_dir, 'log_error.txt')

    if not utils.is_valid_dir(pgdb_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    if not utils.check_program('pathway-tools'):
        logger.critical(
            'Pathway Tools is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.check_program("blastp"):
        logger.critical(
            'blastp is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)

    if not utils.is_valid_file(ncbirc_path):
        logger.critical(
            f'No {ncbirc_path} file, please fix it before using the program'
        )
        sys.exit(1)

    # Hidden entries and the taxon file are not species: leaving them in
    # would create bogus PGDB IDs such as 'taxon_id.tsvcyc' and make the
    # "already created" comparison below fail whenever a taxon file is used.
    genomes_pgdbs = [
        entry.lower() + 'cyc' for entry in os.listdir(genomes_dir)
        if not entry.startswith('.') and entry != 'taxon_id.tsv'
    ]
    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes, pursue without running ptools again.
    # NOTE(review): this shortcut is also taken when clean is True, although
    # the warning below advertises --clean as the way to rebuild — confirm
    # the intended behaviour.
    existing_outputs = [
        entry for entry in os.listdir(pgdb_dir) if not entry.startswith('.')
    ]
    if use_pwt_xml:
        # In XML mode the outputs are files (see the count at the end of the
        # function), so drop their extension before comparing.
        # NOTE(review): assumes they are named '<species>.xml' — confirm.
        existing_outputs = [
            os.path.splitext(entry)[0] for entry in existing_outputs
        ]
    existing_pgdbs = {entry.lower() + 'cyc' for entry in existing_outputs}
    # Require at least one genome so that "nothing in, nothing out" is not
    # mistaken for "everything already built".
    if genomes_pgdbs and existing_pgdbs == set(genomes_pgdbs):
        logger.warning("PGDBs are already created and will be used. To overrun them, run m2m with --clean option")
        return pgdb_dir

    # taxon_file is a flag (True/None), not a path.
    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]):
        taxon_file = True

    # Exactly one of the two extractions is requested.
    move_xml = bool(use_pwt_xml)
    move_dat = not move_xml

    multiprocess_pwt(genomes_dir, pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     flat_creation=True,
                     dat_extraction=move_dat,
                     xml_extraction=move_xml,
                     owl_extraction=False,
                     col_extraction=False,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    # One output is expected per genome sub-folder: files in XML mode,
    # directories otherwise.
    nb_genomes_dir = sum(
        1 for entry in os.listdir(genomes_dir)
        if os.path.isdir(os.path.join(genomes_dir, entry)))
    is_output = os.path.isfile if use_pwt_xml else os.path.isdir
    nb_pgdb_dir = sum(
        1 for entry in os.listdir(pgdb_dir)
        if is_output(os.path.join(pgdb_dir, entry)))

    if nb_pgdb_dir != nb_genomes_dir:
        if os.path.exists(log_path):
            logger.critical("Something went wrong running Pathway Tools. See the log file in " + log_path)
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return pgdb_dir
Exemplo n.º 3
0
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean):
    """Run Pathway Tools on each genome of the repository.

    The function exits the process with status 1 (``sys.exit(1)``) when the
    genomes directory is missing, the output directory cannot be used,
    ``pathway-tools`` or ``blastp`` are not found, ``~/.ncbirc`` is missing,
    or the number of PGDB folders produced differs from the number of genome
    sub-folders.

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local corresponding to the input data

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = os.path.join(output_dir, "pgdb")
    log_dir = os.path.join(output_dir, "pgdb_log")
    log_path = os.path.join(log_dir, "log_error.txt")
    if not utils.is_valid_dir(pgdb_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    if not utils.check_program("pathway-tools"):
        logger.critical(
            "Pathway Tools is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.check_program("blastp"):
        logger.critical(
            "blastp is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.is_valid_file(
            os.path.join(os.path.expanduser("~"), ".ncbirc")):
        logger.critical(
            "No ~/.ncbirc file, please fix it before using the program")
        sys.exit(1)

    # Hidden entries and the taxon file are not species: leaving them in
    # would create bogus PGDB IDs such as 'taxon_id.tsvcyc' and make the
    # "already created" comparison below fail whenever a taxon file is used.
    genomes_pgdbs = [
        entry.lower() + 'cyc' for entry in os.listdir(genomes_dir)
        if not entry.startswith('.') and entry != 'taxon_id.tsv'
    ]
    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes, pursue without running ptools again.
    # NOTE(review): this shortcut is also taken when clean is True, although
    # the warning below advertises --clean as the way to rebuild — confirm
    # the intended behaviour.
    existing_pgdbs = {
        entry.lower() + 'cyc' for entry in os.listdir(pgdb_dir)
        if not entry.startswith('.')
    }
    # Require at least one genome so that "nothing in, nothing out" is not
    # mistaken for "everything already built".
    if genomes_pgdbs and existing_pgdbs == set(genomes_pgdbs):
        logger.warning(
            "PGDBs are already created and will be used. To overrun them, run m2m with --clean option"
        )
        return pgdb_dir

    # taxon_file is a flag (True/None), not a path.
    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]):
        taxon_file = True

    multiprocess_pwt(genomes_dir,
                     pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     dat_creation=True,
                     dat_extraction=True,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    # Compare sub-folders only: counting every entry would report a failure
    # as soon as genomes_dir holds taxon_id.tsv or a hidden file.
    nb_genomes_dir = sum(
        1 for entry in os.listdir(genomes_dir)
        if os.path.isdir(os.path.join(genomes_dir, entry)))
    nb_pgdb_dir = sum(
        1 for entry in os.listdir(pgdb_dir)
        if os.path.isdir(os.path.join(pgdb_dir, entry)))

    if nb_pgdb_dir != nb_genomes_dir:
        if os.path.exists(log_path):
            logger.critical(
                "Something went wrong running Pathway Tools. See the log file in "
                + log_path)
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return pgdb_dir
Exemplo n.º 4
0
def check_input_and_existing_pgdb(run_ids, input_folder, output_folder,
                                  number_cpu_to_use):
    """ Check input structure and data in output folder and ptools-local.

    Every validation failure is logged and reported to the caller by
    returning ``(None, None)``.

    Args:
        run_ids (list): species IDs (folder and GBK/GFF file name)
        input_folder (str): pathname to the input folder
        output_folder (str): pathname to the output folder
        number_cpu_to_use (int): number of CPU to use for multiprocessing
    Returns:
        list: input IDs for PathoLogic and BioPAX/flat files creation
        list: input IDs for BioPAX/flat files creation (None when
            ``utils.list_pgdb()`` reports no PGDB)
    """
    # Check if there are files/folders inside the input folder.
    # And do not use hidden folder/file (beginning with '.').
    species_folders = [
        species_folder for species_folder in os.listdir(input_folder)
        if not species_folder.startswith('.')
    ]
    if not species_folders:
        logger.critical(
            "No folder containing genbank/gff file. In {0} you must have sub-folders containing Genbank/GFF file."
            .format(input_folder))
        return None, None

    # Remove Pathologic taxon ID file.
    if 'taxon_id.tsv' in species_folders:
        species_folders.remove('taxon_id.tsv')

    # Check if there is a Genbank, a GFF or a PathoLogic file inside each subfolder.
    check_species_folders = set()
    for species_folder in species_folders:
        species_folder_path = os.path.join(input_folder, species_folder)

        # Check the structure of the input folder. This has to happen before
        # os.listdir, which raises NotADirectoryError on a plain file.
        if os.path.isfile(species_folder_path):
            logger.critical(
                'Error: file inside the input_folder ({0}) instead of a subfolder. Check that you have a structure file of input_folder/species_1/species1.gbk and not input_folder/species_1.gbk.'
                .format(species_folder_path))
            return None, None

        # Distinct extensions of the usable input files of this species.
        species_input_files = set()
        for species_file in os.listdir(species_folder_path):
            species_filename, species_file_extension = os.path.splitext(
                species_file)
            if species_file_extension in ('.gbk', '.gbff', '.gff'):
                # GenBank/GFF files must be named after their folder.
                if species_filename == species_folder:
                    check_species_folders.add(species_folder)
                    species_input_files.add(species_file_extension)
            elif species_file_extension == '.pf':
                # Compare the extension itself: a substring test would also
                # accept names that merely contain '.pf' (e.g. 'x.pfam').
                check_species_folders.add(species_folder)
                species_input_files.add(species_file_extension)

        if len(species_input_files) > 1:
            logger.critical(
                'Multiple input files for {0}, there must be only one type of files among: GenBank, GFF or multiple PF files'
                .format(species_folder))
            return None, None
        if not species_input_files:
            logger.critical(
                'Missing input file for {0}. A GenBank file, GFF file or multiple PF files are required.'
                .format(species_folder))
            return None, None

    if not check_species_folders:
        missing_input_files = list(set(run_ids) - check_species_folders)
        logger.critical(
            'Missing Genbank/GFF/PF file for: {0} \nCheck for input files (.gbk/.gbff/.gff/.pf)'
            .format(','.join(missing_input_files)))
        return None, None

    # The folder name is used as the PGDB ID: reject unsupported characters.
    invalid_characters = ['.', '/']
    for species_folder in check_species_folders:
        if any(char in invalid_characters for char in species_folder):
            logger.critical(
                'Error: . or / in genbank/gff name {0} \nGenbank name is used as an ID in Pathway Tools and Pathway Tools does not create PGDB with . in ID.'
                .format(species_folder))
            return None, None

    # Take run_ids and remove folder with error (with the intersection with check_species_folders) and if there is already present output.
    clean_run_ids = set(run_ids).intersection(check_species_folders)

    if output_folder and os.path.exists(output_folder):
        if not os.path.isdir(output_folder):
            logger.info(output_folder + " is not a valid output folder.")
            return None, None

        already_present_outputs = os.listdir(output_folder)
        new_run_ids = list(clean_run_ids - set(already_present_outputs))
        for pgdb in already_present_outputs:
            if pgdb in clean_run_ids:
                logger.warning(
                    "! PGDB {0} already in output folder {1}, no inference will be launched on this species."
                    .format(pgdb, output_folder))

        if not new_run_ids:
            logger.info(
                "All PGDBs are already present in the output folder. Remove them if you want a new inference."
            )
            return None, None
    else:
        # No output folder given, or it does not exist yet: nothing to skip.
        new_run_ids = list(clean_run_ids)

    # Check for PGDB in ptools-local to see if PGDB are already present but they haven't been exported.
    # The last three characters (the 'cyc' suffix) are dropped to get the ID.
    already_present_pgdbs = [
        pgdb_species_folder[:-3] for pgdb_species_folder in utils.list_pgdb()
    ]

    if not already_present_pgdbs:
        return new_run_ids, None

    # Check the already finished PGDBs.
    pathologic_builds = compare_input_ids_to_ptools_ids(
        new_run_ids, already_present_pgdbs, 'intersection')

    # Check for unfinished build of PGDB using their pathologic.log file:
    # a build whose log exists but does not contain 'Done' is unfinished.
    logger.info("Check and delete unfinished builds of Pathway Tools.")
    unfinished_builds = []
    for pathologic_build in pathologic_builds:
        pathologic_file = os.path.join(input_folder, pathologic_build,
                                       'pathologic.log')
        if os.path.exists(pathologic_file):
            with open(pathologic_file, 'r') as pathologic_log:
                if 'Done' not in pathologic_log.read():
                    unfinished_builds.append(pathologic_build.lower())

    # Delete the unfinished PGDBs.
    if unfinished_builds:
        utils.remove_pgdbs([
            unfinished_build + 'cyc'
            for unfinished_build in unfinished_builds
        ], number_cpu_to_use)

    already_present_pgdbs = list(
        set(already_present_pgdbs) - set(unfinished_builds))

    # IDs without a PGDB need PathoLogic + flat files; IDs with a finished
    # PGDB only need the flat files.
    run_patho_flat_ids = compare_input_ids_to_ptools_ids(
        new_run_ids, already_present_pgdbs, 'difference')
    run_flat_ids = compare_input_ids_to_ptools_ids(new_run_ids,
                                                   already_present_pgdbs,
                                                   'intersection')

    for run_flat_id in run_flat_ids:
        logger.info(
            "! PGDB {0} already in ptools-local, no PathoLogic inference will be launched on this species."
            .format(run_flat_id))
    return run_patho_flat_ids, run_flat_ids