示例#1
0
def get_indexed_gene_types_matching_gtf_file_name(index_file_name, logger,
                                                  regex_str):
    """
    Collect the gene types of every indexed GTF file whose path matches ``regex_str``.

    Each index entry is unpacked as (identifier, original GTF path, cache path).
    The regular expression is searched for anywhere in the original GTF path;
    every hit is loaded via ``gene.t_parse_gtf(<identifier>).get_gene_types(...)``.
    Several entries may match, so a list is returned. If nothing matches, a
    message is logged and the list is empty.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: logger used for ``info`` messages and handed on to ``get_gene_types``
    :param regex_str: regular expression searched for in each original GTF path
    :type regex_str: string
    :rtype list of tuples of (<matching identifier>, <original GTF path>, <result of ``get_gene_types``>)
    """
    index_entries = _read_index_file(index_file_name)
    path_matcher = re.compile(regex_str)

    matched = []
    # every entry is <identifier>, <original_path>, <cache_path>
    for gtf_id, gtf_path, cache_path in index_entries:
        if path_matcher.search(gtf_path) is None:
            continue
        parser = gene.t_parse_gtf(gtf_id)
        logger.info("Get indexed gene_types for %s from %s" %
                    (gtf_id, gtf_path))
        gene_types = parser.get_gene_types(gtf_path, cache_path, logger)
        matched.append((gtf_id, gtf_path, gene_types))

    if not matched:
        logger.info(
            "Regular expression %s did not match any entries in the index file %s"
            % (regex_str, index_file_name))
    return matched
示例#2
0
def get_indexed_gene_types_for_identifier(index_file_name, logger, identifier):
    """
    Get the gene types of the indexed GTF file whose identifier equals ``identifier``

    An index entry matches when its identifier equals ``identifier`` either
    verbatim or after the spaces in ``identifier`` are replaced by underscores.
    Entries are visited in descending sort order and the first match wins.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: logger used for ``info`` messages and handed on to ``get_gene_types``
    :param identifier: identifier parsed from the GTF file name by ``index_gtf_files(...)``
    :type identifier: string
    :rtype tuple of (<matching identifier>, <original GTF path>, <result of ``get_gene_types``>)
           on success; ``(None, <list of all identifiers in the index>, None)`` when
           the identifier is not found
    """
    # Descending order of the (identifier, path, cache path) entries, intended to
    # retrieve higher version numbers first.
    # NOTE(review): the comparison is presumably lexicographic on strings - confirm
    # that this yields the wanted version order.
    index_data = sorted(_read_index_file(index_file_name), reverse=True)

    # Both accepted spellings are loop-invariant, so build them once
    accepted_ids = (identifier, identifier.replace(' ', '_'))

    all_ids = []
    for gtf_id, original_gtf_path, gtf_cache_path in index_data:
        all_ids.append(gtf_id)
        if gtf_id not in accepted_ids:
            continue
        gene_structures = gene.t_parse_gtf(gtf_id)
        # Message names gene_types, consistent with what this function retrieves
        logger.info("Get indexed gene_types for %s from %s" %
                    (gtf_id, original_gtf_path))
        return (gtf_id, original_gtf_path,
                gene_structures.get_gene_types(original_gtf_path,
                                               gtf_cache_path, logger))

    logger.info("Identifier %s was not found in the index file %s" %
                (identifier, index_file_name))
    # On a miss the middle slot carries every identifier seen, not a path
    return (None, all_ids, None)
def index_gtf_files(index_file_name, search_path_root, regex_input, cache_file_pattern,
                    identifier_pattern, ignore_cache = False, logger = None):
    """
    Iterate through a directory, looking for all GTF files matching a regular expression.
    Cache the GTF data to a file and write the location of the file to an index file

    For every file under ``search_path_root`` whose absolute path matches
    ``regex_input``, one tab-separated line of identifier, GTF path and cache
    path is written to the index file, and the file is indexed through
    ``gene.t_parse_gtf(<identifier>).index_genes(...)``.

    :param index_file_name: path of the index file; it is opened for writing,
        so existing content is replaced
    :param search_path_root: root of the directory tree walked with ``os.walk``
    :param regex_input: regular expression searched for in each absolute file path
    :param cache_file_pattern: replacement template for ``regex.sub`` producing the
        cache file path; ``{INDEX_FILE_PATH}`` is first filled in with the
        directory of ``index_file_name``
    :param identifier_pattern: replacement template for ``regex.sub`` producing the
        identifier
    :param ignore_cache: forwarded unchanged to ``index_genes``
    :param logger: optional logger for ``debug`` messages; also forwarded to
        ``index_genes``. Extra messages are emitted when the module-level
        ``options.verbose`` is greater than 1
    """
    # ``logger`` defaults to None, so verbose messages must be skipped without one
    verbose = bool(logger) and options.verbose > 1
    if verbose:
        logger.debug("Using regular expression %s" % regex_input)

    # Build both patterns before open(..., "w") truncates the index file, so that
    # a malformed pattern cannot wipe an existing index
    regex = re.compile(regex_input)
    index_file_path = os.path.dirname(index_file_name)
    cache_file_template = cache_file_pattern.format(INDEX_FILE_PATH = index_file_path)

    # The context manager closes (and flushes) the index even if indexing fails
    with open(index_file_name, "w") as index_file:
        for root, _dirs, files in os.walk(search_path_root):
            for file_name in files:
                file_path = os.path.abspath(os.path.join(root, file_name))
                if not regex.search(file_path):
                    continue
                if verbose:
                    logger.debug("Indexing %s" % file_name)
                cache_file_path = regex.sub(cache_file_template, file_path)
                identifier      = regex.sub(identifier_pattern, file_path)
                index_file.write("%s\t%s\t%s\n" % (identifier, file_path, cache_file_path))

                gene_structures = gene.t_parse_gtf(identifier)
                if logger:
                    logger.debug("%s\t%s\t%s" % (identifier, file_path, cache_file_path))
                gene_structures.index_genes(file_path, cache_file_path, logger, ignore_cache = ignore_cache)
def get_indexed_genes_matching_gtf_file_name(index_file_name, logger, regex_str):
    """
    Load the genes of every indexed GTF file whose path matches ``regex_str``.

    The index yields (identifier, original GTF path, cache path) entries. An
    entry is selected when the regular expression is found anywhere in its
    original GTF path; its genes are then fetched via
    ``gene.t_parse_gtf(<identifier>).get_genes(...)``. More than one species may
    match, hence the list result. A message is logged if no entry matches.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: logger used for ``info`` messages and handed on to ``get_genes``
    :param regex_str: regular expression searched for in each original GTF path
    :type regex_str: string
    :rtype list of tuples of (<matching identifier>, <original GTF path>, <result of ``get_genes``>)
    """
    index_rows = _read_index_file(index_file_name)
    pattern = re.compile(regex_str)

    found = []
    # row layout: <identifier>, <original_path>, <cache_path>
    for species_id, source_path, cached_path in index_rows:
        if pattern.search(source_path):
            structures = gene.t_parse_gtf(species_id)
            logger.info("Get indexed genes for %s from %s" %
                        (species_id, source_path))
            genes = structures.get_genes(source_path, cached_path, logger)
            found.append((species_id, source_path, genes))

    if len(found) == 0:
        logger.info("Regular expression %s did not match any entries in the index file %s" %
                    (regex_str, index_file_name))
    return found
示例#5
0
def index_gtf_files(index_file_name,
                    search_path_root,
                    regex_input,
                    cache_file_pattern,
                    identifier_pattern,
                    ignore_cache=False,
                    logger=None):
    """
    Iterate through a directory, looking for all GTF files matching a regular expression.
    Cache the GTF data to a file and write the location of the file to an index file

    For every file under ``search_path_root`` whose absolute path matches
    ``regex_input``, one tab-separated line of identifier, GTF path and cache
    path is written to the index file, and the file is indexed through
    ``gene.t_parse_gtf(<identifier>).index_genes(...)``.

    :param index_file_name: path of the index file; it is opened for writing,
        so existing content is replaced
    :param search_path_root: root of the directory tree walked with ``os.walk``
    :param regex_input: regular expression searched for in each absolute file path
    :param cache_file_pattern: replacement template for ``regex.sub`` producing the
        cache file path; ``{INDEX_FILE_PATH}`` is first filled in with the
        directory of ``index_file_name``
    :param identifier_pattern: replacement template for ``regex.sub`` producing the
        identifier
    :param ignore_cache: forwarded unchanged to ``index_genes``
    :param logger: optional logger for ``debug`` messages; also forwarded to
        ``index_genes``. Extra messages are emitted when the module-level
        ``options.verbose`` is greater than 1
    """
    # ``logger`` defaults to None, so verbose messages must be skipped without one
    verbose = bool(logger) and options.verbose > 1
    if verbose:
        logger.debug("Using regular expression %s" % regex_input)

    # Build both patterns before open(..., "w") truncates the index file, so that
    # a malformed pattern cannot wipe an existing index
    regex = re.compile(regex_input)
    index_file_path = os.path.dirname(index_file_name)
    cache_file_template = cache_file_pattern.format(
        INDEX_FILE_PATH=index_file_path)

    # The context manager closes (and flushes) the index even if indexing fails
    with open(index_file_name, "w") as index_file:
        for root, _dirs, files in os.walk(search_path_root):
            for file_name in files:
                file_path = os.path.abspath(os.path.join(root, file_name))
                if not regex.search(file_path):
                    continue
                if verbose:
                    logger.debug("Indexing %s" % file_name)
                cache_file_path = regex.sub(cache_file_template, file_path)
                identifier = regex.sub(identifier_pattern, file_path)
                index_file.write("%s\t%s\t%s\n" %
                                 (identifier, file_path, cache_file_path))

                gene_structures = gene.t_parse_gtf(identifier)
                if logger:
                    logger.debug("%s\t%s\t%s" %
                                 (identifier, file_path, cache_file_path))
                gene_structures.index_genes(file_path,
                                            cache_file_path,
                                            logger,
                                            ignore_cache=ignore_cache)
def get_indexed_genes_for_identifier(index_file_name, logger, identifier):
    """
    Load the genes of the indexed GTF file whose identifier equals ``identifier``.

    An entry matches when its identifier equals ``identifier`` as given, or
    ``identifier`` with spaces replaced by underscores. Entries are examined in
    descending sort order and the first match is returned.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: logger used for ``info`` messages and handed on to ``get_genes``
    :param identifier: identifier parsed from the GTF file name by ``index_gtf_files(...)``
    :type identifier: string
    :rtype tuple of (<matching identifier>, <original GTF path>, <result of ``get_genes``>)
           on success; ``(None, <list of all identifiers in the index>, None)`` when
           the identifier is not found
    """
    # descending order, so that higher version numbers are retrieved first
    entries = list(_read_index_file(index_file_name))
    entries.sort(reverse=True)

    seen_ids = []
    for gtf_id, gtf_path, cache_path in entries:
        seen_ids.append(gtf_id)
        is_match = gtf_id == identifier or gtf_id == identifier.replace(' ', '_')
        if not is_match:
            continue
        parser = gene.t_parse_gtf(gtf_id)
        logger.info("Get indexed genes for %s from %s" % (gtf_id, gtf_path))
        genes = parser.get_genes(gtf_path, cache_path, logger)
        return (gtf_id, gtf_path, genes)

    logger.info("Identifier %s was not found in the index file %s" %
                (identifier, index_file_name))
    return (None, seen_ids, None)