def index_gtf_files(index_file_name, search_path_root, regex_input, cache_file_pattern, identifier_pattern, ignore_cache = False, logger = None):
    """
    Iterate through a directory, looking for all GTF files matching a regular expression.
    Cache the GTF data to a file and write the location of the file to an index file

    :param index_file_name: path of the index file, which is (over)written with one
        tab-separated line of <identifier>, <GTF path>, <cache path> per matching file
    :type index_file_name: string
    :param search_path_root: directory that is walked recursively
    :type search_path_root: string
    :param regex_input: regular expression searched for in the absolute path of each file
    :type regex_input: string
    :param cache_file_pattern: replacement template substituted for the regex match to give
        the cache file path; ``{INDEX_FILE_PATH}`` is first expanded to the directory of
        ``index_file_name``
    :type cache_file_pattern: string
    :param identifier_pattern: replacement template substituted for the regex match to give
        the identifier
    :type identifier_pattern: string
    :param ignore_cache: forwarded unchanged to ``index_genes(...)``
    :param logger: optional logger; when ``None`` no diagnostics are emitted here
        (it is still forwarded as-is to ``index_genes(...)``)
    """
    # ``logger`` is optional, so verbose diagnostics require both a logger and verbosity > 1.
    verbose_logging = logger is not None and options.verbose > 1
    if verbose_logging:
        logger.debug("Using regular expression %s" % regex_input)

    index_file_path = os.path.dirname(index_file_name)

    # Compile before opening the index so an invalid expression cannot truncate an existing index.
    regex = re.compile(regex_input)

    # The context manager guarantees the index is flushed and closed, even if indexing fails midway.
    with open(index_file_name, "w") as index_file:
        for root, _dirs, files in os.walk(search_path_root):
            for file_name in files:
                file_path = os.path.abspath(os.path.join(root, file_name))
                if not regex.search(file_path):
                    continue

                if verbose_logging:
                    logger.debug("Indexing %s" % file_name)

                # Only the matched part of the path is replaced; any unmatched remainder is kept.
                cache_file_path = regex.sub(cache_file_pattern.format(INDEX_FILE_PATH = index_file_path), file_path)
                identifier = regex.sub(identifier_pattern, file_path)
                index_file.write("%s\t%s\t%s\n" % (identifier, file_path, cache_file_path))

                gene_structures = gene.t_parse_gtf(identifier)
                if logger is not None:
                    logger.debug("%s\t%s\t%s" % (identifier, file_path, cache_file_path))
                gene_structures.index_genes(file_path, cache_file_path, logger, ignore_cache = ignore_cache)
def get_indexed_gene_types_matching_gtf_file_name(index_file_name, logger, regex_str):
    """
    Look up the gene structures of every indexed GTF file whose original path matches ``regex_str``

    More than one index entry (e.g. more than one species) may match, so a list is returned.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: object whose ``info`` method receives progress messages; it is also
        forwarded to ``get_gene_types(...)``
    :param regex_str: regular expression searched for in each original GTF path in the index
    :type regex_str: string
    :rtype: list of tuples of (<matching identifier>, <original GTF path>, <dictionary of lists of genes>);
        empty when nothing matched
    """
    entries = _read_index_file(index_file_name)
    pattern = re.compile(regex_str)

    matches = []
    # every index entry is (<identifier>, <original_path>, <cache_path>)
    for identifier, gtf_path, cache_path in entries:
        if pattern.search(gtf_path) is None:
            continue
        parser = gene.t_parse_gtf(identifier)
        logger.info("Get indexed gene_types for %s from %s" % (identifier, gtf_path))
        gene_types = parser.get_gene_types(gtf_path, cache_path, logger)
        matches.append((identifier, gtf_path, gene_types))

    if not matches:
        logger.info("Regular expression %s did not match any entries in the index file %s" % (regex_str, index_file_name))
    return matches
def get_indexed_gene_types_for_identifier(index_file_name, logger, identifier):
    """
    Look up the gene structures of the indexed GTF file registered under ``identifier``

    An index entry matches when its identifier equals ``identifier`` either verbatim or
    after replacing spaces with underscores.

    :param index_file_name: path to index file created by ``index_gtf_files(...)``
    :type index_file_name: string
    :param logger: object whose ``info`` method receives progress messages; it is also
        forwarded to ``get_gene_types(...)``
    :param identifier: identifier parsed from the GTF file name by ``index_gtf_files(...)``
    :type identifier: string
    :rtype: tuple of (<matching identifier>, <original GTF path>, <dictionary of lists of genes>),
        or (None, <list of all identifiers in the index>, None) when nothing matched
    """
    # Entries are visited in descending sort order; presumably this puts higher
    # version numbers first, so the first hit wins.
    entries = sorted(_read_index_file(index_file_name), reverse = True)

    for entry_id, gtf_path, cache_path in entries:
        if entry_id != identifier and entry_id != identifier.replace(' ', '_'):
            continue
        parser = gene.t_parse_gtf(entry_id)
        logger.info("Get indexed genes for %s from %s" % (entry_id, gtf_path))
        return (entry_id, gtf_path, parser.get_gene_types(gtf_path, cache_path, logger))

    logger.info("Identifier %s was not found in the index file %s" % (identifier, index_file_name))
    # Hand back every known identifier so the caller can report what is available.
    known_ids = [entry_id for entry_id, _, _ in entries]
    return (None, known_ids, None)
def get_gene_dict(self):
    """
    Return the genes read from ``self.gtf_file``, loading them on first use

    The result is stored in ``self.gene_dict`` and returned as-is on later calls;
    only ``None`` (not an empty container) triggers a load.
    """
    if self.gene_dict is not None:
        return self.gene_dict

    # First access: parse via the ``gene`` module, using the module-level ``logger``.
    parser = gene.t_parse_gtf('test')
    self.gene_dict = parser.get_genes(self.gtf_file, logger=logger)
    return self.gene_dict