def main(args=None):
    """Extracts gene-level expression data from StringTie output.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read list of gene symbols
    logger.info('Reading gene data...')
    genes = misc.read_single(gene_file)

    # read StringTie output file and associate StringTie gene IDs with
    # gene symbols
    logger.info('Parsing StringTie output...')
    logger.info('Associating StringTie gene IDs with gene symbols...')
    stringtie_genes = {}
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            if l[0][0] == '#':
                continue
            assert len(l) == 9
            if l[2] != 'transcript':
                continue
            attr = parse_attributes(l[8])
            try:
                ref_gene = attr['ref_gene_name']
            except KeyError:
                continue
            else:
                # entry has a "ref_gene_name" attribute
                try:
                    g = stringtie_genes[attr['gene_id']]
                except KeyError:
                    stringtie_genes[attr['gene_id']] = {ref_gene}
                else:
                    g.add(ref_gene)
    logger.info('Associated %d gene IDs with gene symbols.',
                len(stringtie_genes))

    # C = Counter(len(v) for v in stringtie_genes.values())
    gene_ids_ambiguous = [k for k, v in stringtie_genes.items() if len(v) > 1]
    n = len(gene_ids_ambiguous)
    logger.info('%d / %d associated with multiple gene symbols (%.1f%%).',
                n, len(stringtie_genes),
                100 * (n / float(len(stringtie_genes))))

    # read StringTie output file again and summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)
    fpkm_novel_gene = 0
    fpkm_unknown_gene_name = 0
    fpkm_novel_trans = 0
    fpkm_ambig = 0
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            if l[0][0] == '#':
                # skip header
                continue
            assert len(l) == 9
            if l[2] != 'transcript':
                # skip exon lines
                continue
            attr = parse_attributes(l[8])
            f = float(attr['FPKM'])
            try:
                g = attr['ref_gene_name']
            except KeyError:
                if no_novel_transcripts:
                    # ignore this transcript
                    fpkm_novel_trans += f
                    continue
                else:
                    # see if we can assign a gene name based on the gene ID
                    try:
                        assoc = stringtie_genes[attr['gene_id']]
                    except KeyError:
                        # gene ID not associated with any reference gene
                        fpkm_novel_gene += f
                        continue
                    else:
                        if len(assoc) > 1:
                            # gene ID associated with multiple ref. genes
                            # => ignored
                            fpkm_ambig += f
                            continue
                        else:
                            # gene ID associated with exactly one ref. gene
                            g = list(assoc)[0]
            try:
                idx = misc.bisect_index(genes, g)
            except ValueError:
                fpkm_unknown_gene_name += f
                logger.warning('Unknown gene name: "%s".', g)
                continue
            t = float(attr['TPM'])
            fpkm[idx] += f
            tpm[idx] += t

    # ignored_fpkm = None
    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info('Ignored %.1f / %.1f FPKM (%.1f%%)',
                ignored_fpkm, total_fpkm,
                100 * (ignored_fpkm / total_fpkm))

    if no_novel_transcripts and fpkm_novel_trans > 0:
        logger.info('Ignored %.1f FPKM from novel transcripts (%.1f%%).',
                    fpkm_novel_trans, 100 * (fpkm_novel_trans / total_fpkm))
    else:
        if fpkm_novel_gene > 0:
            logger.info('Ignored %.1f FPKM from transcripts of novel genes '
                        '(%.1f%%).', fpkm_novel_gene,
                        100 * (fpkm_novel_gene / total_fpkm))
        if fpkm_ambig > 0:
            logger.info('Ignored %.1f FPKM from transcripts with ambiguous '
                        'gene membership (%.1f%%).', fpkm_ambig,
                        100 * (fpkm_ambig / total_fpkm))

    if fpkm_unknown_gene_name > 0:
        logger.info('Ignored %.1f FPKM from transcripts of genes with '
                    'unknown names (%.1f%%).', fpkm_unknown_gene_name,
                    100 * (fpkm_unknown_gene_name / total_fpkm))

    # write output file (one row per gene: symbol, FPKM, TPM)
    E = np.c_[fpkm, tpm]
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g] + ['%.5f' % e for e in E[i, :]])

    return 0
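# Usage sketch for the script above (illustration only, not part of the
# original source): main() reads only the attributes listed at its top from
# `args`, so a hand-built argparse.Namespace works for programmatic use.
# The file names below are hypothetical placeholders.
def _example_extract_expression():
    import argparse
    example_args = argparse.Namespace(
        stringtie_file='stringtie_output.gtf',     # hypothetical StringTie GTF
        gene_file='protein_coding_genes.tsv',      # hypothetical gene list
        no_novel_transcripts=True,                 # discard novel transcripts
        output_file='gene_expression.tsv',         # hypothetical output path
        log_file=None, quiet=False, verbose=False)
    return main(example_args)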
def get_gene_exons(gene_table, genome_annotation_file, chunksize=10000):
    """Parse GTF file and get a dictionary of gene => list of exon intervals.

    (Only for protein-coding genes.)

    TODO: docstring
    """
    # get gene names that are guaranteed to be unique
    # gene_names = get_readable_gene_identifiers(gene_table)

    # series with index = Ensembl ID, value = unique gene name
    # genes = pd.Series(index=gene_table.index, data=gene_names)

    # sort genes by chromosome, strand, and then position
    sorted_gene_ids = sorted(
        [id_ for id_ in gene_table.index],
        key=lambda id_: [gene_table.loc[id_, 'chromosome'],
                         gene_table.loc[id_, 'position'] < 0,
                         abs(gene_table.loc[id_, 'position'])])
    # genes = genes.loc[sorted_gene_ids]
    gene_table = gene_table.loc[sorted_gene_ids]

    # dictionary for holding list of intervals for each gene
    gene_exons = OrderedDict([id_, []] for id_ in gene_table.index)

    valid = 0
    total = 0
    _LOGGER.info('Parsing GTF file "%s" in chunks...', genome_annotation_file)
    for i, df in enumerate(pd.read_csv(
            genome_annotation_file, dtype={0: str}, sep='\t',
            comment='#', header=None, chunksize=chunksize)):
        # select only exon entries
        df_sel = df.loc[df.iloc[:, 2] == 'exon']
        # extract gene IDs
        gene_ids = df_sel.iloc[:, 8].apply(
            lambda x: gtf.parse_attributes(x)['gene_id'])
        for id_, chrom, start, end in zip(
                gene_ids, df_sel.iloc[:, 0],
                df_sel.iloc[:, 3], df_sel.iloc[:, 4]):
            total += 1
            try:
                gene = gene_table.loc[id_]
            except KeyError:
                # this gene is not contained in the gene table
                continue
            gene_chrom = gene_table.loc[id_, 'chromosome']
            if chrom != gene_chrom:
                _LOGGER.warning('%s exon ignored (wrong chromosome: '
                                '%s instead of %s).', id_, chrom, gene_chrom)
            else:
                valid += 1
                gene_exons[id_].append([start - 1, end])

    _LOGGER.info('%d / %d exons from valid genes (%.1f %%).',
                 valid, total, 100 * (valid / float(total)))
    return gene_exons
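# Usage sketch for get_gene_exons() (illustration only; the gene IDs,
# coordinates, and GTF path below are hypothetical). The function expects a
# table indexed by Ensembl gene ID with 'chromosome' and 'position' columns,
# where a negative position marks a gene on the '-' strand, as used by the
# sort key above.
def _example_gene_exons():
    import pandas as pd
    gene_table = pd.DataFrame(
        {'chromosome': ['1', '1'], 'position': [1000, -5000]},
        index=['ENSG00000000001', 'ENSG00000000002'])
    exons = get_gene_exons(gene_table, 'annotation.gtf.gz')
    # each value is a list of 0-based, half-open [start, end) exon intervals
    return exons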
def get_protein_coding_genes(
        path_or_buffer, chunksize=100000,
        chromosome_pattern=r"(?:\d\d?|MT|X|Y)$",
        include_polymorphic_pseudogenes=True, only_manual=False,
        remove_duplicates=True, fancy_sorting=True):
    r"""Get list of all protein-coding genes based on Ensembl GTF file.

    Parameters
    ----------
    path_or_buffer : str or buffer
        The GTF file (either the file path or a buffer).
    chunksize : int, optional
        Number of GTF rows to read per chunk. [100000]
    chromosome_pattern : str, optional
        Regular expression specifying valid chromosomes.
        [r'(?:\d\d?|MT|X|Y)$']
    include_polymorphic_pseudogenes : bool, optional
        Whether to include genes annotated as "polymorphic pseudogenes".
        [True]
    only_manual : bool, optional
        Whether to exclude annotations with source "ensembl", which are based
        only on an automatic annotation pipeline. [False]
    remove_duplicates : bool, optional
        Whether to remove duplicate annotations, i.e. those with different
        Ensembl IDs for the same gene. [True]
    fancy_sorting : bool, optional
        Whether to sort chromosomes numerically, with "X", "Y", and "MT" at
        the end. [True]

    Returns
    -------
    `pandas.DataFrame`
        Table with rows corresponding to protein-coding genes.

    Notes
    -----
    Annotation sources and redundant gene annotations
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    According to the Ensembl website (1), the Ensembl gene annotation GTF
    files for human, mouse, zebrafish, rat and pig essentially contain two
    sets of annotations: One set consists of all annotations with the
    "ensembl" source annotation (column 2). These annotations are the product
    of the automated Ensembl "genebuild" pipeline. The other set consists of
    genes that are manually annotated by the HAVANA team (source "havana"),
    some of which have been merged with the automatic annotations (source
    "ensembl_havana").

    There seems to be no overlap between genes annotated with "havana" and
    "ensembl_havana" sources, respectively. However, there are a few genes
    for which only annotations with source "ensembl" exist. Our policy is
    therefore to prefer annotations with source "ensembl_havana" and "havana"
    over those with source "ensembl", and to only keep annotations with
    source "ensembl" if there are no manually curated alternative
    annotations.

    A special case is represented by mitochondrial genes, which always have
    the source "insdc".

    (1) see http://www.ensembl.org/Help/Faq?id=152

    Removal of duplicates
    ~~~~~~~~~~~~~~~~~~~~~
    Unfortunately, the Ensembl gene annotations contain duplicates for a
    handful of genes. For example, for MATR3, there are ENSG00000015479 and
    ENSG00000280987, both of type "ensembl_havana". There seems to be no
    clear criterion by which we could rationally and automatically choose one
    ID over the other, at least based on information contained in the GTF
    file. We therefore remove duplicates according to the following policy:

    - For genes on the '+' strand, keep the gene with the left-most starting
      position.
    - For genes on the '-' strand, keep the gene with the right-most starting
      position.

    (In case the starting positions are equal, we keep the one that occurs
    first in the GTF file.)

    We would like to use the `pandas.DataFrame.drop_duplicates()` function
    for this. So we're temporarily reordering genes using their signed
    position, and then we're using the original index (position) to restore
    the original order.
    """
    chrompat = re.compile(chromosome_pattern)

    c = 0
    num_lines = 0
    num_chunks = 0

    t0 = time.time()

    reader = pd.read_csv(path_or_buffer, encoding="ascii", sep="\t",
                         header=None, comment="#", dtype={0: str},
                         chunksize=chunksize)

    data = []
    header = ["Gene", "Ensembl_ID", "Chromosome", "Position", "Length",
              "Source", "Type"]

    valid_biotypes = set(["protein_coding"])
    if include_polymorphic_pseudogenes:
        valid_biotypes.add("polymorphic_pseudogene")

    valid_sources = set(["ensembl_havana", "havana", "insdc"])
    if not only_manual:
        valid_sources.add("ensembl")

    excluded_chromosomes = set()
    for j, df in enumerate(reader):
        num_chunks += 1
        num_lines += df.shape[0]

        # "insdc" is required to catch the mitochondrial protein-coding genes
        sel = (df.iloc[:, 2] == "gene") & df.iloc[:, 1].isin(valid_sources)
        # c += sel.sum()

        for i, row in df.loc[sel].iterrows():
            attr = gtf.parse_attributes(row[8].lstrip(" "))
            biotype = attr["gene_biotype"]

            if biotype not in valid_biotypes:
                continue

            chrom = str(row[0])
            source = row[1]

            match = chrompat.match(chrom)
            if match is None:
                excluded_chromosomes.add(chrom)
                continue

            c += 1

            gene_name = attr["gene_name"]
            ensembl_id = attr["gene_id"]

            assert row[6] in ["+", "-"]
            if row[6] == "+":
                pos = int(row[3]) - 1
            elif row[6] == "-":
                pos = -int(row[4])
            else:
                raise ValueError("Invalid strand information: %s"
                                 % str(row[6]))
            length = abs(int(row[4]) - int(row[3]))
            data.append([gene_name, ensembl_id, chrom, pos, length,
                         source, biotype])

    t1 = time.time()

    df = pd.DataFrame(columns=header, data=data)

    if not only_manual:
        # keep annotations with source "ensembl" only if no manual
        # annotations are available for the same gene
        sel = (df["Source"] == "ensembl")
        redundant_ensembl_genes = (set(df.loc[sel, "Gene"].values) &
                                   set(df.loc[~sel, "Gene"].values))
        sel = sel & df["Gene"].isin(redundant_ensembl_genes)
        num_genes_before = df.shape[0]
        df = df.loc[~sel]
        num_genes_after = df.shape[0]
        logger.info('Removed %d gene annotations with source "ensembl" that '
                    'also had manual annotations.',
                    num_genes_before - num_genes_after)

    if remove_duplicates:
        # remove duplicate annotations (two or more Ensembl IDs for the same
        # gene)
        num_genes_before = df.shape[0]
        # sort by signed position value
        df.sort_values("Position", kind="mergesort", inplace=True)
        # remove duplicates by keeping the first occurrence
        df.drop_duplicates(["Chromosome", "Gene"], inplace=True)
        # restore original order using the numeric index
        df.sort_index(inplace=True)
        num_genes_after = df.shape[0]
        logger.info("Removed %d duplicate gene entries",
                    num_genes_before - num_genes_after)

    # sort normally (first by chromosome, then by absolute position)
    df_sort = pd.concat([df["Chromosome"], df["Position"].abs()], axis=1)
    df_sort = df_sort.sort_values(["Chromosome", "Position"],
                                  kind="mergesort")
    df = df.loc[df_sort.index]

    if fancy_sorting:
        # Perform "fancy sorting" of genes. Chromosomes with numbers (1-22)
        # are ordered numerically, followed by the X, Y, and MT chromosomes.
        def transform_chrom(chrom):
            try:
                c = int(chrom)
            except ValueError:
                if chrom == "MT":
                    return "_MT"
                else:
                    return chrom
            else:
                return "%02d" % c

        chrom_for_sorting = df["Chromosome"].apply(transform_chrom)
        a = chrom_for_sorting.argsort(kind="mergesort")
        df = df.iloc[a]
        logger.info("Performed fancy sorting of chromosomes.")

    logger.info("Read %d lines (in %d chunks).", num_lines, num_chunks)
    logger.info("Found %d valid protein-coding gene entries.", c)
    logger.info("Final number of unique protein-coding genes: %d",
                df.shape[0])
    logger.info("Parsing time: %.1f s", t1 - t0)

    # additional statistics
    all_chromosomes = list(df["Chromosome"].unique())
    logger.info("Valid chromosomes (%d): %s",
                len(all_chromosomes), ", ".join(all_chromosomes))
    logger.info("Excluded chromosomes (%d): %s",
                len(excluded_chromosomes),
                ", ".join(sorted(excluded_chromosomes)))

    logger.info("Sources:")
    for i, c in df["Source"].value_counts().items():
        logger.info("\t%s: %d", i, c)

    logger.info("Gene types:")
    for i, c in df["Type"].value_counts().items():
        logger.info("\t%s: %d", i, c)

    return df
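# Small self-contained sketch (illustration only, hypothetical coordinates)
# of the duplicate-removal trick described in the docstring above: encoding
# '+'-strand genes by (start - 1) and '-'-strand genes by -end means a single
# ascending sort on the signed position puts the preferred annotation first
# for both strands, so drop_duplicates() can simply keep the first occurrence
# and sort_index() restores the original row order afterwards.
def _example_duplicate_removal():
    import pandas as pd
    df = pd.DataFrame({
        "Chromosome": ["5", "5", "7", "7"],
        "Gene": ["GENE_PLUS", "GENE_PLUS", "GENE_MINUS", "GENE_MINUS"],
        # signed positions: '+' strand uses start - 1, '-' strand uses -end
        "Position": [100, 250, -900, -700],
    })
    df = df.sort_values("Position", kind="mergesort")  # preferred copy first
    df = df.drop_duplicates(["Chromosome", "Gene"])     # keep first occurrence
    return df.sort_index()                              # restore input order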
def main(args=None):
    """Extract all exon annotations of protein-coding genes."""
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: '
                '"%s"', chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for l in reader:
            i += 1
            # if i >= 500000: break
            # if i % int(1e5) == 0: report progress
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding', 'polymorphic_pseudogene']:
                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue
                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)
    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
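# Usage sketch for the exon-extraction script above (illustration only; the
# file paths are hypothetical). Passing an explicit chromosome_pattern means
# the ensembl.SPECIES_CHROMPAT lookup (and hence `species`) is not used.
def _example_extract_exons():
    import argparse
    example_args = argparse.Namespace(
        annotation_file='Homo_sapiens.GRCh38.gtf.gz',  # hypothetical Ensembl GTF
        output_file='protein_coding_exons.gtf',        # hypothetical output
        species=None,
        chromosome_pattern=r'(?:\d\d?|MT|X|Y)$',
        field_name='exon',                             # keep only exon rows
        log_file=None, quiet=False, verbose=False)
    return main(example_args)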
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: '
                '"%s"', chrom_pat.pattern)

    # for statistics
    types = Counter()
    sources = Counter()

    # primary information
    genes = Counter()
    gene_chroms = dict()
    gene_ids = dict()

    # secondary information
    genes2 = Counter()
    polymorphic = set()

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    transcripts = {}
    gene_id = None
    gene_name = None

    i = 0
    missing = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            i += 1
            # if i >= 500000: break
            # if i % int(1e5) == 0: report progress
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']

                if type_ not in ['protein_coding', 'polymorphic_pseudogene']:
                    continue

                chrom = l[0]

                # test whether chromosome is valid
                m = chrom_pat.match(chrom)
                if m is None:
                    excluded_chromosomes.add(chrom)
                    continue

                chromosomes.add(m.group())
                source = l[1]
                gene_id = attr['gene_id']
                try:
                    gene_name = attr['gene_name']
                except KeyError:
                    missing += 1
                    continue

                if gene_id in genes:
                    if genes[gene_id] != gene_name:
                        raise ValueError('Ensembl ID "%s" associated with '
                                         'multiple gene symbols.' % gene_id)
                else:
                    genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))
    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of gene IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for g in sorted(genes.keys()):
            writer.writerow([g, genes[g]])

    return 0
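# Usage sketch for the gene-ID extraction script above (illustration only;
# the file paths are hypothetical). field_name='gene' restricts parsing to
# gene-level GTF records, and an explicit chromosome_pattern bypasses the
# ensembl.SPECIES_CHROMPAT lookup.
def _example_extract_gene_ids():
    import argparse
    example_args = argparse.Namespace(
        annotation_file='Homo_sapiens.GRCh38.gtf.gz',  # hypothetical Ensembl GTF
        output_file='ensembl_gene_ids.tsv',            # hypothetical output: ID <tab> symbol
        species=None,
        chromosome_pattern=r'(?:\d\d?|MT|X|Y)$',
        field_name='gene',
        log_file=None, quiet=False, verbose=False)
    return main(example_args)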