def build_mapping(record):
    """
    Create or update the LRG transcript mapping described by `record`.

    `record` is a dictionary with the keys `chromosome`, `transcript`,
    `strand`, `cds_start`, `cds_stop`, `gene`, `start`, `stop` and `exons`
    (an iterable of `(start, stop)` pairs). The name `chromosomes` is taken
    from the surrounding scope; it is not defined in this function.

    :raises ValueError: If none of the known chromosomes is named
      `'chr' + record['chromosome']`.
    """
    # Only use records on chromosomes we know.
    for candidate in chromosomes:
        if candidate.name == 'chr' + record['chromosome']:
            chromosome = candidate
            break
    else:
        raise ValueError()

    # The transcript field is the accession and the transcript number,
    # separated by a `t`.
    accession, transcript_number = record['transcript'].split('t')
    transcript_number = int(transcript_number)

    if record['strand'] == '-1':
        orientation = 'reverse'
    else:
        orientation = 'forward'

    # A false `cds_start` means there is no CDS for this transcript.
    cds = ((record['cds_start'], record['cds_stop'])
           if record['cds_start'] else None)

    exon_starts = [first for first, _ in record['exons']]
    exon_stops = [last for _, last in record['exons']]

    # TODO: Also take protein into account. For example, in LRG_321 (TP53)
    # some transcripts occur twice (with different CDSs and different
    # protein numbers).
    # https://github.com/mutalyzer/mutalyzer/issues/372
    return TranscriptMapping.create_or_update(
        chromosome, 'lrg', accession, record['gene'], orientation,
        record['start'], record['stop'], exon_starts, exon_stops, 'ebi',
        transcript=transcript_number, cds=cds, select_transcript=True)
def build_mappings(records):
    """
    Generate a RefSeq transcript mapping for every transcript in `records`.

    Each record is a dictionary with at least the keys `transcript` and
    `feature_type`. The gene name is read from `feature_name` of the first
    `GENE` record stored under the transcript value `-`; `RNA`, `UTR` and
    `CDS` records are read for their `start` and `stop` (and, for `RNA`,
    `chromosome` and `orientation`).

    This is a generator: nothing is processed until it is iterated.
    """
    # We structure the records per transcript and per record type. This is
    # generalized to a list of records for each type, but we expect only
    # one GENE record (with `-` as transcript value).
    # Note that there can be more than one RNA record per transcript if it
    # is split over different reference contigs.
    grouped = defaultdict(lambda: defaultdict(list))
    for record in records:
        grouped[record['transcript']][record['feature_type']].append(record)

    gene_record = grouped['-']['GENE'][0]
    gene = gene_record['feature_name']

    for name, features in grouped.items():
        # The `-` entry only carries the gene record, not a transcript.
        if name == '-':
            continue

        accession, version = name.split('.')
        version = int(version)

        rna_parts = features['RNA']
        chromosome = rna_parts[0]['chromosome']
        if rna_parts[0]['orientation'] == '-':
            orientation = 'reverse'
        else:
            orientation = 'forward'
        start = min(part['start'] for part in rna_parts)
        stop = max(part['stop'] for part in rna_parts)

        exon_starts = []
        exon_stops = []
        cds_positions = []
        pieces = sorted(features['UTR'] + features['CDS'],
                        key=itemgetter('start'))
        for piece in pieces:
            # Position a previous exon must end on to be directly adjacent.
            adjacent = piece['start'] - 1

            if exon_stops and exon_stops[-1] > adjacent:
                # This exon starts before the end of the previous exon. We
                # have no idea what to do in this case, so we ignore it.
                # The number of transcripts affected is very small (e.g.,
                # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly).
                continue

            if piece['feature_type'] == 'CDS':
                cds_positions.extend([piece['start'], piece['stop']])

            if exon_stops and exon_stops[-1] == adjacent:
                # This exon must be merged with the previous one because
                # it is split over two entries (a CDS part and a UTR part
                # or split over different reference contigs).
                exon_stops[-1] = piece['stop']
            else:
                exon_starts.append(piece['start'])
                exon_stops.append(piece['stop'])

        cds = ((min(cds_positions), max(cds_positions))
               if cds_positions else None)

        # If no exons are annotated, we create one spanning the entire
        # transcript.
        if not exon_starts:
            exon_starts = [start]
            exon_stops = [stop]

        yield TranscriptMapping.create_or_update(
            chromosome, 'refseq', accession, gene, orientation, start, stop,
            exon_starts, exon_stops, 'ncbi', cds=cds, version=version)
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server (`genome-mysql.cse.ucsc.edu`), using
    `assembly.alias` as the database name, selects all mRNA RefSeq
    transcripts whose `name2` equals `gene`, and creates or updates a
    transcript mapping for each of them. The mappings are added to the
    session and committed in one go.

    The cursor and the connection are always closed, also when the query
    fails.

    :arg assembly: Assembly to import mappings for. Its `alias` selects the
      UCSC database and its `chromosomes` are looked up by UCSC name.
    :arg gene: Gene name to import transcript mappings for.
    """
    # NOTE(review): this module contains a second definition of
    # `import_from_ucsc_by_gene` further down; the later definition is the
    # one that ends up bound to the name. Confirm which one is intended.
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    # Fetch everything up front so the remote connection can be released
    # before we start touching our own database.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # The exon columns are comma-separated lists; the `if i` filter
        # drops empty items (e.g., from a trailing comma).
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        # A false (zero or empty) CDS boundary is treated as "no CDS".
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    The reference is loaded with a GenBank retriever and one transcript
    mapping is created or updated per gene that has at least one
    transcript. All mappings are placed on the `chrM` chromosome of
    `assembly`, added to the session and committed in one go.

    :arg assembly: Assembly whose `chrM` chromosome receives the mappings.
    :arg reference: Reference identifier passed to the retriever's
      `loadrecord`.

    :raises ValueError: If the molecule type of the loaded record is not
      `'m'`.

    .. todo: Also report how much was added/updated.

    .. note: Currently no exon locations are supported, this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    # With more than one gene on the reference, a mapping has to be
    # selected explicitly.
    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene: the first one when
        # ordered by name. Genes without transcripts are skipped.
        transcripts = sorted(gene.transcriptList, key=attrgetter('name'))
        if not transcripts:
            continue
        transcript = transcripts[0]

        # We use gene.location for now, it is always present and the same
        # for our purposes (the alternative would be the mRNA location of
        # the transcript).
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        # Not every transcript has a CDS with a location.
        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference', cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server (`genome-mysql.cse.ucsc.edu`), using
    `assembly.alias` as the database name, selects all mRNA RefSeq
    transcripts whose `name2` equals `gene`, and creates or updates a
    transcript mapping for each of them. The mappings are added to the
    session and committed in one go.

    The cursor and the connection are always closed, also when the query
    fails.

    :arg assembly: Assembly to import mappings for. Its `alias` selects the
      UCSC database and its `chromosomes` are looked up by UCSC name.
    :arg gene: Gene name to import transcript mappings for.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    # Fetch everything up front so the remote connection can be released
    # before we start touching our own database.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # The exon columns are comma-separated lists; the `if i` filter
        # drops empty items (e.g., from a trailing comma).
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        # A false (zero or empty) CDS boundary is treated as "no CDS".
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()