def uploadrecord(self, raw_data):
    """
    Write an uploaded record to a file. If the uploaded file is recognised
    by its hash, the old UD number is used.

    @arg raw_data: A GenBank record.
    @type raw_data: byte string

    @return: Accession number for the uploaded file.
    @rtype: unicode
    """
    md5sum = self._calcHash(raw_data)

    try:
        reference = Reference.query.filter_by(checksum=md5sum).one()
    except NoResultFound:
        UD = self._newUD()
        if self.write(raw_data, UD, 0):
            reference = Reference(UD, md5sum)
            session.add(reference)
            session.commit()
            return UD
    else:
        if os.path.isfile(self._nametofile(reference.accession)):
            return reference.accession
        else:
            return (self.write(raw_data, reference.accession, 0) and
                    reference.accession)
def addJob(self, email, queue, columns, job_type, argument=None,
           create_download_url=None):
    """
    Add a job to the database and start the BatchChecker.

    @arg email: E-mail address of the batch supplier.
    @type email: unicode
    @arg queue: A list of jobs.
    @type queue: list
    @arg columns: The number of columns.
    @type columns: int
    @arg job_type: The type of batch job that should be run.
    @type job_type:
    @arg argument: Batch arguments, for now only build info.
    @type argument:
    @arg create_download_url: Function accepting a result_id and returning
        the URL for downloading the batch job result. Can be None.
    @type create_download_url: function

    @return: result_id
    @rtype:
    """
    # Add jobs to the database.
    batch_job = BatchJob(job_type, email=email, argument=argument)
    if create_download_url:
        batch_job.download_url = create_download_url(batch_job.result_id)
    session.add(batch_job)

    for i, inputl in enumerate(queue):
        # NOTE:
        # This is a very dirty way to skip entries before they are fed
        # to the batch processes. This is needed for e.g. an empty line
        # or because the File Module noticed wrong formatting. These
        # lines used to be discarded but are now preserved by the escape
        # string. The benefit of this is that the user's input will match
        # the output in terms of input line and output line.
        if inputl.startswith("~!"):  # Dirty escape.
            inputl = inputl[2:]
            if inputl:
                flag = "S0"  # Flag for wrong format.
            else:
                flag = "S9"  # Flag for empty line.
                inputl = " "  # Database doesn't like an empty input field.
        else:
            flag = None
        if (i + 1) % columns:
            # Add a flag for continuing the current row.
            flag = '%s%s' % (flag if flag else '', 'C0')
        item = BatchQueueItem(batch_job, inputl, flags=flag)
        session.add(item)

    session.commit()
    return batch_job.result_id
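# The flag handling in addJob above is a tiny protocol: "S0" marks a badly
# formatted line, "S9" an empty line, and "C0" a continuation of the current
# output row. A minimal standalone sketch of that logic (no database, and
# `_sketch_flags` is a hypothetical helper name, for illustration only):
def _sketch_flags(queue, columns):
    """Yield (input_line, flags) pairs as addJob would compute them."""
    for i, inputl in enumerate(queue):
        if inputl.startswith('~!'):  # Escaped entry to be skipped.
            inputl = inputl[2:]
            flag = 'S0' if inputl else 'S9'
            inputl = inputl or ' '
        else:
            flag = None
        if (i + 1) % columns:  # Not the last column of the row.
            flag = (flag or '') + 'C0'
        yield inputl, flag

# Example: list(_sketch_flags(['AB026906.1:c.274G>T', '~!', '~!bad line'], 3))
# gives [('AB026906.1:c.274G>T', 'C0'), (' ', 'S9C0'), ('bad line', 'S0')].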
def _update_db_md5(self, raw_data, name, source):
    """
    :arg str raw_data:
    :arg unicode name:
    :arg unicode source:

    :returns: filename
    :rtype: unicode
    """
    # TODO: Documentation.
    try:
        reference = Reference.query.filter_by(accession=name).one()
        current_md5sum = reference.checksum
    except NoResultFound:
        current_md5sum = None

    if current_md5sum:
        md5sum = self._calculate_hash(raw_data)
        if md5sum != current_md5sum:
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    name, current_md5sum, md5sum))
            Reference.query.filter_by(accession=name).update(
                {'checksum': md5sum})
            session.commit()
    else:
        reference = Reference(name, self._calculate_hash(raw_data), source)
        session.add(reference)
        session.commit()

    return self._name_to_file(name)
def uploadrecord(self, raw_data):
    """
    Write an uploaded record to a file. If the uploaded file is recognised
    by its hash, the old UD number is used.

    :arg str raw_data: A GenBank record.

    :returns: Accession number for the uploaded file.
    :rtype: unicode
    """
    md5sum = self._calculate_hash(raw_data)

    try:
        reference = Reference.query.filter_by(checksum=md5sum).one()
    except NoResultFound:
        ud = self._new_ud()
        if self.write(raw_data, ud, 0):
            reference = Reference(ud, md5sum)
            session.add(reference)
            session.commit()
            return ud
    else:
        if os.path.isfile(self._name_to_file(reference.accession)):
            return reference.accession
        else:
            return (self.write(raw_data, reference.accession, 0) and
                    reference.accession)
def sync_with_remote(self, remote_wsdl, url_template,
                     days=DEFAULT_CREATED_SINCE_DAYS):
    """
    Synchronize the local cache with the remote cache.

    ::

        >>> wsdl = 'https://mutalyzer.nl/mutalyzer/services/?wsdl'
        >>> template = 'https://mutalyzer.nl/mutalyzer/Reference/{file}'
        >>> self.sync_with_remote(wsdl, template)
        (14, 3)

    :arg remote_wsdl: The URL of the remote SOAP WSDL description.
    :type remote_wsdl: unicode
    :arg url_template: Formatting string containing a ``{file}``
        occurrence, see example usage above.
    :type url_template: unicode
    :arg days: Only remote entries added this number of days ago or later
        are considered.
    :type days: int

    :return: The number of entries added to the local cache and the number
        of cache files downloaded from the remote site.
    :rtype: tuple(int, int)
    """
    self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')

    created_since = datetime.today() - timedelta(days=days)
    remote_cache = self.remote_cache(remote_wsdl, created_since)

    inserted = downloaded = 0

    for entry in remote_cache:
        try:
            reference = Reference.query.filter_by(
                accession=entry['name']).one()
            if reference.checksum is not None:
                continue
        except NoResultFound:
            pass
        if Reference.query.filter_by(checksum=entry['hash']).count() > 0:
            continue
        reference = Reference(entry['name'], entry['hash'],
                              entry['source'],
                              source_data=entry['source_data'])
        session.add(reference)
        session.commit()
        inserted += 1
        if entry['source'] == 'upload' and entry['cached']:
            url = url_template.format(file=entry['cached'])
            self.store_remote_file(entry['name'], url)
            downloaded += 1

    self._output.addMessage(__file__, -1, 'INFO',
                            'Inserted %d entries in the cache,'
                            ' downloaded %d files.'
                            % (inserted, downloaded))
    self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')

    return inserted, downloaded
def uploadrecord(self, raw_data):
    """
    Write an uploaded record to a file. If the uploaded file is recognised
    by its hash, the old UD number is used.

    :arg str raw_data: A GenBank record.

    :returns: Accession number for the uploaded file.
    :rtype: unicode
    """
    md5sum = self._calculate_hash(raw_data)

    try:
        reference = Reference.query.filter_by(checksum=md5sum).one()
    except NoResultFound:
        ud = self._new_ud()
        if self.write(raw_data, ud, 0):
            reference = Reference(ud, md5sum, 'upload')
            session.add(reference)
            session.commit()
            return ud
    else:
        if os.path.isfile(self._name_to_file(reference.accession)):
            return reference.accession
        else:
            return (self.write(raw_data, reference.accession, 0) and
                    reference.accession)
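# The uploadrecord variants above deduplicate on a checksum of the raw
# record bytes. A minimal sketch of that idea, assuming the checksum is an
# MD5 hex digest (as the `md5sum` variable names suggest; `_sketch_dedup_key`
# is a hypothetical helper, for illustration only):
import hashlib

def _sketch_dedup_key(raw_data):
    """Return the cache key used to recognise a previously seen record."""
    return hashlib.md5(raw_data).hexdigest()

# Two byte-identical uploads map to the same key, so the original UD number
# can be reused instead of minting a new one:
# _sketch_dedup_key(b'LOCUS ...') == _sketch_dedup_key(b'LOCUS ...')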
def _update_db_md5(self, raw_data, name, gi):
    """
    :arg str raw_data:
    :arg unicode name:
    :arg unicode gi:

    :returns: filename
    :rtype: unicode
    """
    # TODO: Documentation.
    try:
        reference = Reference.query.filter_by(accession=name).one()
        current_md5sum = reference.checksum
    except NoResultFound:
        current_md5sum = None

    if current_md5sum:
        md5sum = self._calculate_hash(raw_data)
        if md5sum != current_md5sum:
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    name, current_md5sum, md5sum))
            Reference.query.filter_by(accession=name).update(
                {'checksum': md5sum})
            session.commit()
    else:
        reference = Reference(
            name, self._calculate_hash(raw_data), geninfo_identifier=gi)
        session.add(reference)
        session.commit()

    return self._name_to_file(name)
def downloadrecord(self, url):
    """
    Download a GenBank record from a URL. If the downloaded file is
    recognised by its hash, the old UD number is used.

    :arg unicode url: Location of a GenBank record.

    :returns: UD or None.
    :rtype: unicode
    """
    if not (url.startswith('http://') or url.startswith('https://')
            or url.startswith('ftp://')):
        self._output.addMessage(
            __file__, 4, 'ERECPARSE',
            'Only HTTP(S) or FTP locations are allowed.')
        return None

    handle = urllib2.urlopen(url)
    info = handle.info()
    if info.gettype() == 'text/plain':
        length = int(info['Content-Length'])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            md5sum = self._calculate_hash(raw_data)

            ud = None
            try:
                reference = Reference.query.filter_by(
                    checksum=md5sum).one()
            except NoResultFound:
                ud = self._new_ud()
                if not os.path.isfile(self._name_to_file(ud)):
                    ud = self.write(raw_data, ud, 0) and ud
                if ud:
                    # Parsing went OK, add to DB.
                    reference = Reference(ud, md5sum, source='url',
                                          source_data=url)
                    session.add(reference)
                    session.commit()
            else:
                if (os.path.isfile(self._name_to_file(reference.accession))
                        or self.write(raw_data, reference.accession, 0)):
                    ud = reference.accession

            # Returns the UD or None.
            return ud
        else:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
            return None
    else:
        self._output.addMessage(__file__, 4, 'ERECPARSE',
                                'This is not a GenBank record.')
        return None
def downloadrecord(self, url):
    """
    Download a GenBank record from a URL. If the downloaded file is
    recognised by its hash, the old UD number is used.

    :arg unicode url: Location of a GenBank record.

    :returns: UD or None.
    :rtype: unicode
    """
    if not (url.startswith('http://') or url.startswith('https://')
            or url.startswith('ftp://')):
        self._output.addMessage(
            __file__, 4, 'ERECPARSE',
            'Only HTTP(S) or FTP locations are allowed.')
        return None

    handle = urllib2.urlopen(url)
    info = handle.info()
    if info['Content-Type'] == 'text/plain':
        length = int(info['Content-Length'])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            md5sum = self._calculate_hash(raw_data)

            ud = None
            try:
                reference = Reference.query.filter_by(
                    checksum=md5sum).one()
            except NoResultFound:
                ud = self._new_ud()
                if not os.path.isfile(self._name_to_file(ud)):
                    ud = self.write(raw_data, ud, 0) and ud
                if ud:
                    # Parsing went OK, add to DB.
                    reference = Reference(ud, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
            else:
                if not os.path.isfile(
                        self._name_to_file(reference.accession)):
                    ud = (self.write(raw_data, reference.accession, 0) and
                          reference.accession)

            # Returns the UD or None.
            return ud
        else:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
            return None
    else:
        self._output.addMessage(
            __file__, 4, 'ERECPARSE',
            'This is not a GenBank record.')
        return None
def downloadrecord(self, url):
    """
    Download a GenBank record from a URL. If the downloaded file is
    recognised by its hash, the old UD number is used.

    @arg url: Location of a GenBank record
    @type url: unicode

    @return: UD or None
    @rtype: unicode
    """
    if not (url.startswith('http://') or url.startswith('https://')
            or url.startswith('ftp://')):
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "Only HTTP(S) or FTP locations are allowed.")
        return None

    handle = urllib2.urlopen(url)
    info = handle.info()
    if info["Content-Type"] == "text/plain":
        length = int(info["Content-Length"])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            md5sum = self._calcHash(raw_data)

            UD = None
            try:
                reference = Reference.query.filter_by(checksum=md5sum).one()
            except NoResultFound:
                UD = self._newUD()
                if not os.path.isfile(self._nametofile(UD)):
                    UD = self.write(raw_data, UD, 0) and UD
                if UD:
                    # Parsing went OK, add to DB.
                    reference = Reference(UD, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
            else:
                if not os.path.isfile(self._nametofile(reference.accession)):
                    UD = (self.write(raw_data, reference.accession, 0) and
                          reference.accession)
            return UD  # Returns the UD or None.
        else:
            self._output.addMessage(__file__, 4, "EFILESIZE",
                                    "Filesize is not within the allowed boundaries.")
            return None
    else:
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "This is not a GenBank record.")
        return None
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias, charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
          AND refGene.name = acc
          AND acc = mrnaAcc
          AND name2 = %s
    """
    parameters = gene,

    cursor = connection.cursor()
    cursor.execute(query, parameters)
    result = cursor.fetchall()
    cursor.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We
    # convert this to one-based, inclusive for our database.
    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
         exonEnds, geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
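# UCSC stores intervals zero-based and half-open, while the database above
# is one-based and inclusive: only the start moves by one. A minimal sketch
# of the conversion used in the loop above (`_sketch_ucsc_to_one_based` is a
# hypothetical helper, for illustration only):
def _sketch_ucsc_to_one_based(start, end):
    """Convert a UCSC [start, end) interval to one-based inclusive."""
    return start + 1, end

# Example: a UCSC exon (1000, 1100) covers bases 1001..1100 inclusive, so
# _sketch_ucsc_to_one_based(1000, 1100) == (1001, 1100).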
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    .. todo:: Also report how much was added/updated.

    .. note:: Currently no exon locations are supported, this has only
        been tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene.
        try:
            transcript = sorted(gene.transcriptList,
                                key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now, it is always present and the same
        # for our purposes.
        #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1]
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference',
            cds=cds, select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
def hg19():
    """
    Fixture for GRCh37/hg19 genome assembly with chromosomes.
    """
    assembly = Assembly('GRCh37', 9606, 'Homo sapiens', alias='hg19')
    session.add(assembly)
    session.add_all(Chromosome(assembly, name, accession, organelle)
                    for accession, name, organelle in [
                        ('NC_000001.10', 'chr1', 'nucleus'),
                        ('NC_000002.11', 'chr2', 'nucleus'),
                        ('NC_000003.11', 'chr3', 'nucleus'),
                        ('NC_000004.11', 'chr4', 'nucleus'),
                        ('NC_000005.9', 'chr5', 'nucleus'),
                        ('NC_000006.11', 'chr6', 'nucleus'),
                        ('NC_000007.13', 'chr7', 'nucleus'),
                        ('NC_000008.10', 'chr8', 'nucleus'),
                        ('NC_000009.11', 'chr9', 'nucleus'),
                        ('NC_000010.10', 'chr10', 'nucleus'),
                        ('NC_000011.9', 'chr11', 'nucleus'),
                        ('NC_000012.11', 'chr12', 'nucleus'),
                        ('NC_000013.10', 'chr13', 'nucleus'),
                        ('NC_000014.8', 'chr14', 'nucleus'),
                        ('NC_000015.9', 'chr15', 'nucleus'),
                        ('NC_000016.9', 'chr16', 'nucleus'),
                        ('NC_000017.10', 'chr17', 'nucleus'),
                        ('NC_000018.9', 'chr18', 'nucleus'),
                        ('NC_000019.9', 'chr19', 'nucleus'),
                        ('NC_000020.10', 'chr20', 'nucleus'),
                        ('NC_000021.8', 'chr21', 'nucleus'),
                        ('NC_000022.10', 'chr22', 'nucleus'),
                        ('NC_000023.10', 'chrX', 'nucleus'),
                        ('NC_000024.9', 'chrY', 'nucleus'),
                        ('NT_167244.1', 'chr6_apd_hap1', 'nucleus'),
                        ('NT_113891.2', 'chr6_cox_hap2', 'nucleus'),
                        ('NT_167245.1', 'chr6_dbb_hap3', 'nucleus'),
                        ('NT_167246.1', 'chr6_mann_hap4', 'nucleus'),
                        ('NT_167247.1', 'chr6_mcf_hap5', 'nucleus'),
                        ('NT_167248.1', 'chr6_qbl_hap6', 'nucleus'),
                        ('NT_167249.1', 'chr6_ssto_hap7', 'nucleus'),
                        ('NT_167250.1', 'chr4_ctg9_hap1', 'nucleus'),
                        ('NT_167251.1', 'chr17_ctg5_hap1', 'nucleus'),
                        ('NC_012920.1', 'chrM', 'mitochondrion')])
    session.commit()
def update_transcript_protein_link(transcript_accession,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if
    it doesn't exist yet.
    """
    link = TranscriptProteinLink.query \
        .filter_by(transcript_accession=transcript_accession) \
        .first()

    if link is not None:
        link.protein_accession = protein_accession
        link.added = datetime.now()
    else:
        link = TranscriptProteinLink(transcript_accession,
                                     protein_accession)
        session.add(link)

    session.commit()
def cache_with_references():
    for reference in references:
        entry = REFERENCES[reference]
        try:
            accession = entry['accession']
        except KeyError:
            accession = reference
        geninfo_id = entry.get('geninfo_id')
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'data', entry['filename'])
        shutil.copy(path, settings.CACHE_DIR)
        session.add(Reference(accession, entry['checksum'],
                              geninfo_identifier=geninfo_id))
        for transcript, protein in entry.get('links', []):
            session.add(TranscriptProteinLink(transcript, protein))
    session.commit()
def _updateDBmd5(self, raw_data, name, GI):
    #TODO documentation
    """
    @todo: documentation

    @arg raw_data:
    @type raw_data:
    @arg name:
    @type name:
    @arg GI:
    @type GI:

    @return: filename
    @rtype: unicode
    """
    try:
        reference = Reference.query.filter_by(accession=name).one()
        currentmd5sum = reference.checksum
    except NoResultFound:
        currentmd5sum = None

    if currentmd5sum:
        md5sum = self._calcHash(raw_data)
        if md5sum != currentmd5sum:
            self._output.addMessage(
                __file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                    name, currentmd5sum, md5sum))
            Reference.query.filter_by(accession=name).update(
                {'checksum': md5sum})
            session.commit()
    else:
        reference = Reference(name, self._calcHash(raw_data),
                              geninfo_identifier=GI)
        session.add(reference)
        session.commit()

    return self._nametofile(name)
def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`MapviewSortError` if `mapview_file` is not sorted this
    way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split in UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these ones should of
    course not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.

    All positions are one-based, inclusive, and that is what we also use
    in our database.
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(
                    c for c in chromosomes
                    if c.name == 'chr' + record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type.
        # This is generalized to a list of records for each type, but we
        # expect only one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript
        # if it is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = ('reverse'
                           if by_type['RNA'][0]['orientation'] == '-'
                           else 'forward')
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous
                    # exon. We have no idea what to do in this case, so
                    # we ignore it. The number of transcripts affected is
                    # very small (e.g., NM_031860.1 and NM_001184961.1 in
                    # the GRCh37 assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one
                    # because it is split over two entries (a CDS part
                    # and a UTR part or split over different reference
                    # contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the
            # entire transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene, orientation,
                start, stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)

    processed_keys = set()
    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by '
                                   'feature_id and chromosome (try '
                                   '`sort -k 11,11 -k 2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
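# The UTR/CDS merge rule in build_mappings above is subtle: entries that
# touch (previous stop == next start - 1) are two halves of one exon and get
# merged, while overlapping entries are dropped. A standalone sketch over
# plain (start, stop, type) tuples, with the CDS bookkeeping omitted
# (`_sketch_merge_exons` is a hypothetical helper, for illustration only):
from operator import itemgetter

def _sketch_merge_exons(entries):
    """Merge adjacent UTR/CDS entries into exons; drop overlapping ones."""
    starts, stops = [], []
    for start, stop, _type in sorted(entries, key=itemgetter(0)):
        if stops and stops[-1] > start - 1:
            continue  # Overlaps the previous exon: ignore.
        if stops and stops[-1] == start - 1:
            stops[-1] = stop  # Touches the previous exon: merge.
        else:
            starts.append(start)
            stops.append(stop)
    return starts, stops

# Example: a UTR entry (100, 149) followed by a CDS entry (150, 200) is one
# exon: _sketch_merge_exons([(100, 149, 'UTR'), (150, 200, 'CDS')]) returns
# ([100], [200]).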
def retrieveslice(self, accno, start, stop, orientation):
    """
    Retrieve a slice of a chromosome.

    If the arguments are recognised (found in the internal database), we
    look if the associated file is still present and if so: return its UD
    number. If the arguments are recognised but no file was found, we
    download the new slice and update the hash (and log if the hash
    changes). If the arguments are not recognised, we download the new
    slice and make a new UD number.

    The content of the slice is placed in the cache with the UD number as
    filename.

    @arg accno: The accession number of the chromosome
    @type accno: unicode
    @arg start: Start position of the slice
    @type start: integer
    @arg stop: End position of the slice.
    @type stop: integer
    @arg orientation: Orientation of the slice:
        - 1 ; Forward
        - 2 ; Reverse complement
    @type orientation: integer

    @return: An UD number
    @rtype: unicode
    """
    # Not a valid slice.
    if start >= stop:
        return None

    # The slice can not be too big.
    if stop - start > settings.MAX_FILE_SIZE:
        return None

    slice_orientation = ['forward', 'reverse'][orientation - 1]

    # Check whether we have seen this slice before.
    try:
        reference = Reference.query.filter_by(
            slice_accession=accno, slice_start=start, slice_stop=stop,
            slice_orientation=slice_orientation).one()
    except NoResultFound:
        reference = None
    else:
        if os.path.isfile(self._nametofile(reference.accession)):
            # It's still present.
            return reference.accession

    # It's not present, so download it.
    try:
        handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                               id=accno, seq_start=start, seq_stop=stop,
                               strand=orientation)
        raw_data = handle.read()
        handle.close()
    except (IOError, urllib2.HTTPError, HTTPException) as e:
        self._output.addMessage(
            __file__, -1, 'INFO',
            'Error connecting to Entrez nuccore database: %s' % unicode(e))
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice.')
        return None

    # Calculate the hash of the downloaded file.
    md5sum = self._calcHash(raw_data)

    if reference is not None:
        # We have seen this one before.
        currentmd5sum = reference.checksum
        if md5sum != currentmd5sum:
            self._output.addMessage(
                __file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                    reference.accession, currentmd5sum, md5sum))
            Reference.query.filter_by(
                accession=reference.accession).update(
                    {'checksum': md5sum})
            session.commit()
    else:
        # We haven't seen it before, so give it a name.
        UD = self._newUD()
        slice_orientation = ['forward', 'reverse'][orientation - 1]
        reference = Reference(UD, md5sum, slice_accession=accno,
                              slice_start=start, slice_stop=stop,
                              slice_orientation=slice_orientation)
        session.add(reference)
        session.commit()

    if self.write(raw_data, reference.accession, 0):
        return reference.accession
def hg19_transcript_mappings():
    """
    Fixture for some selected transcript mappings in the GRCh37/hg19
    genome assembly. Depends on the :func:`hg19` fixture.
    """
    chromosome_1 = Chromosome.query.filter_by(accession='NC_000001.10').one()
    chromosome_3 = Chromosome.query.filter_by(accession='NC_000003.11').one()
    chromosome_6 = Chromosome.query.filter_by(accession='NC_000006.11').one()
    chromosome_7 = Chromosome.query.filter_by(accession='NC_000007.13').one()
    chromosome_8 = Chromosome.query.filter_by(accession='NC_000008.10').one()
    chromosome_11 = Chromosome.query.filter_by(accession='NC_000011.9').one()
    chromosome_20 = Chromosome.query.filter_by(accession='NC_000020.10').one()
    chromosome_22 = Chromosome.query.filter_by(accession='NC_000022.10').one()
    chromosome_x = Chromosome.query.filter_by(accession='NC_000023.10').one()
    chromosome_mt = Chromosome.query.filter_by(accession='NC_012920.1').one()

    session.add_all([chromosome_1, chromosome_6, chromosome_8,
                     chromosome_11, chromosome_20, chromosome_22,
                     chromosome_mt])

    session.add(TranscriptMapping(
        chromosome_11, 'refseq', 'NM_003002', 'SDHD', 'forward',
        111957571, 111966518,
        [111957571, 111958581, 111959591, 111965529],
        [111957683, 111958697, 111959735, 111966518],
        'ncbi', transcript=1, cds=(111957632, 111965694),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_11, 'refseq', 'NM_012459', 'TIMM8B', 'reverse',
        111955524, 111957522,
        [111955524, 111957364],
        [111956186, 111957522],
        'ncbi', transcript=1, cds=(111956019, 111957492),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_11, 'refseq', 'NR_028383', 'TIMM8B', 'reverse',
        111955524, 111957522,
        [111955524, 111956702, 111957364],
        [111956186, 111957034, 111957522],
        'ncbi', transcript=1, cds=None,
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_6, 'refseq', 'NM_000500', 'CYP21A2', 'forward',
        32006082, 32009419,
        [32006082, 32006499, 32006871, 32007133, 32007323, 32007526,
         32007782, 32008183, 32008445, 32008646],
        [32006401, 32006588, 32007025, 32007234, 32007424, 32007612,
         32007982, 32008361, 32008548, 32009419],
        'ncbi', transcript=1, cds=(32006200, 32008911),
        select_transcript=False, version=5))

    session.add(TranscriptMapping(
        chromosome_22, 'refseq', 'NM_001145134', 'CPT1B', 'reverse',
        51007290, 51017096,
        [51007290, 51007765, 51008005, 51008722, 51009320, 51009587,
         51009804, 51010435, 51010632, 51011304, 51011949, 51012764,
         51012922, 51014464, 51014627, 51015286, 51015753, 51016204,
         51016978],
        [51007510, 51007850, 51008097, 51008835, 51009472, 51009721,
         51009968, 51010551, 51010737, 51011489, 51012144, 51012848,
         51013029, 51014541, 51014764, 51015463, 51015892, 51016363,
         51017096],
        'ncbi', transcript=1, cds=(51007767, 51016344),
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_22, 'refseq', 'NR_021492', 'LOC100144603', 'forward',
        51021455, 51022356,
        [51021455, 51022027],
        [51021752, 51022356],
        'ncbi', transcript=1, cds=None,
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_1, 'refseq', 'NM_001007553', 'CSDE1', 'reverse',
        115259538, 115300624,
        [115259538, 115261234, 115262200, 115263160, 115266504,
         115267842, 115268832, 115269604, 115272879, 115273129,
         115275225, 115276353, 115276610, 115277063, 115279379,
         115280092, 115280584, 115282313, 115292442, 115300546],
        [115260837, 115261366, 115262363, 115263338, 115266623,
         115267954, 115269007, 115269711, 115273043, 115273269,
         115275437, 115276478, 115276738, 115277144, 115279476,
         115280184, 115280693, 115282511, 115292828, 115300624],
        'ncbi', transcript=1, cds=(115260790, 115282511),
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_1, 'refseq', 'NM_001130523', 'CSDE1', 'reverse',
        115259538, 115300671,
        [115259538, 115261234, 115262200, 115263160, 115266504,
         115267842, 115268832, 115269604, 115272879, 115273129,
         115275225, 115276353, 115276610, 115277063, 115279379,
         115280584, 115282313, 115284148, 115292442, 115300546],
        [115260837, 115261366, 115262363, 115263338, 115266623,
         115267954, 115269007, 115269711, 115273043, 115273269,
         115275437, 115276478, 115276738, 115277144, 115279476,
         115280693, 115282511, 115284294, 115292828, 115300671],
        'ncbi', transcript=1, cds=(115260790, 115284285),
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_1, 'refseq', 'NM_002241', 'KCNJ10', 'reverse',
        160007257, 160040051,
        [160007257, 160039812],
        [160012322, 160040051],
        'ncbi', transcript=1, cds=(160011183, 160012322),
        select_transcript=False, version=4))

    session.add(TranscriptMapping(
        chromosome_20, 'refseq', 'NM_001162505', 'TMEM189', 'reverse',
        48740274, 48770335,
        [48740274, 48744512, 48746083, 48747402, 48760039, 48770054],
        [48741716, 48744724, 48746227, 48747484, 48760158, 48770335],
        'ncbi', transcript=1, cds=(48741595, 48770174),
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_8, 'refseq', 'NM_017780', 'CHD7', 'forward',
        61591339, 61779465,
        [61591339, 61653818, 61693559, 61707545, 61712947, 61714087,
         61720776, 61728946, 61732566, 61734349, 61734583, 61735062,
         61736399, 61741222, 61742881, 61748632, 61749376, 61750227,
         61750635, 61754203, 61754406, 61757423, 61757809, 61761074,
         61761610, 61763052, 61763591, 61763821, 61764578, 61765057,
         61765388, 61766922, 61768534, 61769004, 61773463, 61774755,
         61775107, 61777575],
        [61591641, 61655656, 61693989, 61707686, 61713084, 61714152,
         61720831, 61729060, 61732649, 61734486, 61734704, 61735305,
         61736575, 61741365, 61743136, 61748842, 61749571, 61750394,
         61750814, 61754313, 61754611, 61757622, 61757968, 61761163,
         61761713, 61763181, 61763663, 61763878, 61764806, 61765265,
         61766059, 61767082, 61768761, 61769447, 61773684, 61774895,
         61775211, 61779465],
        'ncbi', transcript=1, cds=(61653992, 61778492),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_mt, 'refseq', 'NC_012920', 'ND4', 'forward',
        10760, 12137,
        [10760], [12137],
        'reference', transcript=1, cds=(10760, 12137),
        select_transcript=True, version=1))

    session.add(TranscriptMapping(
        chromosome_1, 'refseq', 'NM_002001', 'FCER1A', 'forward',
        159259504, 159278014,
        [159259504, 159272096, 159272644, 159273718, 159275778,
         159277538],
        [159259543, 159272209, 159272664, 159273972, 159276035,
         159278014],
        'ncbi', transcript=1, cds=(159272155, 159277722),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_7, 'refseq', 'XM_001715131', 'LOC100132858', 'reverse',
        19828, 36378,
        [19828, 20834, 31060, 32957, 35335, 36224],
        [19895, 21029, 31437, 33107, 35541, 36378],
        'ncbi', transcript=1, cds=(19828, 36378),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_x, 'refseq', 'NM_004011', 'DMD', 'reverse',
        31137345, 32430371,
        [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
         31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
         31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
         31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
         31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
         31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
         32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
         32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
         32408188, 32429869, 32430279],
        [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
         31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
         31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
         31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
         31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
         31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
         32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
         32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
         32408298, 32430030, 32430371],
        'ncbi', transcript=1, cds=(31140036, 32430326),
        select_transcript=False, version=3))

    session.add(TranscriptMapping(
        chromosome_x, 'refseq', 'NM_004019', 'DMD', 'reverse',
        31196312, 31285024,
        [31196312, 31198487, 31200855, 31222078, 31224699, 31227615,
         31241164, 31279072, 31284927],
        [31196922, 31198598, 31201021, 31222235, 31224784, 31227816,
         31241238, 31279133, 31285024],
        'ncbi', transcript=1, cds=(31196782, 31284946),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_x, 'refseq', 'NM_004007', 'DMD', 'reverse',
        31137345, 33038317,
        [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
         31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
         31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
         31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
         31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
         31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
         32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
         32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
         32408188, 32429869, 32456358, 32459297, 32466573, 32472779,
         32481556, 32482703, 32486615, 32490281, 32503036, 32509394,
         32519872, 32536125, 32563276, 32583819, 32591647, 32591862,
         32613874, 32632420, 32662249, 32663081, 32715987, 32717229,
         32827610, 32834585, 32841412, 32862900, 32867845, 33038256],
        [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
         31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
         31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
         31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
         31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
         31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
         32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
         32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
         32408298, 32430030, 32456507, 32459431, 32466755, 32472949,
         32481711, 32482816, 32486827, 32490426, 32503216, 32509635,
         32519959, 32536248, 32563451, 32583998, 32591754, 32591963,
         32613993, 32632570, 32662430, 32663269, 32716115, 32717410,
         32827728, 32834757, 32841504, 32862977, 32867937, 33038317],
        'ncbi', transcript=1, cds=(31140036, 32834745),
        select_transcript=False, version=2))

    session.add(TranscriptMapping(
        chromosome_x, 'refseq', 'NM_203473', 'PORCN', 'forward',
        48367371, 48379202,
        [48367371, 48368172, 48369683, 48370280, 48370714, 48370977,
         48371223, 48372628, 48372913, 48374105, 48374278, 48374449,
         48375571, 48378763],
        [48367491, 48368344, 48369875, 48370323, 48370895, 48371107,
         48371240, 48372753, 48373013, 48374181, 48374341, 48374534,
         48375681, 48379202],
        'ncbi', transcript=1, cds=(48368209, 48378864),
        select_transcript=False, version=1))

    session.add(TranscriptMapping(
        chromosome_x, 'refseq', 'NM_000132', 'F8', 'reverse',
        154064063, 154250998,
        [154064063, 154088707, 154089993, 154091358, 154124352,
         154128141, 154129646, 154130326, 154132181, 154132571,
         154133086, 154134695, 154156846, 154175973, 154182167,
         154185232, 154189350, 154194245, 154194701, 154197606,
         154212962, 154215512, 154221211, 154225248, 154227754,
         154250685],
        [154066027, 154088883, 154090141, 154091502, 154124507,
         154128226, 154129717, 154130442, 154132363, 154132799,
         154133298, 154134848, 154159951, 154176182, 154182317,
         154185446, 154189443, 154194416, 154194962, 154197827,
         154213078, 154215580, 154221423, 154225370, 154227875,
         154250998],
        'ncbi', transcript=1, cds=(154065872, 154250827),
        select_transcript=False, version=3))

    session.add(TranscriptMapping(
        chromosome_3, 'refseq', 'NM_000249', 'MLH1', 'forward',
        37034841, 37092337,
        [37034841, 37038110, 37042446, 37045892, 37048482, 37050305,
         37053311, 37053502, 37055923, 37058997, 37061801, 37067128,
         37070275, 37081677, 37083759, 37089010, 37090008, 37090395,
         37091977],
        [37035154, 37038200, 37042544, 37045965, 37048554, 37050396,
         37053353, 37053590, 37056035, 37059090, 37061954, 37067498,
         37070423, 37081785, 37083822, 37089174, 37090100, 37090508,
         37092337],
        'ncbi', transcript=1, cds=(37035039, 37092144),
        select_transcript=False, version=3))

    session.commit()
def downloadrecord(self, url, name=None):
    """
    Download an LRG record from a URL.

    :arg unicode url: Location of the LRG record.

    :returns: The full path to the file or None in case of failure.
    :rtype: unicode
    """
    lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
    # if not lrg_id.startswith('LRG'):
    #     return None
    filename = self._name_to_file(lrg_id)

    # TODO: Properly read the file contents to a unicode string and write
    # it utf-8 encoded.
    handle = urllib2.urlopen(url)
    info = handle.info()
    if (info['Content-Type'] == 'application/xml'
            and 'Content-length' in info):
        # Looks like a valid LRG file.
        length = int(info['Content-Length'])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()

            # Do an md5 check.
            md5sum = self._calculate_hash(raw_data)
            try:
                reference = Reference.query.filter_by(
                    accession=lrg_id).one()
                md5_db = reference.checksum
            except NoResultFound:
                md5_db = None

            if md5_db is None:
                reference = Reference(lrg_id, md5sum, download_url=url)
                session.add(reference)
                session.commit()
            elif md5sum != md5_db:
                # Hash has changed for the LRG ID.
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        lrg_id, md5_db, md5sum))
                Reference.query.filter_by(accession=lrg_id).update(
                    {'checksum': md5sum})
                session.commit()
            else:
                # Hash is the same as in the database.
                pass

            if not os.path.isfile(filename):
                return self.write(raw_data, lrg_id)
            else:
                # This can only occur if synchronous calls to mutalyzer
                # are made to recover a file that did not exist. Still
                # leaves a window in between the check and the write.
                return filename
        else:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
    else:
        self._output.addMessage(
            __file__, 4, 'ERECPARSE',
            'This is not an LRG record.')
    handle.close()
def import_from_lrgmap_file(assembly, lrgmap_file):
    """
    Import transcript mappings from an EBI LRG transcripts map file.

    All positions are one-based, inclusive, and that is what we also use
    in our database.
    """
    columns = ['transcript', 'gene', 'chromosome', 'strand', 'start',
               'stop', 'exons', 'protein', 'cds_start', 'cds_stop']

    chromosomes = assembly.chromosomes.all()

    def read_mappings(lrgmap_file):
        for line in lrgmap_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip('\r\n').split('\t')))

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])
            try:
                record['cds_start'] = int(record['cds_start'])
            except ValueError:
                record['cds_start'] = None
            try:
                record['cds_stop'] = int(record['cds_stop'])
            except ValueError:
                record['cds_stop'] = None
            record['exons'] = [[int(pos) for pos in exon.split('-')]
                               for exon in record['exons'].split(',')]

            try:
                yield build_mapping(record)
            except ValueError:
                pass

    def build_mapping(record):
        # Only use records on chromosomes we know.
        try:
            chromosome = next(c for c in chromosomes
                              if c.name == 'chr' + record['chromosome'])
        except StopIteration:
            raise ValueError()

        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)
        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321
        # (TP53) some transcripts occur twice (with different CDSs and
        # different protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome, 'lrg', accession, record['gene'], orientation,
            record['start'], record['stop'],
            [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi', transcript=transcript, cds=cds, select_transcript=True)

    for mapping in read_mappings(lrgmap_file):
        session.add(mapping)
    session.commit()
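# build_mapping above splits an LRG transcript name such as 'LRG_321t1' on
# 't' into the record accession and a transcript number. A standalone sketch
# of that parsing step (`_sketch_split_lrg_transcript` is a hypothetical
# helper, for illustration only):
def _sketch_split_lrg_transcript(name):
    """Split an LRG transcript name into (accession, transcript number)."""
    accession, transcript = name.split('t')
    return accession, int(transcript)

# Example: _sketch_split_lrg_transcript('LRG_321t1') == ('LRG_321', 1).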
def downloadrecord(self, url, name=None):
    """
    Download an LRG record from a URL.

    :arg unicode url: Location of the LRG record.

    :returns: The full path to the file or None in case of failure.
    :rtype: unicode
    """
    lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
    # if not lrg_id.startswith('LRG'):
    #     return None
    filename = self._name_to_file(lrg_id)

    # TODO: Properly read the file contents to a unicode string and write
    # it utf-8 encoded.
    handle = urllib2.urlopen(url)
    info = handle.info()
    if (info['Content-Type'] == 'application/xml'
            and 'Content-length' in info):
        # Looks like a valid LRG file.
        length = int(info['Content-Length'])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()

            # Do an md5 check.
            md5sum = self._calculate_hash(raw_data)
            try:
                reference = Reference.query.filter_by(
                    accession=lrg_id).one()
                md5_db = reference.checksum
            except NoResultFound:
                md5_db = None

            if md5_db is None:
                # Note: The abstraction seems a bit off here, but we
                # prefer to set `Reference.source` to `lrg` and not to
                # `url`, since the former is more specific.
                reference = Reference(lrg_id, md5sum, 'lrg')
                session.add(reference)
                session.commit()
            elif md5sum != md5_db:
                # Hash has changed for the LRG ID.
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        lrg_id, md5_db, md5sum))
                Reference.query.filter_by(accession=lrg_id).update(
                    {'checksum': md5sum})
                session.commit()
            else:
                # Hash is the same as in the database.
                pass

            if not os.path.isfile(filename):
                return self.write(raw_data, lrg_id)
            else:
                # This can only occur if synchronous calls to mutalyzer
                # are made to recover a file that did not exist. Still
                # leaves a window in between the check and the write.
                return filename
        else:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
    else:
        self._output.addMessage(__file__, 4, 'ERECPARSE',
                                'This is not an LRG record.')
    handle.close()
def downloadrecord(self, url, name=None):
    """
    Download an LRG record from a URL.

    @arg url: Location of the LRG record
    @type url: unicode

    @return:
        - filename ; The full path to the file
        - None ; in case of failure
    @rtype: unicode
    """
    lrgID = name or os.path.splitext(os.path.split(url)[1])[0]
    #if not lrgID.startswith("LRG"):
    #    return None
    filename = self._nametofile(lrgID)

    # Todo: Properly read the file contents to a unicode string and write
    # it utf-8 encoded.
    handle = urllib2.urlopen(url)
    info = handle.info()
    if (info["Content-Type"] == "application/xml"
            and info.has_key("Content-length")):
        length = int(info["Content-Length"])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()

            # Do an md5 check.
            md5sum = self._calcHash(raw_data)
            try:
                reference = Reference.query.filter_by(accession=lrgID).one()
                md5db = reference.checksum
            except NoResultFound:
                md5db = None

            if md5db is None:
                reference = Reference(lrgID, md5sum, download_url=url)
                session.add(reference)
                session.commit()
            elif md5sum != md5db:
                # Hash has changed for the LRG ID.
                self._output.addMessage(
                    __file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                        lrgID, md5db, md5sum))
                Reference.query.filter_by(accession=lrgID).update(
                    {'checksum': md5sum})
                session.commit()
            else:
                # Hash is the same as in the database.
                pass

            if not os.path.isfile(filename):
                return self.write(raw_data, lrgID)
            else:
                # This can only occur if synchronous calls to mutalyzer
                # are made to recover a file that did not exist. Still
                # leaves a window in between the check and the write.
                return filename
        else:
            self._output.addMessage(__file__, 4, "EFILESIZE",
                                    "Filesize is not within the allowed boundaries.")
    else:
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "This is not an LRG record.")
    handle.close()
def retrieveslice(self, accno, start, stop, orientation):
    """
    Retrieve a slice of a chromosome.

    If the arguments are recognised (found in the internal database), we
    look if the associated file is still present and if so: return its UD
    number. If the arguments are recognised but no file was found, we
    download the new slice and update the hash (and log if the hash
    changes). If the arguments are not recognised, we download the new
    slice and make a new UD number.

    The content of the slice is placed in the cache with the UD number as
    filename.

    :arg unicode accno: The accession number of the chromosome.
    :arg int start: Start position of the slice (one-based, inclusive, in
        reference orientation).
    :arg int stop: End position of the slice (one-based, inclusive, in
        reference orientation).
    :arg int orientation: Orientation of the slice:
        - 1 ; Forward.
        - 2 ; Reverse complement.

    :returns: An UD number.
    :rtype: unicode
    """
    # Not a valid slice.
    if start > stop:
        self._output.addMessage(
            __file__, 4, 'ERETR', 'Could not retrieve slice for start '
            'position greater than stop position.')
        return None

    # The slice can not be too big.
    if stop - start + 1 > settings.MAX_FILE_SIZE:
        self._output.addMessage(
            __file__, 4, 'ERETR', 'Could not retrieve slice (request '
            'exceeds maximum of %d bases)' % settings.MAX_FILE_SIZE)
        return None

    # Value of the Reference.source_data field for this slice.
    source_data = '{}:{}:{}:{}'.format(
        accno, start, stop, ['forward', 'reverse'][orientation - 1])

    # Check whether we have seen this slice before.
    reference = Reference.query.filter_by(
        source='ncbi_slice', source_data=source_data).first()

    if reference and os.path.isfile(
            self._name_to_file(reference.accession)):
        # It's still present.
        return reference.accession

    # It's not present, so download it.
    try:
        # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
        # in reference orientation.
        handle = Entrez.efetch(db='nuccore', rettype='gbwithparts',
                               retmode='text', id=accno, seq_start=start,
                               seq_stop=stop, strand=orientation)
        raw_data = handle.read()
        handle.close()
    except (IOError, urllib2.HTTPError, HTTPException) as e:
        self._output.addMessage(
            __file__, -1, 'INFO',
            'Error connecting to Entrez nuccore database: {}'.format(
                unicode(e)))
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice.')
        return None

    # Calculate the hash of the downloaded file.
    md5sum = self._calculate_hash(raw_data)

    if reference is not None:
        # We have seen this one before.
        current_md5sum = reference.checksum
        if md5sum != current_md5sum:
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    reference.accession, current_md5sum, md5sum))
            Reference.query.filter_by(
                accession=reference.accession).update(
                    {'checksum': md5sum})
            session.commit()
    else:
        # We haven't seen it before, so give it a name.
        ud = self._new_ud()
        reference = Reference(ud, md5sum, source='ncbi_slice',
                              source_data=source_data)
        session.add(reference)
        session.commit()

    if self.write(raw_data, reference.accession, 0):
        return reference.accession
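# The new-style retrieveslice above encodes the whole slice request as a
# single Reference.source_data key, which is what makes the lookup a plain
# filter_by instead of a four-column query. A standalone sketch of that
# encoding (`_sketch_slice_key` is a hypothetical helper, for illustration
# only):
def _sketch_slice_key(accno, start, stop, orientation):
    """Build the 'accession:start:stop:orientation' source_data value."""
    return '{}:{}:{}:{}'.format(
        accno, start, stop, ['forward', 'reverse'][orientation - 1])

# Example: _sketch_slice_key('NC_000011.9', 111955524, 111966518, 1)
# gives 'NC_000011.9:111955524:111966518:forward'.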
154091358, 154124352, 154128141, 154129646, 154130326, 154132181, 154132571, 154133086, 154134695, 154156846, 154175973, 154182167, 154185232, 154189350, 154194245, 154194701, 154197606, 154212962, 154215512, 154221211, 154225248, 154227754, 154250685 ], [ 154066027, 154088883, 154090141, 154091502, 154124507, 154128226, 154129717, 154130442, 154132363, 154132799, 154133298, 154134848, 154159951, 154176182, 154182317, 154185446, 154189443, 154194416, 154194962, 154197827, 154213078, 154215580, 154221423, 154225370, 154227875, 154250998 ], 'ncbi', transcript=1, cds=(154065872, 154250827), select_transcript=False, version=3)) session.add( TranscriptMapping(chromosome_3, 'refseq', 'NM_000249', 'MLH1', 'forward', 37034841, 37092337, [ 37034841, 37038110, 37042446, 37045892, 37048482, 37050305, 37053311, 37053502, 37055923, 37058997, 37061801, 37067128, 37070275, 37081677, 37083759, 37089010, 37090008, 37090395, 37091977 ], [ 37035154, 37038200, 37042544, 37045965, 37048554, 37050396, 37053353, 37053590, 37056035, 37059090, 37061954, 37067498, 37070423, 37081785, 37083822, 37089174, 37090100, 37090508, 37092337 ], 'ncbi', transcript=1, cds=(37035039, 37092144), select_transcript=False, version=3)) session.commit()
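# Hedged usage sketch (not from the original source): how a test might
# consume the fixture above. This assumes `hg19_transcript_mappings` is
# registered as a pytest fixture and that `TranscriptMapping` exposes
# attributes matching its constructor arguments (`accession`, `gene`,
# `orientation`); the test name and assertions are hypothetical.
def test_sdhd_mapping_is_loaded(hg19_transcript_mappings):
    # The NM_003002 (SDHD) mapping added by the fixture should be queryable.
    mapping = TranscriptMapping.query.filter_by(accession='NM_003002').one()
    assert mapping.gene == 'SDHD'
    assert mapping.orientation == 'forward'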
def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`MapviewSortError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically, by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):

    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split into UTR exons and CDS
    exons, with an exon overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g., in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these should of course
    not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (this seems to happen
    for predicted genes), we generate one exon spanning the entire
    transcript.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(
                    c for c in chromosomes
                    if c.name == 'chr' + record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This
        # is generalized to a list of records for each type, but we expect
        # only one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if
        # it is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = ('reverse'
                           if by_type['RNA'][0]['orientation'] == '-'
                           else 'forward')
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous exon.
                    # We have no idea what to do in this case, so we ignore
                    # it. The number of transcripts affected is very small
                    # (e.g., NM_031860.1 and NM_001184961.1 in the GRCh37
                    # assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one because
                    # it is split over two entries (a CDS part and a UTR
                    # part, or split over different reference contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene, orientation, start,
                stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)

    processed_keys = set()
    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by '
                                   'feature_id and chromosome (try '
                                   '`sort -k 11,11 -k 2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
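# Hedged usage sketch (not from the original source): importing mappings for
# GRCh37 from a pre-sorted mapview file. Looking up `Assembly` by name, the
# file name, and the group label value are assumptions for illustration only.
def example_import_grch37_mappings():
    assembly = Assembly.query.filter_by(name='GRCh37').one()
    # The file must be sorted as described in the docstring above.
    with open('seq_gene.by_gene.md') as mapview_file:
        import_from_mapview_file(assembly, mapview_file, 'GRCh37.p2')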
def retrieveslice(self, accno, start, stop, orientation):
    """
    Retrieve a slice of a chromosome.

    If the arguments are recognised (found in the internal database), we
    check whether the associated file is still present and, if so, return
    its UD number. If the arguments are recognised but no file was found,
    we download the new slice and update the hash (and log if the hash
    changed). If the arguments are not recognised, we download the new
    slice and make a new UD number.

    The content of the slice is placed in the cache with the UD number as
    filename.

    :arg unicode accno: The accession number of the chromosome.
    :arg int start: Start position of the slice (one-based, inclusive, in
      reference orientation).
    :arg int stop: End position of the slice (one-based, inclusive, in
      reference orientation).
    :arg int orientation: Orientation of the slice:
      - 1 ; Forward.
      - 2 ; Reverse complement.

    :returns: A UD number.
    :rtype: unicode
    """
    # Not a valid slice.
    if start > stop:
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice for start '
                                'position greater than stop position.')
        return None

    # The slice cannot be too large.
    if stop - start + 1 > settings.MAX_FILE_SIZE:
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice (request '
                                'exceeds maximum of %d bases)' %
                                settings.MAX_FILE_SIZE)
        return None

    slice_orientation = ['forward', 'reverse'][orientation - 1]

    # Check whether we have seen this slice before.
    try:
        reference = Reference.query.filter_by(
            slice_accession=accno, slice_start=start, slice_stop=stop,
            slice_orientation=slice_orientation).one()
    except NoResultFound:
        reference = None
    else:
        if os.path.isfile(self._name_to_file(reference.accession)):
            # It's still present.
            return reference.accession

    # It's not present, so download it.
    try:
        # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
        # in reference orientation.
        handle = Entrez.efetch(
            db='nuccore', rettype='gb', retmode='text', id=accno,
            seq_start=start, seq_stop=stop, strand=orientation)
        raw_data = handle.read()
        handle.close()
    except (IOError, urllib2.HTTPError, HTTPException) as e:
        self._output.addMessage(
            __file__, -1, 'INFO',
            'Error connecting to Entrez nuccore database: {}'.format(
                unicode(e)))
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice.')
        return None

    # Calculate the hash of the downloaded file.
    md5sum = self._calculate_hash(raw_data)

    if reference is not None:
        # We have seen this one before.
        current_md5sum = reference.checksum

        if md5sum != current_md5sum:
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    reference.accession, current_md5sum, md5sum))
            Reference.query.filter_by(accession=reference.accession).update(
                {'checksum': md5sum})
            session.commit()
    else:
        # We haven't seen it before, so give it a name.
        ud = self._new_ud()
        reference = Reference(
            ud, md5sum, slice_accession=accno, slice_start=start,
            slice_stop=stop, slice_orientation=slice_orientation)
        session.add(reference)
        session.commit()

    if self.write(raw_data, reference.accession, 0):
        return reference.accession
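# Hedged usage sketch (not from the original source): retrieving the forward
# strand slice covering the SDHD region used in the fixture above. `cache`
# stands for an instance of the retriever class that defines `retrieveslice`
# and is hypothetical here.
def example_retrieve_sdhd_slice(cache):
    # Returns the UD accession under which the slice is cached, or None on
    # error (errors are reported through the configured output object).
    return cache.retrieveslice('NC_000011.9', 111957571, 111966518, 1)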