class NCBI(Base):
    """ETL class for the NCBI gene source."""

    def __init__(self, database: Database, host='ftp.ncbi.nlm.nih.gov',
                 data_dir='gene/DATA/',
                 src_data_dir=APP_ROOT / 'data' / 'ncbi'):
        """Construct the NCBI ETL instance.

        :param Database database: gene database for adding new data
        :param str host: FTP host name
        :param str data_dir: FTP data directory to use
        :param Path src_data_dir: Data directory for NCBI
        """
        super().__init__(database, host, data_dir, src_data_dir)
        self._sequence_location = SequenceLocation()
        self._chromosome_location = ChromosomeLocation()
        self._data_url = f"ftp://{host}"
        self._assembly = None
        self._date_today = datetime.today().strftime('%Y%m%d')

    def perform_etl(self):
        """Perform ETL methods.

        :return: Concept IDs of concepts successfully loaded
        """
        self._extract_data()
        self._transform_data()
        self._database.flush_batch()
        return self._processed_ids

    def _download_data(self):
        """Download NCBI info, history, and GRCh38 gff files."""
        # Download info
        data_dir = f'{self._data_dir}GENE_INFO/Mammalia/'
        fn = f'ncbi_info_{self._date_today}.tsv'
        data_fn = 'Homo_sapiens.gene_info.gz'
        logger.info('Downloading NCBI gene_info....')
        self._ftp_download(self._host, data_dir, fn, self.src_data_dir,
                           data_fn)
        logger.info('Successfully downloaded NCBI gene_info.')

        # Download history
        fn = f'ncbi_history_{self._date_today}.tsv'
        data_fn = 'gene_history.gz'
        logger.info('Downloading NCBI gene_history...')
        self._ftp_download(self._host, self._data_dir, fn, self.src_data_dir,
                           data_fn)
        logger.info('Successfully downloaded NCBI gene_history.')

        # Download gff
        self._download_gff()

    def _download_gff(self) -> None:
        """Download latest gff data for the most recent assembly version."""
        regex_pattern = \
            r"GCF_\d+\.\d+_(?P<assembly>GRCh\d+\.\S+)_genomic.gff.gz"
        regex = re.compile(regex_pattern)
        with FTP(self._host) as ftp:
            ftp.login()
            ftp.cwd("genomes/refseq/vertebrate_mammalian/Homo_sapiens/"
                    "latest_assembly_versions")
            # Renamed from `dir` to avoid shadowing the builtin
            latest_dir = ftp.nlst()[0]
            ftp.cwd(latest_dir)
            for f in ftp.nlst():
                match = regex.match(f)
                if match:
                    resp = match.groupdict()
                    self._assembly = resp["assembly"]
                    new_fn = f"ncbi_{self._assembly}.gff"
                    if not (self.src_data_dir / new_fn).exists():
                        self._ftp_download_file(ftp, f, self.src_data_dir,
                                                new_fn)
                        logger.info(
                            f"Successfully downloaded NCBI {f} data.")
                    else:
                        logger.info(f"NCBI {f} already exists.")
                    break

    def _files_downloaded(self, data_dir: Path) -> bool:
        """Check whether needed source files exist.

        :param Path data_dir: source data directory
        :return: true if all needed files exist, false otherwise
        """
        info_downloaded: bool = False
        history_downloaded: bool = False
        gff_downloaded: bool = False
        for f in data_dir.iterdir():
            if f.name.startswith(f'ncbi_info_{self._date_today}'):
                info_downloaded = True
            elif f.name.startswith(f'ncbi_history_{self._date_today}'):
                history_downloaded = True
            elif f.name.startswith('ncbi_GRCh'):
                # Match any downloaded assembly instead of hard-coding
                # GRCh38.p13 — keeps this check consistent with the
                # `ncbi_GRCh` filter used in _extract_data.
                gff_downloaded = True
        return info_downloaded and history_downloaded and gff_downloaded

    def _extract_data(self):
        """Gather data from local files or download from source.

        - Data is expected to be in <PROJECT ROOT>/data/ncbi.
        - For now, data files should all be from the same source data version.
        """
        self._create_data_directory()
        if not self._files_downloaded(self.src_data_dir):
            self._download_data()
        local_files = [
            f for f in self.src_data_dir.iterdir()
            if f.name.startswith('ncbi')
        ]
        # Most recent date suffix first
        local_files.sort(key=lambda f: f.name.split('_')[-1], reverse=True)
        self._info_src = [
            f for f in local_files if f.name.startswith('ncbi_info')
        ][0]
        self._history_src = [
            f for f in local_files if f.name.startswith('ncbi_history')
        ][0]
        self._gff_src = [
            f for f in local_files if f.name.startswith('ncbi_GRCh')
        ][0]
        self._version = self._info_src.stem.split('_')[-1]

    def _get_prev_symbols(self):
        """Store a gene's symbol history.

        :return: A dictionary of a gene's previous symbols
        """
        prev_symbols = {}
        # `with` ensures the history file is closed even if a row raises
        with open(self._history_src, 'r') as history_file:
            history = csv.reader(history_file, delimiter='\t')
            next(history)  # skip header
            with self._database.genes.batch_writer() as batch:
                for row in history:
                    # Only interested in rows that have Homo sapiens tax id
                    if row[0] == '9606':
                        if row[1] != '-':
                            gene_id = row[1]
                            if gene_id in prev_symbols:
                                prev_symbols[gene_id].append(row[3])
                            else:
                                prev_symbols[gene_id] = [row[3]]
                        else:
                            # Load discontinued genes
                            params = {
                                'concept_id':
                                    f'{NamespacePrefix.NCBI.value.lower()}:'
                                    f'{row[2]}',
                                'symbol': row[3],
                                'symbol_status':
                                    SymbolStatus.DISCONTINUED.value
                            }
                            self._load_gene(params, batch)
        return prev_symbols

    def _add_xrefs_associated_with(self, val, params):
        """Add xrefs and associated_with refs to a transformed gene.

        :param list val: A list of source ids for a given gene
        :param dict params: A transformed gene record
        """
        params['xrefs'] = []
        params['associated_with'] = []
        for src in val:
            tokens = src.split(':')
            src_name = tokens[0].upper()
            src_id = tokens[-1]
            if src_name == "GENEID":
                params['concept_id'] = \
                    f"{NamespacePrefix.NCBI.value}:{src_id}"
            elif src_name in NamespacePrefix.__members__ and \
                    NamespacePrefix[src_name].value in PREFIX_LOOKUP:
                params['xrefs'].append(
                    f"{NamespacePrefix[src_name].value}:{src_id}")
            else:
                # Sources that are associated_with rather than xrefs
                if src_name.startswith("MIM"):
                    prefix = NamespacePrefix.OMIM.value
                elif src_name.startswith("IMGT/GENE-DB"):
                    prefix = NamespacePrefix.IMGT_GENE_DB.value
                elif src_name.startswith("MIRBASE"):
                    prefix = NamespacePrefix.MIRBASE.value
                else:
                    prefix = None
                if prefix:
                    params['associated_with'].append(f"{prefix}:{src_id}")
                else:
                    logger.info(f"{src_name} is not in NameSpacePrefix.")
        if not params['xrefs']:
            del params['xrefs']
        if not params['associated_with']:
            del params['associated_with']

    def _get_gene_info(self, prev_symbols):
        """Store genes from NCBI info file.

        :param dict prev_symbols: A dictionary of a gene's previous symbols
        :return: A dictionary of genes from the NCBI info file.
        """
        info_genes = dict()
        # `with` ensures the info file is closed even if a row raises
        with open(self._info_src, 'r') as info_file:
            info = csv.reader(info_file, delimiter='\t')
            next(info)  # skip header
            for row in info:
                params = dict()
                params['concept_id'] = \
                    f"{NamespacePrefix.NCBI.value}:{row[1]}"
                # get symbol
                params['symbol'] = row[2]
                # get aliases
                if row[4] != '-':
                    params['aliases'] = row[4].split('|')
                else:
                    params['aliases'] = []
                # get associated_with
                if row[5] != '-':
                    associated_with = row[5].split('|')
                    self._add_xrefs_associated_with(associated_with, params)
                # get chromosome location
                vrs_chr_location = self._get_vrs_chr_location(row, params)
                if 'exclude' in vrs_chr_location:
                    # Exclude genes with multiple distinct locations
                    # (e.g. OMS)
                    continue
                if not vrs_chr_location:
                    vrs_chr_location = []
                params['locations'] = vrs_chr_location
                # get label
                if row[8] != '-':
                    params['label'] = row[8]
                # add prev symbols
                if row[1] in prev_symbols:
                    params['previous_symbols'] = prev_symbols[row[1]]
                # get type
                params['gene_type'] = row[9]
                info_genes[params['symbol']] = params
        return info_genes

    def _get_gene_gff(self, db, info_genes, sr):
        """Store genes from NCBI gff file.

        :param FeatureDB db: GFF database
        :param dict info_genes: A dictionary of genes from the NCBI info file
        :param SeqRepo sr: Access to the seqrepo
        """
        for f in db.all_features():
            if f.attributes.get('ID'):
                f_id = f.attributes.get('ID')[0]
                if f_id.startswith('gene'):
                    symbol = f.attributes['Name'][0]
                    if symbol in info_genes:
                        # Just need to add SequenceLocation
                        params = info_genes.get(symbol)
                        vrs_sq_location = \
                            self._get_vrs_sq_location(db, sr, params, f_id)
                        if vrs_sq_location:
                            params['locations'].append(vrs_sq_location)
                    else:
                        # Need to add entire gene
                        gene = self._add_gff_gene(db, f, sr, f_id)
                        info_genes[gene['symbol']] = gene

    def _add_gff_gene(self, db, f, sr, f_id):
        """Create a transformed gene record from NCBI gff file.

        :param FeatureDB db: GFF database
        :param Feature f: A gene from the gff data file
        :param SeqRepo sr: Access to the seqrepo
        :param str f_id: The feature's ID
        :return: A gene dictionary if the ID attribute exists.
                 Else return None.
        """
        params = dict()
        params['src_name'] = SourceName.NCBI.value
        self._add_attributes(f, params)
        sq_loc = self._get_vrs_sq_location(db, sr, params, f_id)
        if sq_loc:
            params['locations'] = [sq_loc]
        else:
            params['locations'] = list()
        params['label_and_type'] = \
            f"{params['concept_id'].lower()}##identity"
        return params

    def _add_attributes(self, f, gene):
        """Add concept_id, symbol, and xrefs/associated_with to a
        gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param gene: A transformed gene record
        """
        attributes = ['ID', 'Name', 'description', 'Dbxref']
        for key, val in f.attributes.items():
            if key in attributes:
                if len(val) == 1 and key != 'Dbxref':
                    val = val[0]
                if key == 'Dbxref':
                    self._add_xrefs_associated_with(val, gene)
                elif key == 'Name':
                    gene['symbol'] = val

    def _get_vrs_sq_location(self, db, sr, params, f_id):
        """Store GA4GH VRS SequenceLocation in a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation

        :param FeatureDB db: GFF database
        :param SeqRepo sr: Access to the seqrepo
        :param dict params: A transformed gene record
        :param str f_id: The feature's ID
        :return: A GA4GH VRS SequenceLocation
        """
        gene = db[f_id]
        params['strand'] = gene.strand
        return self._sequence_location.add_location(gene.seqid, gene,
                                                    params, sr)

    def _get_xref_associated_with(self, src_name, src_id):
        """Get xref or associated_with ref.

        :param str src_name: Source name
        :param src_id: The source's accession number
        :return: A dict containing an xref or associated_with ref
        """
        source = dict()
        if src_name.startswith('HGNC'):
            source['xrefs'] = \
                [f"{NamespacePrefix.HGNC.value}:{src_id}"]
        elif src_name.startswith('NCBI'):
            source['xrefs'] = \
                [f"{NamespacePrefix.NCBI.value}:{src_id}"]
        elif src_name.startswith('UniProt'):
            source['associated_with'] = [
                f"{NamespacePrefix.UNIPROT.value}:{src_id}"
            ]
        elif src_name.startswith('miRBase'):
            source['associated_with'] = [
                f"{NamespacePrefix.MIRBASE.value}:{src_id}"
            ]
        elif src_name.startswith('RFAM'):
            source['associated_with'] = [
                f"{NamespacePrefix.RFAM.value}:{src_id}"
            ]
        return source

    def _get_vrs_chr_location(self, row, params):
        """Store GA4GH VRS ChromosomeLocation in a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#chromosomelocation

        :param list row: A row in NCBI data file
        :param dict params: A transformed gene record
        :return: A list of GA4GH VRS ChromosomeLocations
        """
        params['location_annotations'] = list()
        chromosomes_locations = self._set_chromsomes_locations(row, params)
        locations = chromosomes_locations['locations']
        chromosomes = chromosomes_locations['chromosomes']
        if chromosomes_locations['exclude']:
            return ['exclude']
        location_list = list()
        if chromosomes and not locations:
            for chromosome in chromosomes:
                if chromosome == 'MT':
                    params['location_annotations'].append(
                        Chromosome.MITOCHONDRIA.value)
                else:
                    params['location_annotations'].append(chromosome.strip())
        elif locations:
            self._add_chromosome_location(locations, location_list, params)
        if not params['location_annotations']:
            del params['location_annotations']
        return location_list

    def _set_chromsomes_locations(self, row, params):
        """Set chromosomes and locations for a given gene record.

        :param list row: A gene row in the NCBI data file
        :param dict params: A transformed gene record
        :return: A dictionary containing a gene's chromosomes and locations
        """
        chromosomes = None
        if row[6] != '-':
            if '|' in row[6]:
                chromosomes = row[6].split('|')
            else:
                chromosomes = [row[6]]
            if len(chromosomes) >= 2:
                if chromosomes and 'X' not in chromosomes and \
                        'Y' not in chromosomes:
                    logger.info(f'{row[2]} contains multiple distinct '
                                f'chromosomes: {chromosomes}.')
                    chromosomes = None

        locations = None
        exclude = False
        if row[7] != '-':
            if '|' in row[7]:
                locations = row[7].split('|')
            elif ';' in row[7]:
                locations = row[7].split(';')
            elif 'and' in row[7]:
                locations = row[7].split('and')
            else:
                locations = [row[7]]

            # Sometimes locations will store the same location twice
            if len(locations) == 2:
                if locations[0] == locations[1]:
                    locations = [locations[0]]

            # Exclude genes where there are multiple distinct locations
            # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3'
            if len(locations) > 2:
                logger.info(f'{row[2]} contains multiple distinct '
                            f'locations: {locations}.')
                locations = None
                exclude = True

            # NCBI sometimes contains invalid map locations.
            # Filter into a fresh list instead of `del locations[i]`
            # inside an index loop, which skipped elements and could
            # raise IndexError after the list shrank.
            if locations:
                valid_locations = []
                for location in locations:
                    stripped = location.strip()
                    if re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", stripped):
                        valid_locations.append(location)
                    else:
                        logger.info(f'{row[2]} contains invalid map location:'
                                    f'{stripped}.')
                        params['location_annotations'].append(stripped)
                locations = valid_locations
        return {
            'locations': locations,
            'chromosomes': chromosomes,
            'exclude': exclude
        }

    def _add_chromosome_location(self, locations, location_list, params):
        """Add a chromosome location to the location list.

        :param list locations: NCBI map locations for a gene record.
        :param list location_list: A list to store chromosome locations.
        :param dict params: A transformed gene record
        """
        for loc in locations:
            loc = loc.strip()
            location = dict()

            if Annotation.ALT_LOC.value in loc:
                loc = loc.split(f"{Annotation.ALT_LOC.value}")[0].strip()
                params['location_annotations'].append(Annotation.ALT_LOC.value)

            contains_centromere = 'cen' in loc

            arm_match = re.search("[pq]", loc)
            if arm_match and not contains_centromere:
                arm_ix = arm_match.start()
                chromosome = loc[:arm_ix].strip()

                # NCBI sometimes stores invalid map locations
                # i.e. 7637 stores 'map from Rosati ref via FISH [AFS]'
                if not re.match("^([1-9][0-9]?|X|Y|MT)$", chromosome):
                    continue
                location['chr'] = chromosome

                # Check to see if there is a band / sub band included
                if arm_ix != len(loc) - 1:
                    if '-' in loc:
                        self._chromosome_location.set_interval_range(
                            loc, arm_ix, location)
                    else:
                        # Location only gives start
                        start = loc[arm_ix:]
                        location['start'] = start
                        location['end'] = start
                else:
                    # Only arm is included
                    location['start'] = loc[arm_ix]
                    location['end'] = loc[arm_ix]
            elif contains_centromere:
                self._set_centromere_location(loc, location)
            else:
                # Location only gives chr
                params['location_annotations'].append(loc)

            chr_location = \
                self._chromosome_location.get_location(location, params)
            if chr_location:
                location_list.append(chr_location)

    def _set_centromere_location(self, loc, location):
        """Set centromere location for a gene.

        :param str loc: A gene location
        :param dict location: GA4GH location
        """
        centromere_ix = re.search("cen", loc).start()
        if '-' in loc:
            # Location gives both start and end
            range_ix = re.search('-', loc).start()
            if 'q' in loc:
                location['chr'] = loc[:centromere_ix].strip()
                location['start'] = "cen"
                location['end'] = loc[range_ix + 1:]
            elif 'p' in loc:
                p_ix = re.search("p", loc).start()
                location['chr'] = loc[:p_ix].strip()
                location['end'] = "cen"
                location['start'] = loc[:range_ix]
        else:
            location['chr'] = loc[:centromere_ix].strip()
            location['start'] = "cen"
            location['end'] = "cen"

    def _transform_data(self):
        """Modify data and pass to loading functions."""
        logger.info('Transforming NCBI...')
        self._add_meta()
        prev_symbols = self._get_prev_symbols()
        info_genes = self._get_gene_info(prev_symbols)

        # create db for gff file
        db = gffutils.create_db(str(self._gff_src),
                                dbfn=":memory:",
                                force=True,
                                merge_strategy="create_unique",
                                keep_order=True)
        self._get_gene_gff(db, info_genes, self.seqrepo)

        with self._database.genes.batch_writer() as batch:
            for gene in info_genes.keys():
                self._load_gene(info_genes[gene], batch)
        logger.info('Successfully transformed NCBI.')

    def _add_meta(self):
        """Load metadata"""
        metadata = SourceMeta(
            data_license="custom",
            data_license_url="https://www.ncbi.nlm.nih.gov/home/"
                             "about/policies/",
            version=self._version,
            data_url=self._data_url,
            rdp_url="https://reusabledata.org/ncbi-gene.html",
            data_license_attributes={
                'non_commercial': False,
                'share_alike': False,
                'attribution': False
            },
            genome_assemblies=[self._assembly])
        self._load_meta(self._database, metadata, SourceName.NCBI.value)
class Ensembl(Base):
    """ETL the Ensembl source into the normalized database."""

    def __init__(self, database: Database, host="ftp.ensembl.org",
                 data_dir="pub/current_gff3/homo_sapiens/",
                 src_data_dir=APP_ROOT / "data" / "ensembl") -> None:
        """Initialize Ensembl ETL class.

        :param Database database: DynamoDB database
        :param str host: FTP host name
        :param str data_dir: FTP data directory to use
        :param Path src_data_dir: Data directory for Ensembl
        """
        super().__init__(database, host, data_dir, src_data_dir)
        self._sequence_location = SequenceLocation()
        self._host = host
        self._data_dir = data_dir
        # Version/assembly/url are discovered from the FTP listing in
        # _download_data.
        self._version = None
        self._fn = None
        self._data_url = None
        self._assembly = None

    def _download_data(self) -> None:
        """Download latest Ensembl GFF3 data file."""
        logger.info("Downloading latest Ensembl data file...")
        self._create_data_directory()
        regex_pattern = r"Homo_sapiens\.(?P<assembly>GRCh\d+)\.(?P<version>\d+)\.gff3\.gz"  # noqa: E501
        regex = re.compile(regex_pattern)
        with FTP(self._host) as ftp:
            ftp.login()
            ftp.cwd(self._data_dir)
            for f in ftp.nlst():
                match = regex.match(f)
                if not match:
                    continue
                resp = match.groupdict()
                self._assembly = resp["assembly"]
                self._version = resp["version"]
                self._fn = f
                self._data_url = \
                    f"ftp://{self._host}/{self._data_dir}{self._fn}"
                new_fn = f"ensembl_{self._version}.gff3"
                if not (self.src_data_dir / new_fn).exists():
                    self._ftp_download_file(ftp, self._fn,
                                            self.src_data_dir, new_fn)
                    logger.info(
                        f"Successfully downloaded Ensembl {self._version}"
                        f" data.")
                else:
                    logger.info(
                        f"Ensembl {self._version} data already exists.")
                break

    def _extract_data(self, *args, **kwargs):
        """Extract data from the Ensembl source.

        `kwargs['data_path']` may supply an explicit local GFF3 file;
        otherwise the highest-numbered release in the data directory
        is used.
        """
        if "data_path" in kwargs:
            self._data_src = kwargs["data_path"]
        else:
            # Select the highest release numerically — a plain
            # lexicographic sort would rank e.g. ensembl_99.gff3 above
            # ensembl_104.gff3 and pick a stale file.
            def release_key(path):
                match = re.search(r"(\d+)", path.stem)
                return int(match.group(1)) if match else -1
            self._data_src = max(self.src_data_dir.iterdir(),
                                 key=release_key)

    def _transform_data(self, *args, **kwargs):
        """Transform the Ensembl source."""
        logger.info("Transforming Ensembl...")
        db = gffutils.create_db(str(self._data_src),
                                dbfn=":memory:",
                                force=True,
                                merge_strategy="create_unique",
                                keep_order=True)

        # Get accession numbers for each chromosome and scaffold
        accession_numbers = dict()
        for item in db.features_of_type("scaffold"):
            accession_numbers[item[0]] = item[8]["Alias"][-1]
        for item in db.features_of_type("chromosome"):
            accession_numbers[item[0]] = item[8]["Alias"][-1]

        with self._database.genes.batch_writer() as batch:
            for f in db.all_features():
                if f.attributes.get("ID"):
                    f_id = f.attributes.get("ID")[0].split(":")[0]
                    if f_id == "gene":
                        gene = self._add_gene(f, self.seqrepo,
                                              accession_numbers)
                        if gene:
                            self._load_gene(gene, batch)
        logger.info("Successfully transformed Ensembl.")

    def _add_gene(self, f, sr, accession_numbers):
        """Create a transformed gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param SeqRepo sr: Access to the seqrepo
        :param dict accession_numbers: Accession numbers for each chromosome
            and scaffold
        :return: A gene dictionary if the ID attribute exists.
                 Else return None.
        """
        gene = dict()
        # NOTE(review): strand is stored as the raw enum member here while
        # other fields use `.value` — confirm Strand serializes correctly.
        if f.strand == "-":
            gene["strand"] = Strand.REVERSE
        elif f.strand == "+":
            gene["strand"] = Strand.FORWARD
        gene["src_name"] = SourceName.ENSEMBL.value

        self._add_attributes(f, gene)
        location = self._add_location(f, gene, sr, accession_numbers)
        if location:
            gene["locations"] = [location]

        gene["label_and_type"] = \
            f"{gene['concept_id'].lower()}##identity"
        gene["item_type"] = "identity"
        return gene

    def _add_attributes(self, f, gene):
        """Add concept_id, symbol, xrefs, and associated_with to a
        gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param gene: A transformed gene record
        """
        attributes = {
            "ID": "concept_id",
            "Name": "symbol",
            "description": "xrefs",
            "biotype": "gene_type"
        }

        for key, val in f.attributes.items():
            if key not in attributes:
                continue
            if len(val) == 1:
                val = val[0]

            if key == "ID":
                if val.startswith("gene"):
                    val = f"{NamespacePrefix.ENSEMBL.value}:" \
                          f"{val.split(':')[1]}"

            if key == "description":
                gene["label"] = val.split("[")[0].strip()
                if "Source:" in val:
                    src_name = val.split("[")[-1].split(
                        "Source:")[-1].split("Acc")[0].split(";")[0]
                    src_id = val.split("Acc:")[-1].split("]")[0]
                    if ":" in src_id:
                        src_id = src_id.split(":")[-1]
                    source = self._get_xref_associated_with(
                        src_name, src_id)
                    if "xrefs" in source:
                        gene["xrefs"] = source["xrefs"]
                    elif "associated_with" in source:
                        gene["associated_with"] = source["associated_with"]
                continue

            gene[attributes[key]] = val

    def _add_location(self, f, gene, sr, accession_numbers):
        """Add GA4GH SequenceLocation to a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation

        :param gffutils.feature.Feature f: A gene from the data
        :param dict gene: A transformed gene record
        :param dict accession_numbers: Accession numbers for each chromosome
            and scaffold
        :param SeqRepo sr: Access to the seqrepo
        """
        return self._sequence_location.add_location(
            accession_numbers[f.seqid], f, gene, sr)

    def _get_xref_associated_with(self, src_name, src_id):
        """Get xref or associated_with concept.

        :param str src_name: Source name
        :param src_id: The source's accession number
        :return: A dict containing an other identifier or xref
        """
        source = dict()
        if src_name.startswith("HGNC"):
            source["xrefs"] = \
                [f"{NamespacePrefix.HGNC.value}:{src_id}"]
        elif src_name.startswith("NCBI"):
            source["xrefs"] = \
                [f"{NamespacePrefix.NCBI.value}:{src_id}"]
        elif src_name.startswith("UniProt"):
            source["associated_with"] = [
                f"{NamespacePrefix.UNIPROT.value}:{src_id}"
            ]
        elif src_name.startswith("miRBase"):
            source["associated_with"] = [
                f"{NamespacePrefix.MIRBASE.value}:{src_id}"
            ]
        elif src_name.startswith("RFAM"):
            source["associated_with"] = [
                f"{NamespacePrefix.RFAM.value}:{src_id}"
            ]
        return source

    def perform_etl(self, *args, **kwargs):
        """Extract, Transform, and Load data into DynamoDB database.

        :return: Concept IDs of concepts successfully loaded
        """
        self._download_data()
        self._extract_data()
        self._add_meta()
        self._transform_data()
        self._database.flush_batch()
        return self._processed_ids

    def _add_meta(self, *args, **kwargs):
        """Add Ensembl metadata."""
        metadata = SourceMeta(
            data_license="custom",
            data_license_url="https://useast.ensembl.org/info/about"
                             "/legal/disclaimer.html",
            version=self._version,
            data_url=self._data_url,
            rdp_url=None,
            data_license_attributes={
                "non_commercial": False,
                "share_alike": False,
                "attribution": False
            },
            genome_assemblies=[self._assembly])
        # NOTE(review): metadata appears to be written twice — directly via
        # put_item and again via _load_meta. Confirm whether the explicit
        # put_item is still needed.
        self._database.metadata.put_item(
            Item={
                "src_name": SourceName.ENSEMBL.value,
                "data_license": metadata.data_license,
                "data_license_url": metadata.data_license_url,
                "version": metadata.version,
                "data_url": metadata.data_url,
                "rdp_url": metadata.rdp_url,
                "data_license_attributes": metadata.data_license_attributes,
                "genome_assemblies": metadata.genome_assemblies
            })
        self._load_meta(self._database, metadata, SourceName.ENSEMBL.value)
class Ensembl(Base):
    """ETL the Ensembl source into the normalized database."""

    def __init__(self, database: Database, host='ftp.ensembl.org',
                 data_dir='pub/', src_data_dir=APP_ROOT / 'data' / 'ensembl',
                 version='104'):
        """Initialize Ensembl ETL class.

        :param Database database: DynamoDB database
        :param str host: FTP host name
        :param str data_dir: FTP data directory to use
        :param Path src_data_dir: Data directory for Ensembl
        :param str version: Ensembl release used to build the file name
        """
        super().__init__(database, host, data_dir, src_data_dir)
        self._sequence_location = SequenceLocation()
        self._host = host
        self._data_dir = data_dir
        self._version = version
        self._fn = f'Homo_sapiens.GRCh38.{self._version}.gff3.gz'
        self._data_url = f"ftp://{self._host}/{self._data_dir}{self._fn}"
        self._data_file_url = None
        self._assembly = 'GRCh38'

    def _download_data(self):
        """Download Ensembl GFF3 data file."""
        logger.info('Downloading Ensembl data file...')
        self._create_data_directory()
        new_fn = f'ensembl_{self._version}.gff3'
        # Skip the download when the release is already on disk
        if not (self.src_data_dir / new_fn).exists():
            self._ftp_download(
                self._host,
                f'{self._data_dir}release-{self._version}'
                f'/gff3/homo_sapiens/',
                new_fn, self.src_data_dir, self._fn)
            logger.info('Successfully downloaded Ensembl data file.')

    def _extract_data(self, *args, **kwargs):
        """Extract data from the Ensembl source.

        `kwargs['data_path']` may supply an explicit local GFF3 file;
        otherwise the highest-numbered release in the data directory
        is used.
        """
        if 'data_path' in kwargs:
            self._data_src = kwargs['data_path']
        else:
            # Select the highest release numerically — a plain
            # lexicographic sort would rank e.g. ensembl_99.gff3 above
            # ensembl_104.gff3 and pick a stale file.
            def release_key(path):
                match = re.search(r"(\d+)", path.stem)
                return int(match.group(1)) if match else -1
            self._data_src = max(self.src_data_dir.iterdir(),
                                 key=release_key)

    def _transform_data(self, *args, **kwargs):
        """Transform the Ensembl source."""
        logger.info('Transforming Ensembl...')
        db = gffutils.create_db(str(self._data_src),
                                dbfn=":memory:",
                                force=True,
                                merge_strategy="create_unique",
                                keep_order=True)

        # Get accession numbers for each chromosome and scaffold
        accession_numbers = dict()
        for item in db.features_of_type('scaffold'):
            accession_numbers[item[0]] = item[8]['Alias'][-1]
        for item in db.features_of_type('chromosome'):
            accession_numbers[item[0]] = item[8]['Alias'][-1]

        with self._database.genes.batch_writer() as batch:
            for f in db.all_features():
                if f.attributes.get('ID'):
                    f_id = f.attributes.get('ID')[0].split(':')[0]
                    if f_id == 'gene':
                        gene = self._add_gene(f, self.seqrepo,
                                              accession_numbers)
                        if gene:
                            self._load_gene(gene, batch)
        logger.info('Successfully transformed Ensembl.')

    def _add_gene(self, f, sr, accession_numbers):
        """Create a transformed gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param SeqRepo sr: Access to the seqrepo
        :param dict accession_numbers: Accession numbers for each chromosome
            and scaffold
        :return: A gene dictionary if the ID attribute exists.
                 Else return None.
        """
        gene = dict()
        # NOTE(review): strand is stored as the raw enum member here while
        # other fields use `.value` — confirm Strand serializes correctly.
        if f.strand == '-':
            gene['strand'] = Strand.REVERSE
        elif f.strand == '+':
            gene['strand'] = Strand.FORWARD
        gene['src_name'] = SourceName.ENSEMBL.value

        self._add_attributes(f, gene)
        location = self._add_location(f, gene, sr, accession_numbers)
        if location:
            gene['locations'] = [location]

        gene['label_and_type'] = \
            f"{gene['concept_id'].lower()}##identity"
        gene['item_type'] = 'identity'
        return gene

    def _add_attributes(self, f, gene):
        """Add concept_id, symbol, xrefs, and associated_with to a
        gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param gene: A transformed gene record
        """
        attributes = {
            'ID': 'concept_id',
            'Name': 'symbol',
            'description': 'xrefs'
        }

        for key, val in f.attributes.items():
            if key not in attributes:
                continue
            if len(val) == 1:
                val = val[0]

            if key == 'ID':
                if val.startswith('gene'):
                    val = f"{NamespacePrefix.ENSEMBL.value}:" \
                          f"{val.split(':')[1]}"

            if key == 'description':
                gene['label'] = val.split('[')[0].strip()
                if 'Source:' in val:
                    src_name = val.split('[')[-1].split(
                        'Source:')[-1].split('Acc')[0].split(';')[0]
                    src_id = val.split('Acc:')[-1].split(']')[0]
                    if ':' in src_id:
                        src_id = src_id.split(':')[-1]
                    source = self._get_xref_associated_with(
                        src_name, src_id)
                    if 'xrefs' in source:
                        gene['xrefs'] = source['xrefs']
                    elif 'associated_with' in source:
                        gene['associated_with'] = source['associated_with']
                continue

            gene[attributes[key]] = val

    def _add_location(self, f, gene, sr, accession_numbers):
        """Add GA4GH SequenceLocation to a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation

        :param gffutils.feature.Feature f: A gene from the data
        :param dict gene: A transformed gene record
        :param dict accession_numbers: Accession numbers for each chromosome
            and scaffold
        :param SeqRepo sr: Access to the seqrepo
        """
        return self._sequence_location.add_location(
            accession_numbers[f.seqid], f, gene, sr)

    def _get_xref_associated_with(self, src_name, src_id):
        """Get xref or associated_with concept.

        :param str src_name: Source name
        :param src_id: The source's accession number
        :return: A dict containing an other identifier or xref
        """
        source = dict()
        if src_name.startswith('HGNC'):
            source['xrefs'] = \
                [f"{NamespacePrefix.HGNC.value}:{src_id}"]
        elif src_name.startswith('NCBI'):
            source['xrefs'] = \
                [f"{NamespacePrefix.NCBI.value}:{src_id}"]
        elif src_name.startswith('UniProt'):
            source['associated_with'] = [
                f"{NamespacePrefix.UNIPROT.value}:{src_id}"
            ]
        elif src_name.startswith('miRBase'):
            source['associated_with'] = [
                f"{NamespacePrefix.MIRBASE.value}:{src_id}"
            ]
        elif src_name.startswith('RFAM'):
            source['associated_with'] = [
                f"{NamespacePrefix.RFAM.value}:{src_id}"
            ]
        return source

    def perform_etl(self, *args, **kwargs):
        """Extract, Transform, and Load data into DynamoDB database.

        :return: Concept IDs of concepts successfully loaded
        """
        self._download_data()
        self._extract_data()
        self._add_meta()
        self._transform_data()
        self._database.flush_batch()
        return self._processed_ids

    def _add_meta(self, *args, **kwargs):
        """Add Ensembl metadata."""
        metadata = SourceMeta(
            data_license='custom',
            data_license_url='https://useast.ensembl.org/info/about'
                             '/legal/disclaimer.html',
            version=self._version,
            data_url=self._data_url,
            rdp_url=None,
            data_license_attributes={
                'non_commercial': False,
                'share_alike': False,
                'attribution': False
            },
            genome_assemblies=[self._assembly])
        # NOTE(review): metadata appears to be written twice — directly via
        # put_item and again via _load_meta. Confirm whether the explicit
        # put_item is still needed.
        self._database.metadata.put_item(
            Item={
                'src_name': SourceName.ENSEMBL.value,
                'data_license': metadata.data_license,
                'data_license_url': metadata.data_license_url,
                'version': metadata.version,
                'data_url': metadata.data_url,
                'rdp_url': metadata.rdp_url,
                'data_license_attributes': metadata.data_license_attributes,
                'genome_assemblies': metadata.genome_assemblies
            })
        self._load_meta(self._database, metadata, SourceName.ENSEMBL.value)