class MirtarbaseParser(ReturnParser):
    """
    Parser that reads the miRTarBase spreadsheet and emits
    (Mirna)-[TARGETS]->(Gene) relationships.
    """

    def __init__(self):
        """Create the output RelationshipSet."""
        super().__init__()

        # Mirna nodes are matched on 'name', Gene nodes on 'sid'
        self.mirna_targets_gene = RelationshipSet('TARGETS', ['Mirna'], ['Gene'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        """Entry point without arguments; simply delegates to run()."""
        self.run()

    def run(self):
        """
        Load the spreadsheet of the 'Mirtarbase' instance and add one
        TARGETS relationship per row.
        """
        log.debug("Run {}".format(self.__class__.__name__))

        instance = self.get_instance_by_name('Mirtarbase')
        excel_path = instance.get_file(FILE_NAME)
        table = pandas.read_excel(excel_path, index_col=None, header=0)

        # positional renaming so that the columns can be read as attributes
        table.columns = [
            'mirtarbase_id',
            'mirna',
            'species_mirna',
            'target_genesymbol',
            'target_entrez',
            'species_target',
            'experiments',
            'support_type',
            'references',
        ]

        for record in table.itertuples():
            mirna_key = {'name': record.mirna.strip()}
            gene_key = {'sid': str(record.target_entrez).strip()}
            evidence = {
                'experiments': record.experiments,
                'support_type': record.support_type,
                'references': record.references,
                'source': instance.datasource.name,
            }
            self.mirna_targets_gene.add_relationship(mirna_key, gene_key, evidence)
class DummyParser(ReturnParser):
    """
    Small example parser: creates ten Fummy nodes and one Dummy node per
    line of 'file.txt', each Dummy node knowing one random Fummy node.
    """

    def __init__(self):
        """Declare the expected argument and create the output sets."""
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])
        self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Fill the node and relationship sets.

        :param taxid: Value stored as 'taxid' property on every node.
        """
        dummy_instance = self.get_instance_by_name('Dummy')
        dummyfile = dummy_instance.get_file('file.txt')

        # Fummy nodes with the integer sids 0..9
        for i in range(10):
            self.fummy_nodes.add_node({'sid': i, 'taxid': taxid})

        with open(dummyfile) as f:
            for line in f:
                letter = line.strip()
                self.dummy_nodes.add_node({'sid': letter, 'taxid': taxid})
                # the target sid is drawn from the same 0..9 range used above
                self.dummy_knows_fummy.add_relationship(
                    {'sid': letter}, {'sid': randint(0, 9)}, {'key': 'value'})
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    Columns are read by position: province (0), country (1), report date (2),
    confirmed (3), deaths (4), recovered (5) and, when present,
    latitude (6) and longitude (7).

    :param file: Path to the CSV file
    :return: Tuple (countries, provinces, updates, province_in_country,
             province_rep_update) of NodeSets and RelationshipSets.
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'], ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'], ['DailyReport'], ['name'], ['uuid'])

    # newline='' is what the csv module expects so that it can handle line
    # endings (including newlines inside quoted fields) itself
    with open(file, 'rt', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip header; the default keeps an empty file from raising StopIteration
        next(rows, None)

        for row in rows:
            # csv.reader yields an empty list for blank lines
            if not row:
                continue

            country = row[1]
            province = row[0]
            # if no name for province, use country name
            if not province:
                province = '{}_complete'.format(country)

            date = parse(row[2])
            uuid = country + province + str(date)

            # missing counts are stored as the string 'na'
            confirmed = int(row[3]) if row[3] else 'na'
            death = int(row[4]) if row[4] else 'na'
            recovered = int(row[5]) if row[5] else 'na'

            # coordinates are optional trailing columns
            lat = row[6] if len(row) >= 7 else None
            lon = row[7] if len(row) >= 8 else None

            province_dict = {'name': province}
            if lat and lon:
                province_dict['latitude'] = lat
                province_dict['longitude'] = lon

            provinces.add_unique(province_dict)
            countries.add_unique({'name': country})
            updates.add_unique(
                {'date': date, 'confirmed': confirmed, 'death': death, 'recovered': recovered, 'uuid': uuid})

            province_in_country.add_relationship({'name': province}, {'name': country}, {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province}, {'uuid': uuid}, {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
class DependingTestParser(ReturnParser):
    """
    Test parser that only produces FOO relationships between Source and
    Target nodes; it does not create the nodes itself.
    """

    def __init__(self):
        """Create the single output RelationshipSet."""
        super().__init__()
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """Add 100 relationships, pairing source_id n with target_id n."""
        for index in range(100):
            start_key = {'source_id': index}
            end_key = {'target_id': index}
            self.rels.add_relationship(start_key, end_key, {'source': 'test'})
class NcbiLegacyGeneParser(ReturnParser):
    """
    Parse legacy gene IDs from gene_history.gz

    #tax_id GeneID  Discontinued_GeneID Discontinued_Symbol Discontinue_Date
    9       -       1246494             repA1               20031113
    9       -       1246495             repA2               20031113
    9       -       1246496             leuA                20031113
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output sets."""
        super().__init__()

        self.arguments = ['taxid']

        self.legacy_genes = NodeSet(
            ['Gene', 'Legacy'],
            merge_keys=['sid'],
            default_props={'source': 'ncbigene'},
        )
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY',
            ['Gene', 'Legacy'],
            ['Gene'],
            ['sid'],
            ['sid'],
            default_props={'source': 'ncbigene'},
        )

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect discontinued genes of one taxon and, where the file names a
        current GeneID, the REPLACED_BY relationship to it.

        :param taxid: Taxonomy ID as string; compared to the first column.
        """
        log.debug(f'Run parser {self.__class__.__name__} for taxID: {taxid}.')

        instance = self.get_instance_by_name('NcbiGene')
        history_path = instance.get_file('gene_history.gz')

        with gzip.open(history_path, 'rt') as handle:
            # the first line is the column header
            next(handle)

            for line in handle:
                columns = line.strip().split('\t')
                if columns[0] != taxid:
                    continue

                current_id = columns[1]
                old_id = columns[2]
                old_symbol = columns[3]
                discontinue_date = columns[4]

                node_props = {
                    'sid': old_id,
                    'date': discontinue_date,
                    'symbol': old_symbol,
                    'taxid': taxid,
                }
                self.legacy_genes.add_node(node_props)

                # '-' marks genes that were discontinued without replacement
                if current_id != '-':
                    self.legacy_gene_now_gene.add_relationship(
                        {'sid': old_id}, {'sid': current_id}, {})
class GtexDataParser(ReturnParser):
    """
    Parse the GTEx gene median TPM table into
    (Gene)-[EXPRESSED]->(GtexDetailedTissue) relationships, one per
    gene/tissue cell of the table.
    """

    def __init__(self):
        """Create the output RelationshipSet and register it with the container."""
        super(GtexDataParser, self).__init__()

        self.gene_expressed_tissue = RelationshipSet('EXPRESSED', ['Gene'], ['GtexDetailedTissue'], ['sid'], ['name'])

        self.object_sets = [self.gene_expressed_tissue]
        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """
        Read the gzipped .gct file of the 'Gtex' instance.

        The first two lines are skipped, the third line is the column header.
        In every data line the first column is the gene ID (anything after
        the first '.' is dropped), the second column is ignored and all
        remaining columns are values for the tissue named in the header.
        Values are stored as the strings found in the file.
        """
        gtex_instance = self.get_instance_by_name('Gtex')

        gtex_mean_gene = gtex_instance.get_file(
            'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz')

        with gzip.open(gtex_mean_gene, 'rt') as f:
            # remove first two lines
            next(f)
            next(f)

            # get header line; the trailing newline must go, otherwise it
            # ends up in the name of the last tissue
            header_fields = next(f).rstrip('\n').split('\t')

            # iterate data lines (streamed instead of loading the whole file)
            for line in f:
                # same for data lines: do not leave '\n' in the last value
                flds = line.rstrip('\n').split('\t')
                gene_id = flds[0].split('.')[0]

                # have the index start at 2 to match the header which also
                # includes the first two columns
                for i, value in enumerate(flds[2:], start=2):
                    tissue_detailed_name = header_fields[i]

                    self.gene_expressed_tissue.add_relationship(
                        {'sid': gene_id}, {'name': tissue_detailed_name}, {'val': value})
class SomeParser(ReturnParser):
    """
    Test parser creating 100 Source nodes, 100 Target nodes and one FOO
    relationship per pair with equal index.
    """

    def __init__(self):
        """Create the two NodeSets and the RelationshipSet."""
        super().__init__()
        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """Fill all three sets with the indices 0..99."""
        for index in range(100):
            source_key = {'source_id': index}
            target_key = {'target_id': index}
            self.source.add_node(source_key)
            self.target.add_node(target_key)
            # separate dicts so the relationship does not share objects with the nodes
            self.rels.add_relationship({'source_id': index}, {'target_id': index}, {'source': 'test'})
class HGNCParser(ReturnParser):
    """
    Parse hgnc_complete_set.txt into Gene nodes, Gene-MAPS-Gene
    relationships (to the IDs found in columns 18 and 19) and
    Gene-MAPS-GeneSymbol relationships.
    """

    def __init__(self):
        """Create the output NodeSet and RelationshipSets."""
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])

        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """Locate the HGNC file of the 'HGNC' instance and parse it."""
        hgnc_instance = self.get_instance_by_name('HGNC')

        hgnc_complete_file = hgnc_instance.get_file('hgnc_complete_set.txt')
        self.parse_hgnc_complete_file(hgnc_complete_file)

    def parse_hgnc_complete_file(self, hgnc_complete_file):
        """
        Parse the tab separated HGNC file.

        Every column of a line is stored on the Gene node under the column
        name from the header line. Because data lines are stripped, trailing
        empty columns are absent and produce no property.

        :param hgnc_complete_file: Path to hgnc_complete_set.txt
        """
        with open(hgnc_complete_file, 'rt') as f:
            header_line = next(f, None)
            if header_line is None:
                # empty file, nothing to parse
                return

            # split the header into column names; zipping the raw string would
            # pair single characters with the field values
            header = header_line.rstrip('\n').split('\t')

            for line in f:
                flds = line.strip().split('\t')
                sid = flds[0]
                gene_symbol = flds[1]
                # these columns may be missing when the line ends early
                ncbi_id = flds[18] if len(flds) > 18 else None
                ensembl_id = flds[19] if len(flds) > 19 else None

                all_props = dict(zip(header, flds))
                all_props['sid'] = sid
                all_props['source'] = 'hgnc'

                self.genes.add_node(all_props)

                if ncbi_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ncbi_id}, {'source': 'hgnc'})
                if ensembl_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ensembl_id}, {'source': 'hgnc'})
                if gene_symbol:
                    # the symbol relationship is always created with taxid '9606'
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': sid},
                        {'sid': gene_symbol, 'taxid': '9606'},
                        {'source': 'hgnc'})
class MirdbParser(ReturnParser):
    """
    Parse the miRDB prediction file into
    (Mirna)-[TARGETS]->(Transcript) relationships for one taxon.
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output RelationshipSet."""
        super().__init__()

        # arguments
        self.arguments = ['taxid']

        # RelationshipSets
        self.mirna_targets_transcript = RelationshipSet(
            'TARGETS', ['Mirna'], ['Transcript'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Keep all predictions whose miRNA name starts with the prefix that
        TAXID_2_MIRPREFIX lists for the taxon.

        :param taxid: Key into TAXID_2_MIRPREFIX.
        """
        instance = self.get_instance_by_name('Mirdb')
        prediction_path = instance.datasource.get_prediction_file(instance)
        source_name = instance.datasource.name

        prefix = TAXID_2_MIRPREFIX[taxid]

        with gzip.open(prediction_path, 'rt') as handle:
            for line in handle:
                # whitespace separated: miRNA name, target, score
                columns = line.split()
                mirna = columns[0]
                if not mirna.startswith(prefix):
                    continue

                transcript = columns[1]
                score = float(columns[2])
                rel_props = {'score': score, 'source': source_name}
                self.mirna_targets_transcript.add_relationship(
                    {'name': mirna}, {'sid': transcript}, rel_props)
class NcbiGeneOrthologParser(ReturnParser):
    """
    Parse gene_orthologs.gz into (Gene)-[ORTHOLOG]->(Gene) relationships.
    """

    def __init__(self):
        """Create the output RelationshipSet and register it with the container."""
        super().__init__()

        self.gene_ortholog_gene = RelationshipSet('ORTHOLOG', ['Gene'], ['Gene'], ['sid'], ['sid'])

        self.object_sets = [self.gene_ortholog_gene]
        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """
        Get the Gene-ORTHOLOG-Gene relationships.

        This is currently not filtered for taxid: every line of the file
        yields one relationship from the ID in column 2 to the ID in column 5.
        """
        instance = self.get_instance_by_name('NcbiGene')
        orthologs_path = instance.get_file('gene_orthologs.gz')

        with gzip.open(orthologs_path, 'rt') as handle:
            # the first line is the header
            next(handle)

            for line in handle:
                columns = line.strip().split()
                start_key = {'sid': columns[1]}
                end_key = {'sid': columns[4]}
                self.gene_ortholog_gene.add_relationship(start_key, end_key, {})
def load_wpp_data(base_path, graph):
    """
    Load UN population data.

    Reads 'WPP2019_PopulationByAgeSex_Medium.csv', keeps the rows of the year
    2019 and merges Country nodes, AgeGroup nodes and the three
    CURRENT_TOTAL / CURRENT_MALE / CURRENT_FEMALE relationship sets.

    :param base_path: Path where file was downloaded.
    :param graph: Graph object handed to the merge() call of every set.
    """
    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], ['name'])
    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    countries_added = set()
    age_groups_added = set()

    # newline='' is what the csv module expects for files passed to csv.reader
    with open(un_wpp_csv_file, 'rt', newline='') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file, None)

        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            # only take 2019; rows of other years are not parsed any further
            if int(row[4]) != 2019:
                continue

            loc_id = row[0]
            location = row[1]
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            # the file values are multiplied by 1000 and truncated to int
            pop_male = int(float(row[9]) * 1000)
            pop_female = int(float(row[10]) * 1000)
            pop_total = int(float(row[11]) * 1000)

            if location not in countries_added:
                country.add_node({'name': location, 'un_id': loc_id})
                countries_added.add(location)

            if age_group not in age_groups_added:
                age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                # remember the group so it is added once, not once per country
                age_groups_added.add(age_group)

            country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
            country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
            country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)
class ChebiParser(ReturnParser):
    """
    Parse the ChEBI ontology (chebi.obo) into Metabolite nodes and
    IS_A, CHEBI_REL and MAPS relationships between them.

    Metabolite nodes use the part of the ontology ID after the prefix as
    'sid'; the full ID is kept in 'ontology_id'.
    """

    def __init__(self):
        """Create the output NodeSet and RelationshipSets."""
        super(ChebiParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'chebi'})

        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})

    @staticmethod
    def _local_id(ontology_id):
        """Return the part after the first ':' ('CHEBI:15377' -> '15377'), or the ID itself if there is none."""
        _, separator, local = ontology_id.partition(':')
        return local if separator else ontology_id

    def run_with_mounted_arguments(self):
        """Entry point without arguments; delegates to run()."""
        self.run()

    def run(self):
        """
        Load the cleaned OBO file with pronto and fill all sets.

        Relationship endpoints use the same prefix-free 'sid' as the nodes;
        with the full 'CHEBI:...' ID they could never match a Metabolite
        node created here.
        """
        chebi_instance = self.get_instance_by_name('Chebi')

        obo_file = chebi_instance.get_file('chebi.obo')

        cleaned_obo_file = clean_obo_file(obo_file)

        chebi_ontology = pronto.Ontology(cleaned_obo_file)

        # iterate terms
        for term in chebi_ontology.terms():
            ontology_id = term.id
            term_sid = self._local_id(ontology_id)

            self.metabolites.add_node({
                'name': term.name,
                'sid': term_sid,
                'ontology_id': ontology_id,
                'definition': term.definition,
                'alt_ids': list(term.alternate_ids)
            })

            # direct parents only
            for parent in term.superclasses(distance=1, with_self=False):
                self.metabolite_isa_metabolite.add_relationship(
                    {'sid': term_sid}, {'sid': self._local_id(parent.id)}, {})

            # other named relationships
            try:
                for reltype, targets in term.relationships.items():
                    for target in targets:
                        self.metabolite_rel_metabolite.add_relationship(
                            {'sid': term_sid}, {'sid': self._local_id(target.id)}, {'type': reltype.id})
            except KeyError as e:
                # the relationships of this term could not be iterated;
                # log and continue with its xrefs
                log.error(f"Cannot iterate relationshis of term {term_sid}")
                log.error(e)

            # metabolite-MAPS-metabolite
            for xref in term.xrefs:
                if 'HMDB:' in xref.id:
                    hmdb_id = xref.id.strip().split('HMDB:')[1]
                    self.metabolite_maps_metabolite.add_relationship(
                        {'sid': term_sid}, {'sid': hmdb_id}, {})
class EnsemblEntityParser(ReturnParser):
    """
    Parse an ENSEMBL GTF file into Gene, Transcript and Protein nodes plus
    the Gene-CODES-Transcript and Transcript-CODES-Protein relationships.
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output sets."""
        super(EnsemblEntityParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'], default_props={'source': 'ensembl'})
        self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'ensembl'})

        # RelationshipSets
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'ensembl'})

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Parse the GTF file for one taxon.

        Every record contributes its gene (once per gene ID). Records of
        type 'transcript' add the transcript node and the gene-transcript
        relationship, records of type 'CDS' add the protein node and the
        transcript-protein relationship. Each node and each relationship is
        added only once.

        :param taxid: Taxonomy ID; selects the GTF file and is stored on every node.
        """
        ensembl_instance = self.get_instance_by_name('Ensembl')

        # try patched path, if not available take flat
        ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid, ensembl_instance, patched=True)
        if not os.path.exists(ensembl_gtf_file_path):
            ensembl_gtf_file_path = Ensembl.get_gtf_file_path(taxid, ensembl_instance, patched=False)

        annotation = GffReader(ensembl_gtf_file_path)

        # IDs / ID pairs that were added already
        check_gene_ids = set()
        check_transcript_ids = set()
        check_protein_ids = set()
        check_gene_transcript_rels = set()
        check_transcript_protein_rels = set()

        log.info("Start parsing ENSEMBL gtf file, taxid {}, {}".format(
            taxid, ensembl_gtf_file_path))

        for r in annotation.records:
            # add gene node
            gene_id = r.attributes['gene_id']
            if gene_id not in check_gene_ids:
                # NOTE(review): assumes every record carries 'gene_name' - confirm
                self.genes.add_node({
                    'sid': gene_id,
                    'name': r.attributes['gene_name'],
                    'taxid': taxid
                })
                check_gene_ids.add(gene_id)

            if r.type == 'transcript':
                transcript_id = r.attributes['transcript_id']

                # add transcript node
                if transcript_id not in check_transcript_ids:
                    self.transcripts.add_node({'sid': transcript_id, 'taxid': taxid})
                    check_transcript_ids.add(transcript_id)

                # Gene-CODES-Transcript; a tuple keeps the two IDs apart in the key
                gene_transcript = (gene_id, transcript_id)
                if gene_transcript not in check_gene_transcript_rels:
                    self.gene_codes_transcript.add_relationship(
                        {'sid': gene_id}, {'sid': transcript_id}, {})
                    check_gene_transcript_rels.add(gene_transcript)

            elif r.type == 'CDS':
                protein_id = r.attributes['protein_id']

                # add protein node
                if protein_id not in check_protein_ids:
                    self.proteins.add_node({'sid': protein_id, 'taxid': taxid})
                    check_protein_ids.add(protein_id)

                # Transcript-CODES-Protein
                transcript_id = r.attributes['transcript_id']
                transcript_protein = (transcript_id, protein_id)
                if transcript_protein not in check_transcript_protein_rels:
                    self.transcript_codes_protein.add_relationship(
                        {'sid': transcript_id}, {'sid': protein_id}, {})
                    check_transcript_protein_rels.add(transcript_protein)

        log.info("Finished parsing ENSEMBL gtf file.")
class NcbiGeneParser(ReturnParser):
    """
    Parse an NCBI gene_info file into Gene nodes, gene symbol nodes and the
    SYNONYM and MAPS relationships between them, for one taxon.
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output sets."""
        super(NcbiGeneParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        # both gene IDs and GeneSymbols have the label 'Gene'
        # two different NodeSets are used because only the GeneSymbol nodes need taxid for uniqueness
        self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ncbigene'})
        self.genesymbols = NodeSet(['Gene'], merge_keys=['sid', 'taxid'],
                                   default_props={'source': 'ncbigene', 'type': 'symbol'})
        self.genesymbol_synonym_genesymbol = RelationshipSet(
            'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
        self.gene_maps_genesymbol = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Pick the gene_info file for the taxon and parse it.

        :param taxid: Taxonomy ID as string.
        """
        log.info(f"Run {self.__class__.__name__}")
        ncbigene_instance = self.get_instance_by_name('NcbiGene')

        # get org specific gene_info file if available
        if taxid in TAXID_SPECIFIC_GENEINFO:
            gene_info_file = ncbigene_instance.get_file(
                TAXID_SPECIFIC_GENEINFO[taxid])
        else:
            gene_info_file = ncbigene_instance.get_file('gene_info.gz')
        log.info(gene_info_file)

        self.parse_gene_info(gene_info_file, taxid)

    def parse_gene_info(self, gene_info_file, taxid):
        """
        Parse one gzipped gene_info file.

        For every line of the taxon: the Gene node (all columns stored under
        their header name), the primary symbol node, one node per synonym,
        SYNONYM relationships from each synonym to the primary symbol and
        MAPS relationships from the gene to all of its symbols. The
        placeholder '-' never produces a node or a relationship.

        :param gene_info_file: Path to the gzipped gene_info file.
        :param taxid: Taxonomy ID as string; compared to the first column.
        :raises AttributeError: If the header line has an unknown format.
        """
        # check sets
        check_ids = set()
        check_ids_symbols = set()

        with gzip.open(gene_info_file, 'rt') as f:
            header = next(f)

            # account for different formatting of header line (leading "#Format: " or not)
            if header.startswith('#Format:'):
                header_fields = tuple(
                    header.split(':')[1].split('(')[0].strip().split())
            elif header.startswith('#tax'):
                header_fields = tuple(header[1:].strip().split('\t'))
            else:
                raise AttributeError(
                    "File header was reformatted: {0}".format(header))

            for line in f:
                flds = line.rstrip().split('\t')
                if flds[0] != taxid:
                    continue

                # (Gene)
                entrez_gene_id = flds[1]
                if entrez_gene_id not in check_ids:
                    props = {'sid': entrez_gene_id, 'taxid': taxid}
                    # update with all fields
                    props.update(zip(header_fields, flds))

                    check_ids.add(entrez_gene_id)
                    self.genes.add_node(props)

                # (GeneSymbol) and (GeneSymbol)-[SYNONYM]-(GeneSymbol)
                primary_symbol = flds[2]
                has_primary = primary_symbol != '-'
                # drop the '-' placeholder up front; no node exists for it, so
                # relationships to it could never match anything
                synonym_symbols = [s for s in flds[4].split('|') if s != '-']

                # add primary symbol node
                if has_primary and primary_symbol not in check_ids_symbols:
                    check_ids_symbols.add(primary_symbol)
                    self.genesymbols.add_node({'sid': primary_symbol, 'taxid': taxid})

                for synonym in synonym_symbols:
                    # GeneSymbol-[SYNONYM]-GeneSymbol
                    if has_primary:
                        self.genesymbol_synonym_genesymbol.add_relationship(
                            {'sid': synonym, 'taxid': taxid},
                            {'sid': primary_symbol, 'taxid': taxid},
                            {})

                    if synonym not in check_ids_symbols:
                        check_ids_symbols.add(synonym)
                        self.genesymbols.add_node({
                            'sid': synonym,
                            'status': 'synonym',
                            'taxid': taxid
                        })

                # (Gene)-[MAPS]-(GeneSymbol)
                # primary
                if has_primary:
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': entrez_gene_id},
                        {'sid': primary_symbol, 'taxid': taxid},
                        {'status': 'primary'})

                # synonym
                for symbol in synonym_symbols:
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': entrez_gene_id},
                        {'sid': symbol, 'taxid': taxid},
                        {'status': 'synonym'})
class UniprotKnowledgebaseParser(ReturnParser):
    """
    Uniprot has extensive mapping data to other data sources. Data is in the main
    Uniprot data file (referred to as Uniprot knowledge base).

    Ensembl:
        DR   Ensembl; ENST00000353703; ENSP00000300161; ENSG00000166913. [P31946-1]
        DR   Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]

    Refseq:
        DR   RefSeq; NP_006752.1; NM_006761.4. [P62258-1]

    The mapping parser returns transcript-protein relationships for both ENSEMBL and RefSeq.
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output sets."""
        super(UniprotKnowledgebaseParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSet
        self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'uniprot'})

        # RelationshipSet
        self.protein_primary_protein = RelationshipSet('PRIMARY', ['Protein'], ['Protein'], ['sid'], ['sid'],
                                                       default_props={'source': 'uniprot'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
                                                        default_props={'source': 'uniprot'})
        self.protein_maps_protein = RelationshipSet('MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'],
                                                    default_props={'source': 'uniprot'})

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Parse all knowledgebase files available for the taxon.

        For every record of the organism: one primary Protein node (first
        accession), secondary Protein nodes (remaining accessions) with a
        PRIMARY relationship from the primary accession, and the
        CODES / MAPS relationships derived from the RefSeq and Ensembl
        cross references of the record. Nodes and relationships are
        de-duplicated across all files.

        :param taxid: Taxonomy ID; key into TAXID_OS_NAME.
        """
        uniprot_instance = self.get_instance_by_name('Uniprot')
        knowledgebase_files = uniprot_instance.datasource.get_knowledgebase_files_for_taxid(taxid, uniprot_instance)
        datasource_name = uniprot_instance.datasource.name

        # get organism name from taxid
        os_string_id = TAXID_OS_NAME[taxid]

        check_protein = set()
        check_p_p_p = set()
        check_t_c_p = set()
        check_p_m_p = set()

        # for now we always run on SPROT and TREMBL
        for kb_file in knowledgebase_files:
            log.debug(f"Parsing {kb_file}")
            with gzip.open(kb_file, 'rt') as f:
                up_parser = EMBLReaderUniProt(f)

                for record in up_parser.records:
                    # check taxon
                    if os_string_id not in record['OS']:
                        continue

                    # acc: the first accession is the primary one
                    acc_list = record['AC']
                    primary_acc = acc_list[0]
                    secondary = acc_list[1:]

                    # (Protein)
                    # make primary protein with full data
                    desc = record['DE']
                    rec_name = desc.split(';')[0].split('Full=')[1]

                    if primary_acc not in check_protein:
                        self.proteins.add_node({
                            'sid': primary_acc, 'name': rec_name, 'desc': desc,
                            'category': 'primary', 'taxid': taxid})
                        check_protein.add(primary_acc)

                    for secondary_acc in secondary:
                        if secondary_acc not in check_protein:
                            self.proteins.add_node(
                                {'sid': secondary_acc, 'category': 'secondary', 'taxid': taxid})
                            check_protein.add(secondary_acc)

                        # (Protein)-[PRIMARY]-(Protein)
                        pair = frozenset([primary_acc, secondary_acc])
                        if pair not in check_p_p_p:
                            # the end node is this one accession, not the whole list
                            self.protein_primary_protein.add_relationship(
                                {'sid': primary_acc}, {'sid': secondary_acc}, {}
                            )
                            check_p_p_p.add(pair)

                    # (Transcript)-[CODES]-(Protein)
                    # (Protein)-[MAPS]-(Protein)

                    ## RefSeq
                    # ('RefSeq', ['NP_003395']),
                    refseq_mappings = [x[1] for x in record['DR'] if x[0] == 'RefSeq']
                    for mapping in refseq_mappings:
                        for refseq_id in mapping:
                            # remove version from refseq ID
                            refseq_id = refseq_id.split('.')[0]
                            # NM_/XM_/NR_/XR_ are transcripts, NP_/XP_ are proteins
                            second_letter = refseq_id[1]

                            # (Transcript)-[CODES]-(Protein)
                            if second_letter in ('M', 'R'):
                                for uniprot_acc in acc_list:
                                    key = (refseq_id, uniprot_acc)
                                    if key not in check_t_c_p:
                                        self.transcript_codes_protein.add_relationship(
                                            {'sid': refseq_id}, {'sid': uniprot_acc}, {'source': datasource_name}
                                        )
                                        check_t_c_p.add(key)

                            # (Protein)-[MAPS]-(Protein)
                            if second_letter == 'P':
                                for uniprot_acc in acc_list:
                                    key = (uniprot_acc, refseq_id)
                                    if key not in check_p_m_p:
                                        self.protein_maps_protein.add_relationship(
                                            {'sid': uniprot_acc}, {'sid': refseq_id}, {}
                                        )
                                        check_p_m_p.add(key)

                    ## ensembl
                    ensembl_mappings = [x[1] for x in record['DR'] if x[0] == 'Ensembl']
                    for mapping in ensembl_mappings:
                        ensembl_transcript_id = mapping[0]
                        ensembl_protein_id = mapping[1]

                        for uniprot_acc in acc_list:
                            # (Transcript)-[CODES]-(Protein)
                            key = (ensembl_transcript_id, uniprot_acc)
                            if key not in check_t_c_p:
                                self.transcript_codes_protein.add_relationship(
                                    {'sid': ensembl_transcript_id}, {'sid': uniprot_acc}, {}
                                )
                                check_t_c_p.add(key)

                            # (Protein)-[MAPS]-(Protein)
                            key = (uniprot_acc, ensembl_protein_id)
                            if key not in check_p_m_p:
                                self.protein_maps_protein.add_relationship(
                                    {'sid': uniprot_acc}, {'sid': ensembl_protein_id}, {}
                                )
                                check_p_m_p.add(key)
class RefseqCodesParser(ReturnParser):
    """
    Get mappings from NCBI Gene to Refseq transcripts.

    Refseq provides a mapping file that contains a gene-transcript-protein
    mapping per line (e.g. release86.accession2geneid.gz).

    Example line:

        TaxID, NCBI Gene ID, RefSeq transcript ID, RefSeq protein ID
        9606   100008586     NM_001098405.2        NP_001091875.1

    The parser fills (Gene)-[CODES]->(Transcript) and
    (Transcript)-[CODES]->(Protein) RelationshipSets. Accession versions
    are removed.
    """

    def __init__(self):
        """Declare the 'taxid' argument and create the output RelationshipSets."""
        super(RefseqCodesParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define NodeSet and RelationshipSet
        self.gene_codes_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
                                                     default_props={'source': 'refseq'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
                                                        default_props={'source': 'refseq'})

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Parse the accession2geneid file for one taxon.

        :param taxid: Taxonomy ID as string; compared to the first column and
                      stored as 'taxid' property on every relationship.
        """
        refseq_instance = self.get_instance_by_name('Refseq')
        refseq_accession2geneid_file = refseq_instance.datasource.get_accession2geneid_file_path(refseq_instance)

        # check sets to avoid duplicates
        check_g_t_rels = set()
        check_t_p_rels = set()

        with gzip.open(refseq_accession2geneid_file, 'rt') as f:
            for line in f:
                flds = line.strip().split('\t')

                # lines of other taxa are skipped before any further parsing
                if flds[0] != taxid:
                    continue

                gene_id = flds[1]
                transcript_id = flds[2].split('.')[0]
                protein_id = flds[3].split('.')[0]

                # gene-transcript or transcript-protein pairs can be duplicate
                # if e.g. a gene has one transcript which gives rise to two proteins
                # we thus check for each pair if it was added already
                gene_transcript = (gene_id, transcript_id)
                if gene_transcript not in check_g_t_rels:
                    self.gene_codes_transcript.add_relationship(
                        {'sid': gene_id}, {'sid': transcript_id}, {'taxid': taxid}
                    )
                    check_g_t_rels.add(gene_transcript)

                # the gene/transcript relationship is mostly clear
                # but often there are no proteins associated
                if protein_id != 'na':
                    transcript_protein = (transcript_id, protein_id)
                    if transcript_protein not in check_t_p_rels:
                        self.transcript_codes_protein.add_relationship(
                            {'sid': transcript_id}, {'sid': protein_id}, {'taxid': taxid}
                        )
                        check_t_p_rels.add(transcript_protein)
class RefseqRemovedRecordsParser(ReturnParser):
    """
    Parse all removed records from the removed_records files.

    The mappings to gene IDs are in the accession2geneid of the *previous* release. Simple approach is to
    collect all files from all previous releases and collect all records (relationships are filtered locally).
    """

    # accession prefixes of transcript and protein records
    _TRANSCRIPT_PREFIXES = ('NM', 'NR', 'XM', 'XR')
    _PROTEIN_PREFIXES = ('NP', 'XP')

    def __init__(self):
        """Declare the 'taxid' argument and create the output sets."""
        super(RefseqRemovedRecordsParser, self).__init__()

        self.arguments = ['taxid']

        # accessions of all legacy transcripts and proteins found so far
        self.legacy_ids = set()

        self.legacy_transcripts = NodeSet(['Transcript', 'Legacy'], merge_keys=['sid'],
                                          default_props={'source': 'refseq'})
        self.legacy_transcript_now_transcript = RelationshipSet('REPLACED_BY', ['Transcript'], ['Transcript'],
                                                                ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.legacy_proteins = NodeSet(['Protein', 'Legacy'], merge_keys=['sid'], default_props={'source': 'refseq'})
        self.legacy_protein_now_protein = RelationshipSet('REPLACED_BY', ['Protein'], ['Protein'], ['sid'], ['sid'],
                                                          default_props={'source': 'refseq'})
        self.gene_codes_legacy_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'],
                                                            ['sid'], default_props={'source': 'refseq'})
        self.legacy_transcript_codes_protein = RelationshipSet('CODES', ['Transcript', 'Legacy'], ['Protein'],
                                                               ['sid'], ['sid'], default_props={'source': 'refseq'})

    def run_with_mounted_arguments(self):
        """Run with self.taxid (presumably set by the framework - confirm)."""
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect legacy nodes first, then their relationships.

        :param taxid: Taxonomy ID as string.
        """
        # get the nodes first, this also creates a set of all legacy IDs
        self.get_legacy_nodes(taxid)

        # then get the relationships to gene IDs, this uses the set of legacy IDs to not recreate existing relationships
        self.get_legacy_gene_rels(taxid)

    def get_legacy_nodes(self, taxid):
        """
        Create the legacy Transcript / Protein nodes and their REPLACED_BY relationships.

        ==========================================
        release#.removed-records
        ==========================================
        Content: Tab-delimited report of records that were included in the previous
                 release but are not included in the current release.

        Columns:
          1. taxonomy ID
          2. species name
          3. accession.version
          4. refseq release directory accession is included in
               complete + other directories
               '|' delimited
          5. refseq status
               na - not available; status codes are not applied to most genomic records
               INFERRED
               PREDICTED
               PROVISIONAL
               VALIDATED
               REVIEWED
               MODEL
               UNKNOWN - status code not provided; however usually is provided for
                         this type of record
          6. length
          7. removed status
               dead protein: protein was removed when genomic record was reloaded
                             and protein was not found on the nucleotide update.
                             This is an implied permanent suppress.
               temporarily suppressed: record was temporarily removed and may be
                             restored at a later date.
               permanently suppressed: record was permanently removed. It is possible
                             to restore this type of record however at the time of
                             removal that action is not anticipated.
               replaced by accession: the accession in column 3 has become
                             a secondary accession that cited in column 8.

        Every accession is handled once; its ID is stored in self.legacy_ids.

        :param taxid: Taxonomy ID as string; compared to the first column.
        :return: None
        """
        refseq_instance = self.get_instance_by_name('Refseq')

        removed_records_files = refseq_instance.find_files(lambda x: 'removed-records' in x and x.endswith('.gz'))

        for file in removed_records_files:
            log.debug(f"Parse {file}")
            # 'release86.removed-records.gz' -> '86'
            release = file.split('/')[-1].split('.')[0].replace('release', '')

            with gzip.open(file, 'rt') as f:
                for line in f:
                    flds = line.strip().split('\t')
                    if flds[0] != taxid:
                        continue

                    refseq_acc, version = flds[2].split('.')
                    # the last column holds the removal status / replacement
                    reason = flds[-1]

                    # the two prefix groups do not overlap
                    if refseq_acc.startswith(self._TRANSCRIPT_PREFIXES):
                        nodes = self.legacy_transcripts
                        replaced_by = self.legacy_transcript_now_transcript
                    elif refseq_acc.startswith(self._PROTEIN_PREFIXES):
                        nodes = self.legacy_proteins
                        replaced_by = self.legacy_protein_now_protein
                    else:
                        continue

                    if refseq_acc in self.legacy_ids:
                        continue

                    nodes.add_node(
                        {'sid': refseq_acc, 'version': version, 'status': 'removed', 'removed_in': release,
                         'reason': reason, 'taxid': taxid}
                    )
                    self.legacy_ids.add(refseq_acc)

                    if 'replaced by' in reason:
                        # 'replaced by NM_022375.2' -> 'NM_022375'
                        # (same format for protein accessions)
                        new_accession = (reason.rsplit(' ', 1)[1]).split('.')[0]
                        replaced_by.add_relationship(
                            {'sid': refseq_acc}, {'sid': new_accession}, {}
                        )

    def get_legacy_gene_rels(self, taxid):
        """
        Get the gene/protein relationships for the legacy Transcripts.

        ==========================================
        release#.accession2geneid
        ==========================================
        Content: Report of GeneIDs available at the time of the RefSeq release.
                 Limited to GeneIDs that are associated with RNA or mRNA records with
                 accession prefix N[M|R] and X[M|R].

        Columns (tab delimited):

            1: Taxonomic ID
            2: Entrez GeneID
            3: Transcript accession.version
            4: Protein accession.version
               na if no data
               --for example, the NR_ accession prefix is used for RNA
                 so there is no corresponding protein record

        Only transcripts whose accession is in self.legacy_ids are used, so
        get_legacy_nodes() has to run first.

        :param taxid: Taxonomy ID as string; compared to the first column.
        :return: None
        """
        log.debug("Get relationships from legacy RefSeq IDs to genes.")
        refseq_instance = self.get_instance_by_name('Refseq')

        archived_accession2geneid = refseq_instance.find_files(
            lambda x: 'accession2geneid' in x and x.endswith('.gz'))

        # pairs added already; shared by both relationship types
        check_set = set()

        for file in archived_accession2geneid:
            log.debug(f"Parse {file}")
            with gzip.open(file, 'rt') as f:
                for line in f:
                    flds = line.strip().split('\t')
                    if flds[0] != taxid:
                        continue

                    gene_id = flds[1].strip()
                    transcript_accession = flds[2].strip().split('.')[0]
                    protein_accession = flds[3].strip().split('.')[0]

                    if transcript_accession not in self.legacy_ids:
                        continue

                    gene_transcript = (gene_id, transcript_accession)
                    if gene_transcript not in check_set:
                        self.gene_codes_legacy_transcript.add_relationship(
                            {'sid': gene_id}, {'sid': transcript_accession}, {}
                        )
                        check_set.add(gene_transcript)

                    # column 4 is 'na' when the transcript has no protein; it
                    # is the protein column that has to be tested here
                    if protein_accession != 'na':
                        transcript_protein = (transcript_accession, protein_accession)
                        if transcript_protein not in check_set:
                            self.legacy_transcript_codes_protein.add_relationship(
                                {'sid': transcript_accession}, {'sid': protein_accession}, {}
                            )
                            check_set.add(transcript_protein)
class GeneOntologyAssociationParser(ReturnParser):
    """
    Parse GeneOntology associations from the official UniProt association files.

    There are three different files available:

    - goa_uniprot_all.gaf.gz
    - goa_uniprot_all.gpa.gz
    - goa_uniprot_all.gpi.gz

    The GPA (Gene Product Association) file contains one gene product - GO Term
    tuple per line. There is additional information about the gene products in
    the GPI (Gene Product Information) file which augments the GPA file.

    The GAF file merges GPA and GPI (by adding the gene product information to
    each line with an association) and thus contains a lot of redundant
    information on the gene product.

    The GAF file is parsed for the mappings because it contains the mapping as
    well as the taxonomy ID. It is easier to iterate one file instead of
    generating a gene product - taxonomy ID mapping from the GPI file and then
    read the GPA file.

    More information from the header of the GPA file:

        !This file contains all GO annotations for proteins in the UniProt KnowledgeBase (UniProtKB).
        !
        !It also contains all annotations for protein complexes, identified by ComplexPortal identifiers,
        !and for non-coding RNAs, identified by RNAcentral identifiers
        !
        !Columns:
        !
        !   name                  required? cardinality   GAF column #
        !   DB                    required  1             1
        !   DB_Object_ID          required  1             2 / 17
        !   Qualifier             required  1 or greater  4
        !   GO ID                 required  1             5
        !   DB:Reference(s)       required  1 or greater  6
        !   ECO evidence code     required  1             7 (GO evidence code)
        !   With                  optional  0 or greater  8
        !   Interacting taxon ID  optional  0 or 1        13
        !   Date                  required  1             14
        !   Assigned_by           required  1             15
        !   Annotation Extension  optional  0 or greater  16
        !   Annotation Properties optional  0 or 1        n/a

    And from the header of the GPI file:

        !This file contains additional information for proteins in the UniProt KnowledgeBase (UniProtKB).
        !Protein accessions are represented in this file even if there is no associated GO annotation.
        !
        !Columns:
        !
        !   name                  required? cardinality   GAF column #  Example content
        !   DB                    required  1             1             UniProtKB
        !   DB_Object_ID          required  1             2/17          Q4VCS5-1
        !   DB_Object_Symbol      required  1             3             AMOT
        !   DB_Object_Name        optional  0 or greater  10            Angiomotin
        !   DB_Object_Synonym(s)  optional  0 or greater  11            AMOT|KIAA1071
        !   DB_Object_Type        required  1             12            protein
        !   Taxon                 required  1             13            taxon:9606
        !   Parent_Object_ID      optional  0 or 1        -             UniProtKB:Q4VCS5
        !   DB_Xref(s)            optional  0 or greater  -             WB:WBGene00000035
        !   Properties            optional  0 or greater  -             db_subset=Swiss-Prot|target_set=KRUK,BHFL
    """

    def __init__(self):
        super(GeneOntologyAssociationParser, self).__init__()

        self.arguments = ['taxid']

        # RelationshipSets
        self.protein_associates_goterm = RelationshipSet(
            'ASSOCIATION', ['Protein'], ['Term'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, ref_taxid):
        """
        Select the GAF file for the taxonomy ID and parse it.

        The organism specific file is used when the taxonomy ID is listed in
        TAXID_2_ORG_FILE_NAME, otherwise the complete UniProt file is parsed.

        :param ref_taxid: Taxonomy ID to collect associations for.
        """
        go_instance = self.get_instance_by_name('GeneOntology')

        log.debug("Run for {}".format(ref_taxid))

        if ref_taxid in TAXID_2_ORG_FILE_NAME:
            gaf_file_name = 'goa_{0}.gaf.gz'.format(TAXID_2_ORG_FILE_NAME[ref_taxid])
        else:
            gaf_file_name = 'goa_uniprot_all.gaf.gz'

        self.parse_goa_uniprot_gaf_file(go_instance.get_file(gaf_file_name), ref_taxid)

    def parse_goa_uniprot_gaf_file(self, goa_uniprot_gaf_file, ref_taxid):
        """
        Add one (Protein)-[ASSOCIATION]->(Term) relationship per matching GAF line.

        Only lines with DB 'UniProtKB' whose taxon equals `ref_taxid` are used.
        Header lines (starting with '!') and lines without a parsable taxon
        column are skipped.

        :param goa_uniprot_gaf_file: Path to the gzipped GAF file.
        :param ref_taxid: Taxonomy ID to filter for (compared as string).
        """
        # the taxon in the file is text, make sure an integer argument still matches
        ref_taxid = str(ref_taxid)

        with gzip.open(goa_uniprot_gaf_file, 'rt') as f:
            for line in f:
                if line.startswith('!'):
                    continue

                flds = line.strip().split('\t')
                # the taxon is column 13, shorter lines cannot be filtered
                if len(flds) < 13 or flds[0] != 'UniProtKB':
                    continue

                # The taxon column holds either 'taxon:9606' or, for multi-organism
                # annotations, 'taxon:9606|taxon:10090'. The first entry is the
                # organism of the gene product, the second the interacting organism.
                taxid = flds[12].split('|')[0].partition(':')[2]
                if taxid != ref_taxid:
                    continue

                rel_properties = {'evidence': flds[6]}
                qualifier = flds[3]
                if qualifier:
                    rel_properties['qualifier'] = qualifier

                self.protein_associates_goterm.add_relationship(
                    {'sid': flds[1]}, {'sid': flds[4]}, rel_properties)
class GtexMetadataParser(ReturnParser):
    """
    Parse the GTEx sample attribute table.

    Creates one GtexSample node per row, unique GtexTissue and
    GtexDetailedTissue nodes, and the relationships between them.
    """

    def __init__(self):
        super(GtexMetadataParser, self).__init__()

        # NodeSets
        self.tissues = NodeSet(['GtexTissue'], merge_keys=['name'])
        self.detailed_tissues = NodeSet(['GtexDetailedTissue'], merge_keys=['name'])
        self.sample = NodeSet(['GtexSample'], merge_keys=['sid'])

        # RelationshipSets
        self.sample_measures_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexTissue'], ['sid'], ['name'])
        self.sample_measures_detailed_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'], ['name'])
        self.tissue_parent_detailed_tissue = RelationshipSet(
            'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'], ['name'])
        self.tissue_parent_detailed_tissue.unique = True

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        """Read the sample attribute file and fill all node and relationship sets."""
        gtex_instance = self.get_instance_by_name('Gtex')
        attribute_file = gtex_instance.get_file(
            'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt')

        samples = pandas.read_csv(attribute_file, sep='\t', header=0,
                                  index_col=False, encoding="utf-8-sig")

        # columns copied one-to-one to the sample node, in this order
        attribute_columns = (
            'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',
            'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT',
            'SMNABTCHD', 'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE',
            'SMGTC', 'SME2MPRT', 'SMCHMPRS', 'SMNTRART', 'SMNUMGPS', 'SMMAPRT',
            'SMEXNCRT', 'SM550NRM', 'SMGNSDTC', 'SMUNMPRT', 'SM350NRM',
            'SMRDLGTH', 'SMMNCPB', 'SME1MMRT', 'SMSFLGTH', 'SMESTLBS', 'SMMPPD',
            'SMNTERRT', 'SMRRNANM', 'SMRDTTL', 'SMVQCFL', 'SMMNCV', 'SMTRSCPT',
            'SMMPPDPR', 'SMCGLGTH', 'SMGAPPCT', 'SMUNPDRD', 'SMNTRNRT',
            'SMMPUNRT', 'SMEXPEFF', 'SMMPPDUN', 'SME2MMRT', 'SME2ANTI',
            'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD',
            'SMBSMMRT', 'SME1SNSE', 'SME1PCTS', 'SMRRNART', 'SME1MPRT',
            'SMNUM5CD', 'SMDPMPRT', 'SME2PCTS',
        )

        for record in samples.itertuples():
            sample_id = record.SAMPID
            tissue = record.SMTS
            detailed_tissue = record.SMTSD

            node_props = {'sid': sample_id}
            for column in attribute_columns:
                node_props[column] = getattr(record, column)

            self.sample.add_node(node_props)
            self.tissues.add_unique({'name': tissue})
            self.detailed_tissues.add_unique({'name': detailed_tissue})

            sample_key = {'sid': sample_id}
            self.sample_measures_tissue.add_relationship(
                sample_key, {'name': tissue}, {})
            self.sample_measures_detailed_tissue.add_relationship(
                {'sid': sample_id}, {'name': detailed_tissue}, {})
            self.tissue_parent_detailed_tissue.add_relationship(
                {'name': tissue}, {'name': detailed_tissue}, {})
# create nodes and relationships # add NCBI gene node ncbi_gene_nodes.add_node({'gene_id': ncbi_gene_id, 'db': 'ncbi'}) # add ENSEMBL gene nodes if they not exist already for ensembl_gene_id in mapped_ensembl_gene_ids: if ensembl_gene_id not in ensembl_gene_ids_added: ensembl_gene_nodes.add_node({ 'gene_id': ensembl_gene_id, 'db': 'ensembl' }) ensembl_gene_ids_added.add(ensembl_gene_id) # add (:Gene)-[:MAPS]->(:Gene) relationship for ensembl_gene_id in mapped_ensembl_gene_ids: gene_mapping_rels.add_relationship({'gene_id': ncbi_gene_id}, {'gene_id': ensembl_gene_id}, {'db': 'ncbi'}) # load data to Neo4j print(len(ncbi_gene_nodes.nodes)) print(len(ensembl_gene_nodes.nodes)) print(len(gene_mapping_rels.relationships)) # create index for property 'gene_id' on (Gene) nodes first print('Create index on Gene nodes') try: graph.schema.create_index('Gene', 'gene_id') except py2neo.database.ClientError: pass # load data, first nodes then relationships
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    Old format (until 03-21-2020):
    Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude

    New format:
    FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key

    The format is detected from the number of header columns: more than 8
    columns means the new format. A file without a header line yields empty
    sets; blank rows are skipped.

    :param file: Path to the CSV file
    :return: Tuple (countries, provinces, updates, province_in_country, province_rep_update)
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'], ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'], ['DailyReport'], ['name'], ['uuid'])

    # the csv module requires newline='' so that it can handle line endings
    # (including newlines inside quoted fields) itself
    with open(file, 'rt', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')

        # the header decides which row layout is used
        header = next(rows, None)
        if header is None:
            log.info("File {} is empty".format(file))
            return countries, provinces, updates, province_in_country, province_rep_update

        if len(header) > 8:
            file_type = 'new'
            parse_row = parse_jhu_new_file_row
        else:
            file_type = 'old'
            parse_row = parse_jhu_old_file_row
        log.info("File type: {}".format(file_type))

        for row in rows:
            # csv.reader returns an empty list for blank lines
            if not row:
                continue

            country, province, date, confirmed, death, recovered, lat, lon = parse_row(row)

            province_dict = {'name': province}
            if lat and lon:
                province_dict['latitude'] = lat
                province_dict['longitude'] = lon

            uuid = country + province + str(date)

            provinces.add_unique(province_dict)
            countries.add_unique({'name': country})
            updates.add_unique({
                'date': date,
                'confirmed': confirmed,
                'death': death,
                'recovered': recovered,
                'uuid': uuid
            })

            province_in_country.add_relationship({'name': province}, {'name': country}, {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province}, {'uuid': uuid}, {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
class LncipediaParser(ReturnParser):
    """
    Parse Lncipedia GFF file.

    'lnc_RNA' entries contain gene and transcript IDs as well as mappings to ENSEMBL
    'exon' entries don't have different IDs, they reuse the gene/transcript IDs from their parent 'lnc_RNA' entries

    chr16 lncipedia.org lnc_RNA 52005479 52026435 . - . ID=lnc-TOX3-1:20;gene_id=lnc-TOX3-1;transcript_id=lnc-TOX3-1:20;gene_alias_1=XLOC_011939;gene_alias_2=linc-SALL1-6;transcript_alias_1=TCONS_00025002;transcript_alias_2=NONHSAT142490;
    chr10 lncipedia.org exon 8052243 8052735 . - . Parent=GATA3-AS1:5;gene_id=GATA3-AS1;transcript_id=GATA3-AS1:5;gene_alias_1=XLOC_008724;gene_alias_2=linc-KIN-5;gene_alias_3=ENSG00000243350;gene_alias_4=RP11-379F12.3;gene_alias_5=ENSG00000243350.1;gene_alias_6=OTTHUMG00000017641.1;gene_alias_7=ENSG00000197308.9;gene_alias_8=GATA3-AS1;transcript_alias_1=TCONS_00017730;transcript_alias_2=ENST00000458727;transcript_alias_3=ENST00000458727.1;transcript_alias_4=RP11-379F12.3-001;transcript_alias_5=OTTHUMT00000046722.1;transcript_alias_6=NONHSAT011314;transcript_alias_7=NR_104327;transcript_alias_8=NR_104327.1;
    """

    def __init__(self):
        super(LncipediaParser, self).__init__()

        # NodeSets
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])

        # RelationshipSets
        self.gene_codes_transcripts = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.transcript_maps_transcript = RelationshipSet('MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run()

    @staticmethod
    def _add_alias_mapping(relationship_set, seen_pairs, sid, alias, source):
        """
        Add a MAPS relationship from `sid` to the unversioned `alias` once per ID pair.

        The part of the alias after the first '.' is dropped. No relationship is
        created when the alias equals `sid`. Pairs are tracked without direction.
        """
        ref_id = alias.split('.')[0]
        # don't create MAPS relationship if same name like mapped entity
        if sid == ref_id:
            return

        pair = frozenset((sid, ref_id))
        if pair in seen_pairs:
            return

        relationship_set.add_relationship({'sid': sid}, {'sid': ref_id}, {'source': source})
        seen_pairs.add(pair)

    def run(self):
        """
        Collect genes, transcripts and their alias mappings from all 'lnc_RNA' records.

        Every node and relationship is added only once. Each kind of entity has
        its own set of already seen IDs so that equal IDs of different kinds
        (e.g. a transcript named like a gene) cannot hide each other.
        """
        log.debug(f"Run {self.__class__.__name__}")

        lncipedia_instance = self.get_instance_by_name('Lncipedia')
        lncipedia_datasource_name = lncipedia_instance.datasource.name

        gff_file = lncipedia_instance.get_file('lncipedia_5_2_hg38.gff')
        annotation = GffReader(gff_file)

        seen_genes = set()
        seen_transcripts = set()
        seen_gene_transcript = set()
        seen_gene_maps = set()
        seen_transcript_maps = set()

        for r in annotation.records:
            if r.type != 'lnc_RNA':
                continue

            # create gene
            gene_id = r.attributes['gene_id']
            if gene_id not in seen_genes:
                self.genes.add_node({'sid': gene_id, 'source': lncipedia_datasource_name})
                seen_genes.add(gene_id)

            # create transcript
            transcript_id = r.attributes['transcript_id']
            if transcript_id not in seen_transcripts:
                self.transcripts.add_node({'sid': transcript_id, 'source': lncipedia_datasource_name})
                seen_transcripts.add(transcript_id)

            # (Gene)-[CODES]->(Transcript)
            if (gene_id, transcript_id) not in seen_gene_transcript:
                self.gene_codes_transcripts.add_relationship(
                    {'sid': gene_id}, {'sid': transcript_id}, {}
                )
                seen_gene_transcript.add((gene_id, transcript_id))

            # alias attributes are numbered: gene_alias_1, transcript_alias_2, ...
            for k, v in r.attributes.items():
                if k.startswith('gene_alias'):
                    self._add_alias_mapping(
                        self.gene_maps_gene, seen_gene_maps,
                        gene_id, v, lncipedia_datasource_name)
                elif k.startswith('transcript_alias'):
                    self._add_alias_mapping(
                        self.transcript_maps_transcript, seen_transcript_maps,
                        transcript_id, v, lncipedia_datasource_name)
class HmdbParser(ReturnParser):
    """Parse metabolites, their ChEBI mapping and protein associations from the HMDB XML file."""

    # XML namespace of all elements in the HMDB file
    _NAMESPACE = '{http://www.hmdb.ca}'

    # child elements of a metabolite that are copied to the node (besides the accession)
    _PROPERTY_TAGS = (
        'name', 'chebi_id', 'chemspider_id', 'cs_description', 'description',
        'chemical_formula', 'average_molecular_weight', 'iupac_name',
        'cas_registry_number', 'smiles', 'inchi', 'kegg_id',
    )

    def __init__(self):
        super(HmdbParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'hmdb'})

        # RelationshipSets
        self.metabolite_map_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
        self.metabolite_associates_protein = RelationshipSet(
            'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})

    def run_with_mounted_arguments(self):
        self.run()

    def run(self, *args, **kwargs):
        """
        Read 'hmdb_metabolites.xml' and fill the node and relationship sets.

        For every metabolite element a Metabolite node is created with the
        accession as 'sid'. A MAPS relationship is added when a ChEBI ID exists
        and a HAS_ASSOCIATION relationship for every associated protein with a
        UniProt ID. Elements without accession are skipped.
        """
        ns = self._NAMESPACE

        hmdb_instance = self.get_instance_by_name('Hmdb')
        all_metabolites_file = hmdb_instance.get_file('hmdb_metabolites.xml')
        all_metabolites = etree.parse(all_metabolites_file)

        for metabolite in all_metabolites.getroot():
            sid = metabolite.findtext(ns + 'accession')
            if not sid:
                # without the merge key neither node nor relationships are usable
                log.debug("Skip metabolite element without accession")
                continue

            # missing elements are stored as None
            # TODO filter empty properties
            metabolite_properties = {'sid': sid}
            for tag in self._PROPERTY_TAGS:
                metabolite_properties[tag] = metabolite.findtext(ns + tag)

            self.metabolites.add_node(metabolite_properties)

            # add mapping to Chebi
            chebi_id = metabolite_properties['chebi_id']
            if chebi_id:
                self.metabolite_map_metabolite.add_relationship(
                    {'sid': sid}, {'sid': chebi_id}, {})

            # add association to Proteins, the list element is not always present
            protein_associations = metabolite.find(ns + 'protein_associations')
            if protein_associations is None:
                continue

            for protein in protein_associations:
                uniprot_id = protein.findtext(ns + 'uniprot_id')
                if uniprot_id:
                    self.metabolite_associates_protein.add_relationship(
                        {'sid': sid}, {'sid': uniprot_id}, {})
class MeshParser(ReturnParser):
    """Parse descriptors, qualifiers, concepts and terms from the MeSH descriptor XML file."""

    def __init__(self):
        super(MeshParser, self).__init__()

        # NodeSets
        self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
        self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
        self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
        self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

        # RelationshipSets
        self.descriptor_allowed_qualifier = RelationshipSet(
            'ALLOWED', ['MeshDescriptor'], ['MeshQualifier'], ['sid'], ['sid'])

        self.descriptor_has_concept = RelationshipSet(
            'HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
        self.descriptor_has_concept.unique = True

        self.concept_has_term = RelationshipSet(
            'HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
        self.concept_has_term.unique = True

        self.concept_related_concept = RelationshipSet(
            'RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
        self.concept_related_concept.unique = True

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        self.parse_xml()

    def parse_xml(self):
        """
        Parse descriptor XML file.

        The file name is built from the version of the Mesh instance
        ('desc<version>.xml'). Every child of the root element is handled as
        one descriptor record. Qualifier, concept and term nodes are created
        only the first time their UI is seen.

        Parts of a descriptor record that are read::

            <DescriptorUI>D000001</DescriptorUI>
            <DescriptorName>
              <String>Calcimycin</String>
            </DescriptorName>
            <AllowableQualifiersList>
              <AllowableQualifier>
                <QualifierReferredTo>
                  <QualifierUI>Q000302</QualifierUI>
                  <QualifierName>
                    <String>isolation & purification</String>
                  </QualifierName>
                </QualifierReferredTo>
                <Abbreviation>IP</Abbreviation>
              </AllowableQualifier>
            </AllowableQualifiersList>
            <ConceptList>
              <Concept PreferredConceptYN="Y">
                <ConceptUI>M0000001</ConceptUI>
                <ConceptName>
                  <String>Calcimycin</String>
                </ConceptName>
                <RegistryNumber>37H9VM9WZL</RegistryNumber>
                <ScopeNote>An ionophorous, polyether antibiotic from Streptomyces chartreusensis. ...
                </ScopeNote>
                <ConceptRelationList>
                  <ConceptRelation RelationName="NRW">
                    <Concept1UI>M0000001</Concept1UI>
                    <Concept2UI>M0353609</Concept2UI>
                  </ConceptRelation>
                </ConceptRelationList>
                <TermList>
                  <Term ConceptPreferredTermYN="Y" IsPermutedTermYN="N" LexicalTag="NON" RecordPreferredTermYN="Y">
                    <TermUI>T000002</TermUI>
                    <String>Calcimycin</String>
                  </Term>
                </TermList>
              </Concept>
            </ConceptList>
        """
        mesh_instance = self.get_instance_by_name('Mesh')
        version = DataSourceVersion.version_from_string(
            mesh_instance.version
        )

        descriptor_xml = mesh_instance.get_file('desc{}.xml'.format(str(version)))
        log.debug("XML file {}".format(descriptor_xml))

        tree = ET.parse(descriptor_xml)
        root = tree.getroot()

        # UIs of nodes that were already added
        check_qualifier = set()
        check_concepts = set()
        check_terms = set()

        # Element.getchildren() was removed in Python 3.9, iterate the element itself
        for descriptor_record in root:
            descriptor_ui = descriptor_record.find('DescriptorUI').text
            descriptor_name = descriptor_record.find('DescriptorName/String').text

            self.descriptor.add_node({'sid': descriptor_ui, 'name': descriptor_name})

            self._add_allowed_qualifiers(descriptor_record, descriptor_ui, check_qualifier)
            self._add_concepts(descriptor_record, descriptor_ui, check_concepts, check_terms)

    def _add_allowed_qualifiers(self, descriptor_record, descriptor_ui, check_qualifier):
        """Add the allowed qualifiers of one descriptor and the (Descriptor)-[ALLOWED]->(Qualifier) relationships."""
        allowed_qualifiers = descriptor_record.findall(
            'AllowableQualifiersList/AllowableQualifier/QualifierReferredTo')

        for qualifier in allowed_qualifiers:
            qualifier_ui = qualifier.find('QualifierUI').text

            # add qualifier node if not exists
            if qualifier_ui not in check_qualifier:
                qualifier_name = qualifier.find('QualifierName/String').text
                self.qualifier.add_node({'sid': qualifier_ui, 'name': qualifier_name})
                check_qualifier.add(qualifier_ui)

            # add descriptor -> qualifier relationship
            self.descriptor_allowed_qualifier.add_relationship(
                {'sid': descriptor_ui}, {'sid': qualifier_ui}, {'source': 'mesh'}
            )

    def _add_concepts(self, descriptor_record, descriptor_ui, check_concepts, check_terms):
        """Add the concepts of one descriptor, their relations to other concepts and their terms."""
        for concept in descriptor_record.findall('ConceptList/Concept'):
            preferred_concept = concept.attrib['PreferredConceptYN']
            concept_ui = concept.find('ConceptUI').text

            # concept node if not exists
            if concept_ui not in check_concepts:
                concept_properties = {
                    'sid': concept_ui,
                    'name': concept.find('ConceptName/String').text,
                }
                # the scope note is optional
                scope_note = concept.find('ScopeNote')
                if scope_note is not None:
                    concept_properties['scope_note'] = scope_note.text

                self.concept.add_node(concept_properties)
                check_concepts.add(concept_ui)

            # (Descriptor)--(Concept) relation
            self.descriptor_has_concept.add_relationship(
                {'sid': descriptor_ui}, {'sid': concept_ui}, {'preferred': preferred_concept})

            # concept relations
            for concept_relation in concept.findall('ConceptRelationList/ConceptRelation'):
                left = concept_relation.find('Concept1UI').text
                right = concept_relation.find('Concept2UI').text
                name = concept_relation.attrib['RelationName']

                self.concept_related_concept.add_relationship(
                    {'sid': left}, {'sid': right}, {'name': name})

            # iterate Terms for concept
            for term in concept.findall('TermList/Term'):
                term_ui = term.find('TermUI').text
                concept_preferred_term = term.attrib['ConceptPreferredTermYN']

                # Term node if not exists
                if term_ui not in check_terms:
                    term_name = term.find('String').text
                    self.term.add_node({'sid': term_ui, 'name': term_name})
                    check_terms.add(term_ui)

                # (Concept)--(Term)
                self.concept_has_term.add_relationship(
                    {'sid': concept_ui}, {'sid': term_ui}, {'preferred': concept_preferred_term})
class EnsemblMappingParser(ReturnParser):
    """
    Get mappings from ENSEMBL IDs to other databases.

    ENSEMBL dumps common mapping data to files in the `tsv` directory.

    ### Transcripts
    Extract (Transcript {ensembl})-[MAPS]-(Transcript {refseq}) mappings from ENSEMBL.

    Mappings to NCBI Gene are from:
        Homo_sapiens.GRCh38.91.refseq.tsv.gz

    Example:
        gene_stable_id|transcript_stable_id|protein_stable_id|xref|db_name|info_type|source_identity|xref_identity|linkage_type
        ENSG00000223972 ENST00000456328 - 102725121 EntrezGene DEPENDENT - - -

    ### Genes
    Extract (Gene {ensembl})-[MAPS]-(Gene {ncbigene}) mappings from ENSEMBL.

    Mappings to NCBI Gene are from:
        Homo_sapiens.GRCh38.91.entrez.tsv.gz

    Example:
        gene_stable_id|transcript_stable_id|protein_stable_id|xref|db_name|info_type|source_identity|xref_identity|linkage_type
        ENSG00000223972 ENST00000456328 - 102725121 EntrezGene DEPENDENT - - -

    ### Proteins
    Extract (Protein {ensembl})-[MAPS]-(Protein {refseq}) mappings from ENSEMBL.

    Mappings to NCBI Gene are from:
        Homo_sapiens.GRCh38.91.uniprot.tsv.gz

    Example:
        gene_stable_id transcript_stable_id protein_stable_id xref db_name info_type source_identity xref_identity linkage_type
        ENSG00000186092 ENST00000335137 ENSP00000334393 Q8NH21 Uniprot/SWISSPROT DIRECT 100 100 -
    """

    def __init__(self):
        super(EnsemblMappingParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define NodeSet and RelationshipSet
        self.gene_maps_gene = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_maps_transcript = RelationshipSet(
            'MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.protein_maps_protein = RelationshipSet(
            'MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})

    # define properties that are used in multiple parsing functions
    @property
    def ensembl_instance(self):
        return self.get_instance_by_name('Ensembl')

    @property
    def datasource_name(self):
        return self.ensembl_instance.datasource.name

    @staticmethod
    def _iter_mapping_rows(tsv_file_path):
        """
        Yield the whitespace separated fields of every data row of a gzipped mapping file.

        The first line (header) is skipped. Rows with fewer than four fields
        are skipped because the xref column (index 3) is missing.
        """
        with gzip.open(tsv_file_path, 'rt') as f:
            # skip header, tolerate an empty file
            next(f, None)
            for line in f:
                flds = line.strip().split()
                if len(flds) >= 4:
                    yield flds

    @staticmethod
    def _add_once(relationship_set, check_rels, start_sid, end_sid, properties):
        """Add the relationship unless the (undirected) ID pair was already added."""
        pair = frozenset([start_sid, end_sid])
        if pair not in check_rels:
            relationship_set.add_relationship(
                {'sid': start_sid}, {'sid': end_sid}, properties)
            check_rels.add(pair)

    def run_gene(self, taxid):
        """Add (Gene {ensembl})-[MAPS]->(Gene {ncbi}) relationships from the 'entrez' file."""
        ensembl_tsv_entrez_file_path = Ensembl.get_tsv_file_path(
            taxid, 'entrez', self.ensembl_instance)
        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_entrez_file_path))

        check_rels = set()
        for flds in self._iter_mapping_rows(ensembl_tsv_entrez_file_path):
            ensembl_gene_id = flds[0]
            ncbi_gene_id = flds[3]
            self._add_once(self.gene_maps_gene, check_rels,
                           ensembl_gene_id, ncbi_gene_id, {})

    def run_transcript(self, taxid):
        """
        Add (Transcript {ensembl})-[MAPS]->(Transcript {refseq}) relationships from the 'refseq' file.

        Only xrefs whose second letter is 'M' or 'R' are used.
        """
        ensembl_tsv_refseq_file_path = Ensembl.get_tsv_file_path(
            taxid, 'refseq', self.ensembl_instance)
        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_refseq_file_path))

        check_rels = set()
        for flds in self._iter_mapping_rows(ensembl_tsv_refseq_file_path):
            ensembl_transcript_id = flds[1]
            xref_id = flds[3]
            # filter transcripts, the slice is empty for one-letter IDs
            if xref_id[1:2] in ('M', 'R'):
                self._add_once(self.transcript_maps_transcript, check_rels,
                               ensembl_transcript_id, xref_id, {})

    def run_protein(self, taxid):
        """
        Add (Protein {ensembl})-[MAPS]->(Protein {uniprot}) relationships from the 'uniprot' file.

        The relationships carry `taxid` as property.
        """
        ensembl_tsv_uniprot_file_path = Ensembl.get_tsv_file_path(
            taxid, 'uniprot', self.ensembl_instance)
        log.debug(
            'Ensembl TSV file path: {}'.format(ensembl_tsv_uniprot_file_path))

        check_rels = set()
        for flds in self._iter_mapping_rows(ensembl_tsv_uniprot_file_path):
            ensembl_protein_id = flds[2]
            xref_id = flds[3]
            # use the argument, self.taxid only exists after the arguments were mounted
            self._add_once(self.protein_maps_protein, check_rels,
                           ensembl_protein_id, xref_id, {'taxid': taxid})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        self.run_gene(taxid)
        self.run_transcript(taxid)
        self.run_protein(taxid)
class SwissLipidsParser(ReturnParser):
    """Parse lipids, their hierarchy, metabolite mappings and protein associations from SwissLipids."""

    # characters of the column names that are replaced by '_' for the property keys
    _CYPHER_UNSAFE = str.maketrans(dict.fromkeys(' []()*/', '_'))

    def __init__(self):
        super(SwissLipidsParser, self).__init__()

        # define NodeSet and RelationshipSet
        self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

        self.lipid_fromclass_lipid = RelationshipSet('FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_parent_lipid = RelationshipSet('HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_component_lipid = RelationshipSet('HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_maps_metabolite = RelationshipSet('MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
        self.lipid_associates_protein = RelationshipSet('HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        swisslipids_instance = self.get_instance_by_name('SwissLipids')

        self.get_lipids(swisslipids_instance)
        self.get_lipid_to_protein(swisslipids_instance)

    @staticmethod
    def _field(flds, index):
        """Return the stripped field at `index` or '' if the row has no such field."""
        if index < len(flds):
            return flds[index].strip()
        return ''

    def get_lipids(self, instance):
        r"""
        Parse 'lipids.tsv.gz': Lipid nodes, class/parent/component relationships, metabolite mappings.

        Example rows (header and two records):

        Lipid ID Level Name Abbreviation* Synonyms* Lipid class* Parent Components* SMILES (pH7.3) InChI (pH7.3) InChI key (pH7.3) Formula (pH7.3) Charge (pH7.3) Mass (pH7.3) Exact Mass (neutral form) Exact m/z of [M.]+ Exact m/z of [M+H]+ Exact m/z of [M+K]+ Exact m/z of [M+Na]+ Exact m/z of [M+Li]+ Exact m/z of [M+NH4]+ Exact m/z of [M-H]- Exact m/z of [M+Cl]- Exact m/z of [M+OAc]- CHEBI LIPID MAPS HMDB PMID
        SLM:000000002 Class Ceramide (iso-d17:1(4E)) Cer(iso-d17:1(4E)) N-acyl-15-methylhexadecasphing-4-enine SLM:000399814 CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@H](CO)NC([*])=O InChI=none C18H34NO3R 0 70846 14685263 | 21325339 | 9603947 | 21926990
        SLM:000000003 Isomeric subspecies 15-methylhexadecasphing-4-enine SLM:000390097 CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@@H]([NH3+])CO InChI=1S/C17H35NO2/c1-15(2)12-10-8-6-4-3-5-7-9-11-13-17(20)16(18)14-19/h11,13,15-17,19-20H,3-10,12,14,18H2,1-2H3/p+1/b13-11+/t16-,17+/m0/s1 InChIKey=LZKPPSAEINBHRP-KORIGIIASA-O C17H36NO2 1 286.473200 285.266779 285.266231 286.274056 324.229938 308.256000 292.282235 303.300605 284.259503 320.236181 344.280632 70771 19372430

        Columns:
            - difficult to select column names, use index
            - the * means the field is a list
            - different field separators in list fields

            0 Lipid ID
            1 Level
            2 Name
            3 Abbreviation*
            4 Synonyms*
            5 Lipid class*
            6 Parent
            7 Components*
            8 SMILES (pH7.3)
            9 InChI (pH7.3)
            10 InChI key (pH7.3)
            11 Formula (pH7.3)
            12 Charge (pH7.3)
            13 Mass (pH7.3)
            14 Exact Mass (neutral form)
            15 Exact m/z of [M.]+
            16 Exact m/z of [M+H]+
            17 Exact m/z of [M+K]+
            18 Exact m/z of [M+Na]+
            19 Exact m/z of [M+Li]+
            20 Exact m/z of [M+NH4]+
            21 Exact m/z of [M-H]-
            22 Exact m/z of [M+Cl]-
            23 Exact m/z of [M+OAc]-
            24 CHEBI
            25 LIPID MAPS
            26 HMDB
            27 PMID

        All non-empty fields are stored on the node, the property key is the
        column name with the characters ' []()*/' replaced by '_'. Empty fields
        never create a relationship. Rows can be shorter than the header
        because trailing empty columns are stripped.

        :param instance: The datasource instance.
        """
        lipids_file = instance.get_file('lipids.tsv.gz')

        with gzip.open(lipids_file, 'rt', errors="replace") as f:
            # get header
            header_line = next(f, None)
            if header_line is None:
                return
            header = header_line.strip().split('\t')
            header_cypher_safe = [name.translate(self._CYPHER_UNSAFE) for name in header]
            log.debug(header_cypher_safe)

            for l in f:
                flds = l.strip().split('\t')
                lipid_sid = flds[0].strip()
                if not lipid_sid:
                    # blank line
                    continue

                # (Lipid) node
                props = {'source': 'swisslipids', 'sid': lipid_sid}
                # add all properties, some are empty but contain whitespaces
                for column, fld in zip(header_cypher_safe, flds):
                    fld = fld.strip()
                    if fld:
                        props[column] = fld
                self.lipids.add_node(props)

                # (Lipid)-[FROM_LIPID_CLASS]-(Lipid)
                for lipid_class_sid in self._field(flds, 5).split('|'):
                    # strip leading/trailing spaces, not always existing
                    lipid_class_sid = lipid_class_sid.strip()
                    if lipid_class_sid:
                        self.lipid_fromclass_lipid.add_relationship(
                            {'sid': lipid_sid}, {'sid': lipid_class_sid}, {'source': 'swisslipids'}
                        )

                # (Lipid)-[HAS_PARENT]-(Lipid)
                parent_sid = self._field(flds, 6)
                if parent_sid:
                    self.lipid_parent_lipid.add_relationship(
                        {'sid': lipid_sid}, {'sid': parent_sid}, {'source': 'swisslipids'}
                    )

                # (Lipid)-[COMPONENT]-(Lipid)
                # e.g. SLM:000000510 (sn1) / SLM:000000418 (sn2)
                for lipid_component in self._field(flds, 7).split('/'):
                    lipid_component_sid, separator, lipid_component_type = \
                        lipid_component.strip().partition(' ')
                    # NOTE(review): components without a type are skipped like empty
                    # fields, confirm that this is intended
                    if not separator:
                        continue
                    self.lipid_component_lipid.add_relationship(
                        {'sid': lipid_sid}, {'sid': lipid_component_sid}, {'type': lipid_component_type}
                    )

                # (Lipid)-[MAPS]-(Metabolite), columns 24 (CHEBI) and 26 (HMDB)
                for xref_column in (24, 26):
                    metabolite_id = self._field(flds, xref_column)
                    if metabolite_id:
                        self.lipid_maps_metabolite.add_relationship(
                            {'sid': lipid_sid}, {'sid': metabolite_id}, {'source': 'swisslipids'}
                        )

    def get_lipid_to_protein(self, instance):
        """
        Parse 'lipids2uniprot.tsv.gz': (Lipid)-[HAS_ASSOCIATION]->(Protein) relationships.

        File: lipids2uniprot.tsv.gz

        Columns:
            - difficult to select column names, use index

            0 metabolite id
            1 UniprotKB IDs
            2 level
            3 metabolite name
            4 abbreviations
            5 synonyms
            6 lipid class
            7 components
            8 PMIDs
            9 SMILES (pH7.3)
            10 InChI (pH7.3)
            11 InChI key (pH7.3)
            12 Formula (pH7.3)
            13 Mass (pH7.3)
            14 Charge (pH7.3)
            15 Exact Mass (neutral form)
            16 Exact m/z of [M.]+
            17 Exact m/z of [M+H]+
            18 Exact m/z of [M+K]+Exact m/z of [M+Na]+
            19 Exact m/z of [M+Li]+
            20 Exact m/z of [M+NH4]+
            21 Exact m/z of [M-H]-
            22 Exact m/z of [M+Cl]-
            23 Exact m/z of [M+OAc]-
            24 ChEBI
            25 LipidMaps
            26 HMDB
            27 Mapping level

        The mapping level is stored as 'level' on the relationship, it is an
        empty string when the row has no such column.

        :param instance: The datasource instance.
        """
        lipids_2_protein_file = instance.get_file('lipids2uniprot.tsv.gz')

        with gzip.open(lipids_2_protein_file, 'rt', errors="replace") as f:
            # skip header, tolerate an empty file
            next(f, None)
            for l in f:
                flds = l.strip().split('\t')
                swisslipids_id = flds[0].strip()
                if not swisslipids_id:
                    # blank line
                    continue

                mapping_level = self._field(flds, 27)

                # collect UniProt IDs from uniprot fields, contains a '|' separated list
                # G5EC84 | O18037 | P91079 | Q09517 | Q10916 | Q20735 | Q21054 | Q23498 | Q9U3D4
                # note: not always formatted with space: ' | '
                # a dict removes duplicates and keeps the order of the file
                uniprot_ids = dict.fromkeys(
                    u.strip() for u in self._field(flds, 1).split('|'))

                for up in uniprot_ids:
                    if up:
                        self.lipid_associates_protein.add_relationship(
                            {'sid': swisslipids_id}, {'sid': up},
                            {'source': 'swisslipids', 'level': mapping_level}
                        )
class NcbiHomoloGeneParser(ReturnParser):
    """
    The NCBI HomoloGene parser reads the basic datafile `homologene.data` from HomoloGene.

    The file `homologene.data` is a tab separated list of homology groups.

    Fields: group ID, tax ID, gene ID, gene symbol, (fifth column: meaning unclear from
    this file alone, presumably a protein identifier - TODO confirm), refseq ID

    Example::

        3    9606    34        ACADM    4557231      NP_000007.1
        3    9598    469356    ACADM    160961497    NP_001104286.1
        3    9544    705168    ACADM    109008502    XP_001101274.1
        3    9615    490207    ACADM    545503811    XP_005622188.1

    Only the group ID (column 0) and the gene ID (column 2) are used. For every
    homology group, one `HOMOLOG` relationship is created for each unordered pair
    of distinct gene IDs in that group.
    """

    def __init__(self):
        super(NcbiHomoloGeneParser, self).__init__()

        # output data
        self.gene_homolog_gene = RelationshipSet('HOMOLOG', ['Gene'], ['Gene'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def _add_group_relationships(self, gene_ids):
        """
        Add a HOMOLOG relationship for every unordered pair of gene IDs in one group.

        :param gene_ids: Distinct gene IDs (strings) that share a homology group.
        """
        # sorted() makes the pair order (and thereby the relationship direction)
        # reproducible; iterating a set of strings varies between interpreter runs
        for g1, g2 in combinations(sorted(gene_ids), 2):
            self.gene_homolog_gene.add_relationship({'sid': g1}, {'sid': g2}, {})

    def run(self):
        """
        Read `homologene.data` and fill `self.gene_homolog_gene`.

        The rows of one group are expected to be consecutive: a change of the
        value in the first column closes the previous group.
        """
        ncbihomologene_instance = self.get_instance_by_name('NcbiHomoloGene')
        datafile = ncbihomologene_instance.get_file('homologene.data')

        current_group_id = None
        current_group_genes = set()

        with open(datafile) as f:
            for l in f:
                line = l.strip()
                # blank lines (e.g. a trailing newline) carry no group data
                if not line:
                    continue

                # iterate and collect groups (identified by first column)
                flds = line.split('\t')
                group_id = flds[0]
                gene_id = flds[2]

                if current_group_id is not None and group_id != current_group_id:
                    # first line of a new group: emit all pairs of the previous
                    # group and start collecting from scratch
                    self._add_group_relationships(current_group_genes)
                    current_group_genes = set()

                current_group_id = group_id
                current_group_genes.add(gene_id)

        # the last group is not followed by a group change, emit it explicitly
        self._add_group_relationships(current_group_genes)
class MirbaseParser(ReturnParser):
    """
    Parser for the miRBase database dump tables.

    Produces precursor and mature miRNA nodes as well as relationships
    precursor -> mature miRNA, transcript -> precursor and gene -> precursor.

    Note: the `*_df` properties are not cached, every access reads and parses
    the respective file again.
    """

    def __init__(self):
        super(MirbaseParser, self).__init__()

        # NodeSets
        self.precursor_mirna = NodeSet(['PrecursorMirna'], merge_keys=['sid'])
        self.mature_mirna = NodeSet(['Mirna'], merge_keys=['sid'])

        # RelationshipSets
        self.precursor_codes_mature = RelationshipSet(
            'PRE', ['PrecursorMirna'], ['Mirna'], ['sid'], ['sid'])
        self.transcript_codes_precursor = RelationshipSet(
            'IS', ['Transcript'], ['PrecursorMirna'], ['sid'], ['sid'])
        self.gene_is_precursor = RelationshipSet(
            'IS', ['Gene'], ['PrecursorMirna'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        """Run the parser; it takes no mounted arguments."""
        self.run()

    def run(self):
        """Fill all NodeSets and RelationshipSets of this parser."""
        self.get_mature_mirnas()
        self.get_pre_mirnas()
        self.get_pre_mature_relationship()
        self.get_pre_transcript_relationships()
        self.get_gene_pre_relationships()

    @property
    def mirbase_instance(self):
        """The datasource instance registered under the name 'Mirbase'."""
        return self.get_instance_by_name('Mirbase')

    @property
    def pre_mirna_df(self):
        """
        Get precursor miRNA DataFrame from: mirna.txt.gz

        The first file column is used as index, the remaining columns are named:

        mir_acc, mir_id, prev_mir_id, desc, sequence, comment, organism_key, dead_flag
        """
        precursor_mirna_table_file = self.mirbase_instance.get_file('mirna.txt.gz')

        pre_mirs_df = pandas.read_csv(precursor_mirna_table_file, sep='\t', index_col=0, header=None)
        pre_mirs_df.columns = [
            'mir_acc', 'mir_id', 'prev_mir_id', 'desc', 'sequence',
            'comment', 'organism_key', 'dead_flag'
        ]
        return pre_mirs_df

    @property
    def mature_mirna_df(self):
        """
        Get mature miRNA DataFrame from: mirna_mature.txt.gz

        The first file column is used as index, the remaining columns are named:

        name, prev_name, mir_acc, evidence, ref, similarity, dead_flag
        """
        mirna_table_file = self.mirbase_instance.get_file('mirna_mature.txt.gz')

        mirnas_df = pandas.read_csv(mirna_table_file, sep='\t', index_col=0, header=None)
        mirnas_df.columns = [
            'name', 'prev_name', 'mir_acc', 'evidence', 'ref',
            'similarity', 'dead_flag'
        ]
        return mirnas_df

    @property
    def context_df(self):
        """
        Get context DataFrame from: mirna_context.txt.gz

        auto_mirna  transcript_id  overlap_sense  overlap_type  number  transcript_source  transcript_name
        64777  ENST00000545242  +  intron  15  HGNC_trans_name  ABLIM2-203

        `auto_mirna` (first column) becomes the index.
        """
        mirna_context_file = self.mirbase_instance.get_file('mirna_context.txt.gz')

        mirna_context_df = pandas.read_csv(mirna_context_file, sep='\t', index_col=0, header=None)
        mirna_context_df.columns = [
            'transcript_id', 'overlap_sense', 'overlap_type', 'number',
            'transcript_source', 'transcript_name'
        ]
        return mirna_context_df

    @property
    def mirna_database_url_df(self):
        """
        Database list from: mirna_database_url.txt.gz

        `auto_db`, `display_name`, `url`
        5  EntrezGene  https://www.ncbi.nlm.nih.gov/gene/<?>

        `auto_db` (first column) becomes the index.
        """
        file = self.mirbase_instance.get_file('mirna_database_url.txt.gz')
        df = pandas.read_csv(file, sep='\t', index_col=0, header=None)
        df.columns = ['display_name', 'url']
        return df

    @property
    def mirna_database_link_df(self):
        """
        Database links from: mirna_database_links.txt.gz

        'auto_mirna', 'auto_db', 'link', 'display_name'
        64744  5  406883  MIRLET7A3

        `auto_mirna` (first column) becomes the index.
        """
        file = self.mirbase_instance.get_file('mirna_database_links.txt.gz')
        df = pandas.read_csv(file, sep='\t', index_col=0, header=None)
        df.columns = ['auto_db', 'link', 'display_name']
        return df

    def get_mature_mirnas(self):
        """
        Add one `Mirna` node per row of the mature miRNA table.

        Mature miRNAs are stored in a single table: mirna_mature.txt.gz

        name, prev_name, mir_acc, evidence, ref, similarity, dead_flag
        """
        for row in self.mature_mirna_df.itertuples():
            self.mature_mirna.add_node({
                'sid': row.mir_acc,
                'name': row.name,
                'evidence': row.evidence
            })

    def get_pre_mirnas(self):
        """
        Add one `PrecursorMirna` node per precursor miRNA that has an organism entry.

        Precursor miRNAs are stored in a table: mirna.txt.gz

        mir_acc, mir_id, prev_mir_id, desc, sequence, comment, organism_key

        Organism data for the precursor miRNAs is in another table: mirna_species.txt.gz
        Its first column is used as index and joined to `organism_key`; the
        remaining columns are named:

        organism, division, org_name, taxon_id, taxonomy, genome_assembly,
        genome_accession, ensembl_db

        The `taxon_id` column is stored as `taxid` property of the node.
        """
        organism_table_file = self.mirbase_instance.get_file('mirna_species.txt.gz')

        # load organism table
        orgs_df = pandas.read_csv(organism_table_file, sep='\t', index_col=0, header=None)
        orgs_df.columns = [
            'organism', 'division', 'org_name', 'taxon_id', 'taxonomy',
            'genome_assembly', 'genome_accession', 'ensembl_db'
        ]

        # default (inner) merge: precursors without an organism row are dropped
        merged_pre_mirs_org_df = pandas.merge(self.pre_mirna_df, orgs_df, on=None,
                                              left_on='organism_key', right_index=True)

        # add precursor miRNA nodes
        for row in merged_pre_mirs_org_df.itertuples():
            self.precursor_mirna.add_node({
                'sid': row.mir_acc,
                'name': row.mir_id,
                'desc': row.desc,
                'sequence': row.sequence,
                'taxid': row.taxon_id,
                # comment can be missing (NaN), store it as string in any case
                'comment': str(row.comment)
            })

    def get_pre_mature_relationship(self):
        """
        Add `PRE` relationships from precursor miRNAs to mature miRNAs.

        Mature miRNAs and precursor miRNAs are in the files described in the
        respective parser functions.

        Mapping is stored in a mapping table: mirna_pre_mature.txt.gz

        pre_dbid, mature_dbid, start, end

        It contains the primary key of mature and precursor miRNA tables and the
        start/end of the mature sequence within the precursor.

        :raises KeyError: If the mapping table references a primary key that is
                          not in the precursor or mature miRNA table.
        """
        mapping_table_file = self.mirbase_instance.get_file('mirna_pre_mature.txt.gz')

        # db_primary_key -> mirBase accession, needed to translate the keys
        # used in the mature/precursor mapping table
        precursor_db_key_2_accession = self.pre_mirna_df['mir_acc'].to_dict()
        mature_db_key_2_accession = self.mature_mirna_df['mir_acc'].to_dict()

        # get mapping table
        pre_2_mature_df = pandas.read_csv(mapping_table_file, sep='\t', index_col=False, header=None)
        pre_2_mature_df.columns = ['pre_dbid', 'mature_dbid', 'start', 'end']

        # iterate over mapping table and create relationships
        for row in pre_2_mature_df.itertuples():
            mature_acc = mature_db_key_2_accession[row.mature_dbid]
            precursor_acc = precursor_db_key_2_accession[row.pre_dbid]

            self.precursor_codes_mature.add_relationship(
                {'sid': precursor_acc},
                {'sid': mature_acc},
                {'start': int(row.start), 'end': int(row.end)}
            )

    def get_pre_transcript_relationships(self):
        """
        Add `IS` relationships from transcripts to precursor miRNAs.

        MirBase provides the transcriptional context based on ENSEMBL transcripts.

        Context is stored in a single file: mirna_context.txt.gz

        auto_mirna  transcript_id  overlap_sense  overlap_type  number  transcript_source  transcript_name
        64777  ENST00000545242  +  intron  15  HGNC_trans_name  ABLIM2-203

        For mapping the auto_mirna KEY we need the precursor miRNAs from: mirna.txt.gz
        """
        # both tables are indexed by the auto_mirna key
        # NOTE(review): the left join keeps context rows without a matching
        # precursor, `mir_acc` is then missing (NaN) - confirm this is intended
        pre_2_context = pandas.merge(self.context_df, self.pre_mirna_df, how='left', on=None,
                                     left_index=True, right_index=True)

        for row in pre_2_context.itertuples():
            self.transcript_codes_precursor.add_relationship(
                {'sid': row.transcript_id},
                {'sid': row.mir_acc},
                {'overlap_type': row.overlap_type, 'number': row.number}
            )

    def get_gene_pre_relationships(self):
        """
        Add `IS` relationships from genes to precursor miRNAs.

        MiRBase provides links to external databases in a table: mirna_database_links.txt.gz

        `auto_mirna`, `auto_db`, `link`, `display_name`
        64743  5  406882  MIRLET7A2

        auto_mirna is the mirna KEY from: mirna.txt.gz
        auto_db is the DB KEY from: mirna_database_url.txt.gz

        5  EntrezGene  https://www.ncbi.nlm.nih.gov/gene/<?>

        Only links to the database named 'EntrezGene' are used.
        """
        # function-scope import keeps this block self-contained
        import logging
        logger = logging.getLogger(__name__)

        # read each table once, the properties parse the file on every access
        link_df = self.mirna_database_link_df
        url_df = self.mirna_database_url_df

        gene_pre_df = pandas.merge(link_df, url_df, how='left', left_on='auto_db', right_index=True)
        logger.debug('miRBase database links: %d, databases: %d, merged rows: %d',
                     len(link_df), len(url_df), len(gene_pre_df))

        # both tables have a `display_name` column, the merge suffixes them:
        # `display_name_x` is from the link table, `display_name_y` is the database name
        entrez_links_df = gene_pre_df[gene_pre_df['display_name_y'] == 'EntrezGene']

        # NOTE(review): the left join keeps links without a matching precursor,
        # `mir_acc` is then missing (NaN) - confirm this is intended
        final_merge = pandas.merge(entrez_links_df, self.pre_mirna_df, how='left', on=None,
                                   left_index=True, right_index=True)

        source_name = self.mirbase_instance.datasource.name

        for row in final_merge.itertuples():
            self.gene_is_precursor.add_relationship(
                # Gene sids are strings in the other parsers of this file, the
                # `link` column may be parsed as a number by pandas
                {'sid': str(row.link).strip()},
                {'sid': row.mir_acc},
                {'source': source_name}
            )