class MirtarbaseParser(ReturnParser):
    def __init__(self):
        """Set up the single output RelationshipSet: (Mirna.name)-[TARGETS]->(Gene.sid)."""
        super(MirtarbaseParser, self).__init__()

        # RelationshipSets
        self.mirna_targets_gene = RelationshipSet('TARGETS', ['Mirna'], ['Gene'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        """Parse the miRTarBase Excel file and create miRNA->Gene TARGETS relationships."""
        log.debug("Run {}".format(self.__class__.__name__))

        mirtarbase_instance = self.get_instance_by_name('Mirtarbase')
        mirtarbase_file = mirtarbase_instance.get_file(FILE_NAME)

        df = pandas.read_excel(mirtarbase_file, index_col=None, header=0)

        # rename columns for easier access
        df.columns = ['mirtarbase_id', 'mirna', 'species_mirna', 'target_genesymbol', 'target_entrez',
                      'species_target', 'experiments', 'support_type', 'references']

        for row in df.itertuples():
            # target_entrez may be read as a number by pandas; cast to str so the
            # Gene sid matches the string IDs used elsewhere
            self.mirna_targets_gene.add_relationship(
                {'name': row.mirna.strip()},
                {'sid': str(row.target_entrez).strip()},
                {'experiments': row.experiments, 'support_type': row.support_type,
                 'references': row.references, 'source': mirtarbase_instance.datasource.name}
            )
def __init__(self):
    """Declare output containers: Source/Target nodes and FOO relationships between them."""
    super().__init__()

    # node outputs
    self.source = NodeSet(['Source'], merge_keys=['source_id'])
    self.target = NodeSet(['Target'], merge_keys=['target_id'])

    # relationship output connecting the two node sets on their id properties
    self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])
def __init__(self):
    """Declare NCBI gene outputs: Gene nodes, GeneSymbol nodes and their relationships."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # output data
    # Both gene IDs and gene symbols carry the label 'Gene'; two NodeSets are
    # used because only the symbol nodes need taxid for uniqueness.
    self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ncbigene'})
    self.genesymbols = NodeSet(
        ['Gene'],
        merge_keys=['sid', 'taxid'],
        default_props={'source': 'ncbigene', 'type': 'symbol'},
    )

    # symbol <-> symbol synonyms and gene -> symbol mappings
    self.genesymbol_synonym_genesymbol = RelationshipSet(
        'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
        default_props={'source': 'ncbigene'})
    self.gene_maps_genesymbol = RelationshipSet(
        'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
        default_props={'source': 'ncbigene'})
def __init__(self):
    """Set up the (Mirna.name)-[TARGETS]->(Gene.sid) RelationshipSet."""
    super().__init__()

    # RelationshipSets
    self.mirna_targets_gene = RelationshipSet('TARGETS', ['Mirna'], ['Gene'], ['name'], ['sid'])
class DummyParser(ReturnParser):
    """Test parser: reads letters from a dummy file and links them to Fummy nodes."""

    def __init__(self):
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])
        self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """
        Create Dummy nodes for every line of 'file.txt' and KNOWS relationships
        to randomly chosen Fummy nodes.

        :param taxid: Taxonomy ID, stored as a property on every node.
        """
        dummy_instance = self.get_instance_by_name('Dummy')
        dummyfile = dummy_instance.get_file('file.txt')

        # Fummy nodes with integer sids 0..9 (targets of the KNOWS relationships below)
        # NOTE: removed an unused local (target_sids) that was computed but never read.
        for i in range(10):
            self.fummy_nodes.add_node({'sid': i, 'taxid': taxid})

        with open(dummyfile) as f:
            for l in f:
                letter = l.strip()
                self.dummy_nodes.add_node({'sid': letter, 'taxid': taxid})
                # link each letter to one random Fummy node
                self.dummy_knows_fummy.add_relationship({'sid': letter}, {'sid': randint(0, 9)}, {'key': 'value'})
def __init__(self):
    """Declare ENSEMBL outputs: Gene/Transcript/Protein nodes and CODES relationships."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # NodeSets, all tagged with source 'ensembl'
    self.genes = NodeSet(['Gene'], merge_keys=['sid'], default_props={'source': 'ensembl'})
    self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'], default_props={'source': 'ensembl'})
    self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'ensembl'})

    # RelationshipSets: Gene -CODES-> Transcript -CODES-> Protein
    self.gene_codes_transcript = RelationshipSet(
        'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'ensembl'})
    self.transcript_codes_protein = RelationshipSet(
        'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'ensembl'})
def __init__(self):
    """Declare the (Protein.sid)-[ASSOCIATION]->(Term.sid) output RelationshipSet."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # RelationshipSets
    self.protein_associates_goterm = RelationshipSet(
        'ASSOCIATION', ['Protein'], ['Term'], ['sid'], ['sid'])
def __init__(self):
    """Declare the (Mirna.name)-[TARGETS]->(Transcript.sid) output RelationshipSet."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # RelationshipSets
    self.mirna_targets_transcript = RelationshipSet(
        'TARGETS', ['Mirna'], ['Transcript'], ['name'], ['sid'])
def __init__(self):
    """Declare outputs for discontinued NCBI genes and their replacements."""
    super().__init__()

    self.arguments = ['taxid']

    # discontinued genes carry the extra 'Legacy' label
    self.legacy_genes = NodeSet(['Gene', 'Legacy'], merge_keys=['sid'], default_props={'source': 'ncbigene'})

    # link from a discontinued gene to its current replacement
    self.legacy_gene_now_gene = RelationshipSet(
        'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
        default_props={'source': 'ncbigene'})
def __init__(self):
    """Declare test outputs: Dummy and Fummy nodes plus KNOWS relationships."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # output data
    self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
    self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])
    self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'], ['sid'], ['sid'])
def __init__(self):
    """Declare SwissLipids outputs: Lipid nodes and their class/parent/component/mapping relationships."""
    super().__init__()

    # define NodeSet and RelationshipSet
    self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

    # Lipid <-> Lipid hierarchy relationships
    self.lipid_fromclass_lipid = RelationshipSet('FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
    self.lipid_parent_lipid = RelationshipSet('HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
    self.lipid_component_lipid = RelationshipSet('HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])

    # cross-references to metabolites and proteins
    self.lipid_maps_metabolite = RelationshipSet('MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
    self.lipid_associates_protein = RelationshipSet('HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])
def __init__(self):
    """Declare RefSeq CODES relationships: Gene -> Transcript -> Protein."""
    super().__init__()

    # arguments
    self.arguments = ['taxid']

    # define NodeSet and RelationshipSet
    self.gene_codes_transcript = RelationshipSet(
        'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'refseq'})
    self.transcript_codes_protein = RelationshipSet(
        'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'refseq'})
class DependingTestParser(ReturnParser):
    """Test parser that emits 100 FOO relationships between Source and Target ids."""

    def __init__(self):
        super().__init__()
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        # one relationship per id, matching Source and Target on the same value
        for idx in range(100):
            self.rels.add_relationship({'source_id': idx}, {'target_id': idx}, {'source': 'test'})
def __init__(self):
    """Set up the Gene-ORTHOLOG-Gene RelationshipSet and register it with the container."""
    super().__init__()

    self.gene_ortholog_gene = RelationshipSet('ORTHOLOG', ['Gene'], ['Gene'], ['sid'], ['sid'])

    # register all output sets with the container
    self.object_sets = [self.gene_ortholog_gene]
    self.container.add_all(self.object_sets)
def __init__(self):
    """Declare HMDB outputs: Metabolite nodes plus mapping and protein-association relationships."""
    super().__init__()

    # NodeSets
    self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'hmdb'})

    # Metabolite cross-references and protein associations
    self.metabolite_map_metabolite = RelationshipSet(
        'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'hmdb'})
    self.metabolite_associates_protein = RelationshipSet(
        'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'hmdb'})
class NcbiLegacyGeneParser(ReturnParser):
    """
    Parse legacy gene IDs from gene_history.gz

    #tax_id GeneID  Discontinued_GeneID Discontinued_Symbol Discontinue_Date
    9       -       1246494             repA1               20031113
    9       -       1246495             repA2               20031113
    9       -       1246496             leuA                20031113
    """

    def __init__(self):
        super(NcbiLegacyGeneParser, self).__init__()

        self.arguments = ['taxid']

        # discontinued genes carry the extra 'Legacy' label
        self.legacy_genes = NodeSet(['Gene', 'Legacy'], merge_keys=['sid'], default_props={'source': 'ncbigene'})
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ncbigene'})

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """
        Collect discontinued genes for one organism and their replacement genes.

        :param taxid: NCBI taxonomy ID; compared as a string against column 1
                      (NOTE(review): callers must pass str, not int — confirm).
        """
        log.debug(f'Run parser {self.__class__.__name__} for taxID: {taxid}.')

        ncbigene_instance = self.get_instance_by_name('NcbiGene')
        gene_history_file = ncbigene_instance.get_file('gene_history.gz')

        with gzip.open(gene_history_file, 'rt') as f:
            # skip header
            next(f)
            for l in f:
                flds = l.strip().split('\t')
                this_taxid = flds[0]
                if this_taxid == taxid:
                    new_gene_id = flds[1]
                    discontinued_gene_id = flds[2]
                    discontinued_symbol = flds[3]
                    date = flds[4]
                    self.legacy_genes.add_node({
                        'sid': discontinued_gene_id,
                        'date': date,
                        'symbol': discontinued_symbol,
                        'taxid': taxid
                    })
                    # '-' in the GeneID column means there is no replacement gene
                    if new_gene_id != '-':
                        self.legacy_gene_now_gene.add_relationship(
                            {'sid': discontinued_gene_id}, {'sid': new_gene_id}, {})
def __init__(self):
    """Set up the (Gene.sid)-[EXPRESSED]->(GtexDetailedTissue.name) RelationshipSet."""
    super().__init__()

    self.gene_expressed_tissue = RelationshipSet('EXPRESSED', ['Gene'], ['GtexDetailedTissue'],
                                                 ['sid'], ['name'])

    # register all output sets with the container
    self.object_sets = [self.gene_expressed_tissue]
    self.container.add_all(self.object_sets)
def deserialize(cls, source_dir: str, metadata_only: bool = False) -> 'Parser':
    """
    Read from a serialized directory, recreate a Parser that can load to the database.

    Files named 'nodeset_*.json' / 'relationshipset_*.json' are deserialized and
    attached to the new Parser as attributes named after the file stem;
    'parser_data.json' restores the parser metadata.

    :param source_dir: Directory to read from.
    :param metadata_only: If True, skip the NodeSet/RelationshipSet files and
                          only restore metadata from 'parser_data.json'.
    :return: A Parser object.
    """
    log.debug(f"Read Parser from {source_dir}.")
    p = cls()
    for file in os.listdir(source_dir):
        if not metadata_only:
            if file.startswith('nodeset_'):
                # NOTE(review): the attribute name keeps the 'nodeset_' prefix —
                # confirm this matches what the serializer wrote.
                ns_name = file.replace('.json', '')
                with open(os.path.join(source_dir, file), 'rt') as f:
                    log.debug(f"Deserialize {f}")
                    ns = NodeSet.from_dict(json.load(f))
                    log.debug(f"Num nodes in NodeSet: {len(ns.nodes)}")
                    p.__dict__[ns_name] = ns
            elif file.startswith('relationshipset_'):
                rs_name = file.replace('.json', '')
                with open(os.path.join(source_dir, file), 'rt') as f:
                    log.debug(f"Deserialize {f}")
                    rs = RelationshipSet.from_dict(json.load(f))
                    log.debug(f"Num relationships in RelationshipSet: {len(rs.relationships)}")
                    p.__dict__[rs_name] = rs
        if file == 'parser_data.json':
            with open(os.path.join(source_dir, file), 'rt') as f:
                metadata = json.load(f)
                # TODO add datasource instances to deserializer
                p.name = metadata['name']
    return p
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    Builds Country, Province and DailyReport NodeSets plus the PART_OF and
    REPORTED RelationshipSets connecting them.

    :param file: Path to the CSV file
    :return: (countries, provinces, updates, province_in_country, province_rep_update)
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'], ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'], ['DailyReport'], ['name'], ['uuid'])

    with open(file, 'rt') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip header
        next(rows)
        for row in rows:
            # assumed column layout: province, country, date, confirmed, deaths,
            # recovered[, lat, long] — TODO confirm against the JHU report format
            country = row[1]
            province = row[0]
            # if no name for province, use country name
            if not province:
                province = '{}_complete'.format(country)

            date = parse(row[2])
            # NOTE(review): 'uuid' is a deterministic concatenation, not a real UUID
            uuid = country+province+str(date)

            # missing counts are stored as the string 'na' (mixed int/str values)
            confirmed = int(row[3]) if row[3] else 'na'
            death = int(row[4]) if row[4] else 'na'
            recovered = int(row[5]) if row[5] else 'na'

            # coordinate columns are not present in every report revision
            lat = row[6] if len(row) >= 7 else None
            long = row[7] if len(row) >= 8 else None

            province_dict = {'name': province}
            if lat and long:
                province_dict['latitude'] = lat
                province_dict['longitude'] = long

            provinces.add_unique(province_dict)
            countries.add_unique({'name': country})
            updates.add_unique(
                {'date': date, 'confirmed': confirmed, 'death': death, 'recovered': recovered, 'uuid': uuid})
            province_in_country.add_relationship({'name': province}, {'name': country}, {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province}, {'uuid': uuid}, {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
class GtexDataParser(ReturnParser):
    """Parse per-tissue median gene expression from the GTEx median-TPM GCT matrix."""

    def __init__(self):
        super(GtexDataParser, self).__init__()

        # (Gene.sid) -[EXPRESSED {val}]-> (GtexDetailedTissue.name)
        self.gene_expressed_tissue = RelationshipSet('EXPRESSED', ['Gene'], ['GtexDetailedTissue'],
                                                     ['sid'], ['name'])

        # register all output sets with the container
        self.object_sets = [self.gene_expressed_tissue]
        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        """Read the gzipped GCT file and create one EXPRESSED relationship per gene/tissue value."""
        gtex_instance = self.get_instance_by_name('Gtex')

        gtex_mean_gene = gtex_instance.get_file(
            'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz')

        with gzip.open(gtex_mean_gene, 'rt') as f:
            lines = f.readlines()
        # remove the first two (GCT preamble) lines
        lines = lines[2:]
        # header line with the tissue names
        header = lines.pop(0)
        # FIX: strip the trailing newline before splitting, otherwise the last
        # tissue name ends in '\n' and never matches its node
        header_fields = header.rstrip('\n').split('\t')

        # iterate data lines
        for line in lines:
            # FIX: strip trailing newline so the last column's value is clean
            flds = line.rstrip('\n').split('\t')
            # gene ID without the ENSEMBL version suffix
            gene_id = flds[0].split('.')[0]
            data_flds = flds[2:]

            # iterate the value columns with index; start at 2 to match the
            # header, which also includes the first two columns
            for i, value in enumerate(data_flds, start=2):
                tissue_detailed_name = header_fields[i]
                self.gene_expressed_tissue.add_relationship(
                    {'sid': gene_id}, {'name': tissue_detailed_name}, {'val': value})
def __init__(self):
    """Declare HGNC outputs: Gene nodes and MAPS relationships to genes and gene symbols."""
    super().__init__()

    # output data
    self.genes = NodeSet(['Gene'], merge_keys=['sid'])

    # HGNC gene -> NCBI/ENSEMBL gene, and HGNC gene -> gene symbol
    self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
    self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])
class SomeParser(ReturnParser):
    """Test parser producing 100 Source/Target node pairs connected by FOO relationships."""

    def __init__(self):
        super().__init__()
        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'], ['source_id'], ['target_id'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        # create matching Source/Target pairs and connect each pair on its id
        for idx in range(100):
            self.source.add_node({'source_id': idx})
            self.target.add_node({'target_id': idx})
            self.rels.add_relationship({'source_id': idx}, {'target_id': idx}, {'source': 'test'})
def __init__(self):
    """Declare GTEx metadata outputs: tissue, detailed tissue and sample nodes with their relationships."""
    super().__init__()

    # NodeSets
    self.tissues = NodeSet(['GtexTissue'], merge_keys=['name'])
    self.detailed_tissues = NodeSet(['GtexDetailedTissue'], merge_keys=['name'])
    self.sample = NodeSet(['GtexSample'], merge_keys=['sid'])

    # sample -> tissue / detailed tissue measurements
    self.sample_measures_tissue = RelationshipSet('MEASURES', ['GtexSample'], ['GtexTissue'], ['sid'], ['name'])
    self.sample_measures_detailed_tissue = RelationshipSet(
        'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'], ['name'])

    # tissue hierarchy; deduplicated because the same pair recurs per sample
    self.tissue_parent_detailed_tissue = RelationshipSet(
        'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'], ['name'])
    self.tissue_parent_detailed_tissue.unique = True
class HGNCParser(ReturnParser):
    """Parse the HGNC complete set into Gene nodes and MAPS relationships."""

    def __init__(self):
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'], ['GeneSymbol'], ['sid'], ['sid', 'taxid'])

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        hgnc_instance = self.get_instance_by_name('HGNC')
        hgnc_complete_file = hgnc_instance.get_file('hgnc_complete_set.txt')
        self.parse_hgnc_complete_file(hgnc_complete_file)

    def parse_hgnc_complete_file(self, hgnc_complete_file):
        """
        Parse hgnc_complete_set.txt (tab separated, one header line).

        :param hgnc_complete_file: Path to the file.
        """
        with open(hgnc_complete_file, 'rt') as f:
            # FIX: split the header into column names. Previously the raw header
            # *string* was zipped with the fields, so node properties were keyed
            # by single characters of the header line instead of column names.
            header = next(f).strip().split('\t')
            for l in f:
                flds = l.strip().split('\t')
                sid = flds[0]
                gene_symbol = flds[1]
                ncbi_id = flds[18] if len(flds) > 18 else None
                ensembl_id = flds[19] if len(flds) > 19 else None

                # all columns as node properties, keyed by header column name
                all_props = dict(zip(header, flds))
                all_props['sid'] = sid
                all_props['source'] = 'hgnc'
                self.genes.add_node(all_props)

                if ncbi_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ncbi_id}, {'source': 'hgnc'})
                if ensembl_id:
                    self.gene_maps_gene.add_relationship({'sid': sid}, {'sid': ensembl_id}, {'source': 'hgnc'})
                if gene_symbol:
                    # HGNC covers human genes only, hence the fixed taxid
                    self.gene_maps_genesymbol.add_relationship(
                        {'sid': sid}, {'sid': gene_symbol, 'taxid': '9606'}, {'source': 'hgnc'})
def __init__(self):
    """Declare outputs for removed RefSeq records: legacy transcripts/proteins and their links."""
    super().__init__()

    self.arguments = ['taxid']

    # working set of already-seen legacy IDs
    self.legacy_ids = set()

    # legacy transcripts and their replacements
    self.legacy_transcripts = NodeSet(['Transcript', 'Legacy'], merge_keys=['sid'],
                                      default_props={'source': 'refseq'})
    self.legacy_transcript_now_transcript = RelationshipSet(
        'REPLACED_BY', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})

    # legacy proteins and their replacements
    self.legacy_proteins = NodeSet(['Protein', 'Legacy'], merge_keys=['sid'],
                                   default_props={'source': 'refseq'})
    self.legacy_protein_now_protein = RelationshipSet(
        'REPLACED_BY', ['Protein'], ['Protein'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})

    # CODES relationships involving legacy records
    self.gene_codes_legacy_transcript = RelationshipSet(
        'CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
    self.legacy_transcript_codes_protein = RelationshipSet(
        'CODES', ['Transcript', 'Legacy'], ['Protein'], ['sid'], ['sid'],
        default_props={'source': 'refseq'})
def __init__(self):
    """Declare LNCipedia outputs: Gene/Transcript nodes, CODES and MAPS relationships."""
    super().__init__()

    # NodeSets
    self.genes = NodeSet(['Gene'], merge_keys=['sid'])
    self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])

    # gene -> transcript structure plus cross-reference mappings
    self.gene_codes_transcripts = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
    self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
    self.transcript_maps_transcript = RelationshipSet('MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])
class NcbiGeneOrthologParser(ReturnParser):
    def __init__(self):
        """Set up the Gene-ORTHOLOG-Gene RelationshipSet and register it with the container."""
        super(NcbiGeneOrthologParser, self).__init__()

        self.gene_ortholog_gene = RelationshipSet('ORTHOLOG', ['Gene'], ['Gene'], ['sid'], ['sid'])

        self.object_sets = [self.gene_ortholog_gene]
        self.container.add_all(self.object_sets)

    def run_with_mounted_arguments(self):
        self.run()

    def run(self):
        """
        Get the Gene-ORTHOLOG-Gene relationships.

        This is currently not filtered for taxid.
        """
        ncbigene_instance = self.get_instance_by_name('NcbiGene')
        ortholog_file = ncbigene_instance.get_file('gene_orthologs.gz')

        with gzip.open(ortholog_file, 'rt') as f:
            # skip first line
            next(f)
            for l in f:
                # columns 1 and 4 are the two gene IDs (per the NCBI
                # gene_orthologs layout — TODO confirm against the FTP README)
                flds = l.strip().split()
                g1 = flds[1]
                g2 = flds[4]
                self.gene_ortholog_gene.add_relationship({'sid': g1}, {'sid': g2}, {})
class MirdbParser(ReturnParser):
    def __init__(self):
        super(MirdbParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # RelationshipSets
        # (Mirna.name) -[TARGETS {score, source}]-> (Transcript.sid)
        self.mirna_targets_transcript = RelationshipSet(
            'TARGETS', ['Mirna'], ['Transcript'], ['name'], ['sid'])

    def run_with_mounted_arguments(self):
        self.run(self.taxid)

    def run(self, taxid):
        """
        Parse miRDB target predictions for one organism.

        :param taxid: NCBI taxonomy ID; selects the miRNA name prefix used to
                      filter lines to the organism of interest.
        """
        mirdb_instance = self.get_instance_by_name('Mirdb')
        mirdb_file = mirdb_instance.datasource.get_prediction_file(
            mirdb_instance)
        datasource_name = mirdb_instance.datasource.name

        # species-specific miRNA name prefix (presumably e.g. 'hsa' for human —
        # confirm against TAXID_2_MIRPREFIX); other organisms are skipped below
        mir_prefix = TAXID_2_MIRPREFIX[taxid]

        with gzip.open(mirdb_file, 'rt') as f:
            for l in f:
                flds = l.split()
                mir_name = flds[0]
                if mir_name.startswith(mir_prefix):
                    target = flds[1]
                    score = float(flds[2])
                    self.mirna_targets_transcript.add_relationship(
                        {'name': mir_name}, {'sid': target}, {
                            'score': score,
                            'source': datasource_name
                        })
def __init__(self):
    """Declare MeSH outputs: descriptor/qualifier/concept/term nodes and their relationships."""
    super().__init__()

    # NodeSets
    self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
    self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
    self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
    self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

    # descriptor -> qualifier / concept
    self.descriptor_allowed_qualifier = RelationshipSet('ALLOWED', ['MeshDescriptor'], ['MeshQualifier'],
                                                        ['sid'], ['sid'])
    self.descriptor_has_concept = RelationshipSet('HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
    self.descriptor_has_concept.unique = True

    # concept -> term and concept <-> concept; deduplicated since the same
    # pairs recur while parsing
    self.concept_has_term = RelationshipSet('HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
    self.concept_has_term.unique = True
    self.concept_related_concept = RelationshipSet('RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
    self.concept_related_concept.unique = True
def __init__(self):
    """Declare ChEBI outputs: Metabolite nodes and their IS_A/CHEBI_REL/MAPS relationships."""
    super().__init__()

    # NodeSets
    self.metabolites = NodeSet(['Metabolite'], merge_keys=['sid'], default_props={'source': 'chebi'})

    # Metabolite <-> Metabolite relationships, all tagged with source 'chebi'
    self.metabolite_isa_metabolite = RelationshipSet(
        'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})
    self.metabolite_rel_metabolite = RelationshipSet(
        'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})
    self.metabolite_maps_metabolite = RelationshipSet(
        'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'], default_props={'source': 'chebi'})