def get_generators(self, disease_data, batch_size, data_provider):
    """Yield batches of disease-annotation rows parsed from ``disease_data``.

    Args:
        disease_data: dict with a ``metaData`` section (whose
            ``dataProvider.crossReference.id`` names the provider) and a
            ``data`` list of disease annotation records.
        batch_size: number of accepted records collected before a batch is
            yielded.
        data_provider: kept for interface compatibility; the value is
            replaced by the provider id read from ``metaData``.

    Yields:
        A list of nine lists, in this order: allele rows, gene rows, AGM
        rows, the primary-genetic-entity rows (the same list three times),
        ``with`` rows, evidence-code rows and cross-reference rows.  A final
        partial batch is yielded if any records remain.
    """
    # Association types are rewritten to their negated form when the record
    # carries a 'negation' key.  This capitalization is purposeful.
    negated_association_types = {
        'IS_IMPLICATED_IN': 'IS_NOT_IMPLICATED_IN',
        'IS_MODEL_OF': 'IS_NOT_MODEL_OF',
        'IS_MARKER_FOR': 'IS_NOT_MARKER_FOR',
    }

    counter = 0
    gene_list_to_yield = []
    allele_list_to_yield = []
    agm_list_to_yield = []
    evidence_code_list_to_yield = []
    withs = []
    pge_list_to_yield = []
    xrefs = []

    # The provider id in the file's metadata overrides the argument.
    data_provider_object = disease_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    is_rgd_or_human = data_provider in ('RGD', 'HUMAN')

    for disease_record in disease_data['data']:
        primary_id = disease_record.get('objectId')

        # When running against test data, only whitelisted ids are loaded.
        if self.test_object.using_test_data() is True:
            if self.test_object.check_for_test_id_entry(primary_id) is False:
                continue

        do_id = disease_record.get('DOid')
        object_relation = disease_record['objectRelation']
        disease_association_type = object_relation.get("associationType").upper()
        disease_object_type = object_relation.get("objectType")
        disease_unique_key = primary_id + do_id + disease_association_type
        counter += 1

        # One join key per record, minted up front so that records without
        # an 'evidence' section neither fail nor inherit the previous
        # record's key.
        pecj_primary_key = str(uuid.uuid4())

        publication_mod_id = ""
        pub_med_id = ""
        pub_mod_url = None
        pub_med_url = None

        if 'evidence' in disease_record:
            evidence = disease_record.get('evidence')

            if 'publication' in evidence:
                publication = evidence.get('publication')
                publication_id = publication.get('publicationId')
                if publication_id.startswith('PMID:'):
                    pub_med_id = publication_id
                    pub_med_url = ETLHelper.get_complete_pub_url(
                        pub_med_id.split(":")[1], pub_med_id)
                    # A PubMed publication may additionally carry a MOD
                    # cross reference.
                    if 'crossReference' in evidence:
                        publication_mod_id = evidence.get('crossReference').get('id')
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            publication_mod_id.split(":")[1], publication_mod_id)
                else:
                    # Anything that is not a PMID is used as the MOD
                    # publication id.
                    publication_mod_id = publication_id
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        publication_mod_id.split(":")[1], publication_mod_id)

            if 'evidenceCodes' in evidence:
                evidence_code_list_to_yield.extend(
                    {"pecjPrimaryKey": pecj_primary_key, "ecode": ecode}
                    for ecode in evidence.get('evidenceCodes'))

        negation = ''
        if 'negation' in disease_record:
            disease_association_type = negated_association_types.get(
                disease_association_type, disease_association_type)
            negation = 'NOT'
            disease_unique_key = disease_unique_key + negation

        if 'with' in disease_record:
            with_record = disease_record.get('with')
            # Every "with" id becomes part of the key before any row is
            # built, so all rows of this record share the final key.
            disease_unique_key = disease_unique_key + "".join(with_record)
            withs.extend(
                {"diseaseUniqueKey": disease_unique_key, "withD": rec}
                for rec in with_record)

        if 'primaryGeneticEntityIDs' in disease_record:
            pge_list_to_yield.extend(
                {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
                for pge in disease_record.get('primaryGeneticEntityIDs'))

        if 'dataProvider' in disease_record:
            for dp in disease_record['dataProvider']:
                annotation_type = dp.get('type')
                if annotation_type is None:
                    annotation_type = 'curated'

                xref = dp.get('crossReference')
                cross_ref_id = xref.get('id')
                pages = xref.get('pages')

                if ":" in cross_ref_id:
                    prefix, local_crossref_id = cross_ref_id.split(":")[:2]
                else:
                    prefix = cross_ref_id
                    local_crossref_id = ""

                if not pages:
                    continue

                # The display name depends only on provider and prefix, so
                # it is resolved once per cross reference.
                if is_rgd_or_human and prefix == 'DOID':
                    display_name = 'RGD'
                elif is_rgd_or_human and prefix == 'OMIM':
                    display_name = 'OMIM'
                elif prefix == 'DOID':
                    display_name = data_provider
                else:
                    display_name = prefix

                is_loaded = 'loaded' in annotation_type

                for page in pages:
                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix, page)
                    passing_xref = ETLHelper.get_xref_dict(
                        local_crossref_id,
                        prefix,
                        page,
                        page,
                        display_name,
                        mod_global_cross_ref_id,
                        cross_ref_id + page + annotation_type)
                    passing_xref['dataId'] = disease_unique_key
                    passing_xref['loadedDB'] = 'true' if is_loaded else 'false'
                    passing_xref['curatedDB'] = 'false' if is_loaded else 'true'
                    xrefs.append(passing_xref)

        disease_row = {
            "diseaseUniqueKey": disease_unique_key,
            "doId": do_id,
            "primaryId": primary_id,
            "pecjPrimaryKey": pecj_primary_key,
            "relationshipType": disease_association_type,
            "dataProvider": data_provider,
            "dateAssigned": disease_record.get("dateAssigned"),
            "pubPrimaryKey": publication_mod_id + pub_med_id,
            "pubModId": publication_mod_id,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModUrl": pub_mod_url,
            "negation": negation
        }

        # Anything that is neither a gene nor an allele is treated as an AGM.
        if disease_object_type == 'gene':
            gene_list_to_yield.append(disease_row)
        elif disease_object_type == 'allele':
            allele_list_to_yield.append(disease_row)
        else:
            agm_list_to_yield.append(disease_row)

        if counter == batch_size:
            yield [
                allele_list_to_yield,
                gene_list_to_yield,
                agm_list_to_yield,
                pge_list_to_yield,
                pge_list_to_yield,
                pge_list_to_yield,
                withs,
                evidence_code_list_to_yield,
                xrefs
            ]
            # Rebind (rather than clear) so the batch just handed out is
            # left untouched.
            agm_list_to_yield = []
            allele_list_to_yield = []
            gene_list_to_yield = []
            evidence_code_list_to_yield = []
            pge_list_to_yield = []
            xrefs = []
            withs = []
            counter = 0

    if counter > 0:
        yield [
            allele_list_to_yield,
            gene_list_to_yield,
            agm_list_to_yield,
            pge_list_to_yield,
            pge_list_to_yield,
            pge_list_to_yield,
            withs,
            evidence_code_list_to_yield,
            xrefs
        ]
def get_generators(self, phenotype_data, batch_size):
    """Yield batches of phenotype-annotation rows parsed from ``phenotype_data``.

    Args:
        phenotype_data: dict with a ``metaData`` section (``dateProduced``
            and ``dataProvider.crossReference``) and a ``data`` list of
            phenotype annotation records.
        batch_size: number of accepted records collected before a batch is
            yielded.

    Yields:
        A list of five lists: the phenotype rows (the same list three times)
        followed by the primary-genetic-entity rows (the same list twice).
        A final partial batch is yielded if any records remain.
    """
    list_to_yield = []
    pge_list_to_yield = []
    counter = 0

    date_produced = phenotype_data['metaData']['dateProduced']
    data_provider_object = phenotype_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    load_key = date_produced + data_provider + "_phenotype"

    data_providers = []
    # NOTE(review): this list is filled but not read anywhere in this
    # function; it is kept because the helper calls that build it are
    # defined elsewhere — confirm whether it can be dropped.
    data_provider_cross_ref_set = []

    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                ETL.xref_url_map,
                data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider,
                data_provider,
                data_provider_page,
                data_provider_page,
                data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))

            # NOTE(review): the provider is recorded once per page, so
            # "dataProviders" is empty when no pages are listed — confirm
            # this is intended.
            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    for pheno in phenotype_data['data']:
        primary_id = pheno.get('objectId')
        phenotype_statement = pheno.get('phenotypeStatement')

        # When running against test data, only whitelisted ids are loaded;
        # skipped records do not count towards the batch size.
        if self.test_object.using_test_data() is True:
            if self.test_object.check_for_test_id_entry(primary_id) is False:
                continue

        counter += 1
        pecj_primary_key = str(uuid.uuid4())

        # Ids default to "" (never None) because they are concatenated into
        # "pubPrimaryKey" below.
        pub_med_id = ""
        pub_mod_id = ""
        pub_med_url = None
        pub_mod_url = None

        evidence = pheno.get('evidence')
        if 'publicationId' in evidence:
            publication_id = evidence.get('publicationId')
            if publication_id.startswith('PMID:'):
                pub_med_id = publication_id
                pub_med_prefix, local_pub_med_id = pub_med_id.split(":")[:2]
                pub_med_url = ETLHelper.get_no_page_complete_url(
                    local_pub_med_id,
                    self.xref_url_map,
                    pub_med_prefix,
                    primary_id)

                # A PubMed publication may additionally carry a MOD cross
                # reference; its id is only split once it is known to exist.
                if 'crossReference' in evidence:
                    xref_id = evidence.get('crossReference').get('id')
                    if xref_id is not None:
                        pub_mod_id = xref_id
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            xref_id.split(":")[1], xref_id)
            else:
                # Anything that is not a PMID is used as the MOD
                # publication id.
                pub_mod_id = publication_id
                pub_mod_url = ETLHelper.get_complete_pub_url(
                    pub_mod_id.split(":")[1], pub_mod_id)

        if not pub_mod_id and not pub_med_id:
            self.logger.info("%s is missing pubMed and pubMod id", primary_id)

        if 'primaryGeneticEntityIDs' in pheno:
            pge_list_to_yield.extend(
                {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
                for pge in pheno.get('primaryGeneticEntityIDs'))

        stripped_statement = phenotype_statement.strip()
        list_to_yield.append({
            "primaryId": primary_id,
            "phenotypeUniqueKey": primary_id + stripped_statement,
            "phenotypeStatement": stripped_statement,
            "dateAssigned": pheno.get('dateAssigned'),
            "loadKey": load_key,
            "type": "gene",
            "dataProviders": data_providers,
            "dataProvider": data_provider,
            "dateProduced": date_produced,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModId": pub_mod_id,
            "pubModUrl": pub_mod_url,
            "pubPrimaryKey": pub_med_id + pub_mod_id,
            "pecjPrimaryKey": pecj_primary_key
        })

        if counter == batch_size:
            yield [
                list_to_yield,
                list_to_yield,
                list_to_yield,
                pge_list_to_yield,
                pge_list_to_yield
            ]
            # Rebind (rather than clear) so the batch just handed out is
            # left untouched.
            list_to_yield = []
            pge_list_to_yield = []
            counter = 0

    if counter > 0:
        yield [
            list_to_yield,
            list_to_yield,
            list_to_yield,
            pge_list_to_yield,
            pge_list_to_yield
        ]