def get_generators(self, construct_data, data_provider, batch_size):
    """Create generators of construct records for batched Neo4j loading.

    Walks ``construct_data['data']`` and accumulates seven parallel lists
    (constructs, secondary ids, synonyms, cross references, non-BGI
    components, component details with/without gene ids), yielding all
    seven every ``batch_size`` records and once more for any remainder.

    :param construct_data: parsed submission JSON with 'metaData' and 'data' keys
    :param data_provider: immediately overwritten below from the file's
        metaData crossReference id — the incoming argument is never used
    :param batch_size: number of records to accumulate before each yield
    """
    data_providers = []
    release = ""
    constructs = []
    construct_synonyms = []
    construct_secondary_ids = []
    cross_reference_list = []
    component_details = []
    component_no_gene_details = []
    non_bgi_components = []
    counter = 0
    date_produced = construct_data['metaData']['dateProduced']
    data_provider_object = construct_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    # NOTE(review): shadows the data_provider parameter with the value from file metadata.
    data_provider = data_provider_cross_ref.get('id')
    self.logger.info("DataProvider: " + data_provider)
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_construct"

    # TODO: get SGD to fix their files.
    # Build one xref per metadata page; data_provider_cross_ref_set is
    # populated but never read again in this method.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    if 'release' in construct_data['metaData']:
        release = construct_data['metaData']['release']

    for construct_record in construct_data['data']:
        counter = counter + 1
        global_id = construct_record['primaryId']
        # assumes primaryId is always prefixed ("PREFIX:local") — TODO confirm
        local_id = global_id.split(":")[1]
        mod_global_cross_ref_id = ""

        # In test mode, silently skip records that are not test entries
        # (and undo the counter increment so batches stay full).
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        name_text = TextProcessingHelper.cleanhtml(
            construct_record.get('name'))

        construct_dataset = {
            "symbol": construct_record.get('name'),
            "primaryId": construct_record.get('primaryId'),
            "globalId": global_id,
            "localId": local_id,
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "release": release,
            # NOTE(review): always "" here — the per-xref value below is
            # assigned after this dict is built; presumably intentional,
            # verify against the loader query.
            "modGlobalCrossRefId": mod_global_cross_ref_id,
            "uuid": str(uuid.uuid4()),
            "dataProvider": data_provider,
            "nameText": name_text,
            "name": construct_record.get('name')
        }
        constructs.append(construct_dataset)

        if 'crossReferences' in construct_record:
            for cross_ref in construct_record.get('crossReferences'):
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page == 'construct':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map,
                                prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['dataId'] = global_id
                            cross_reference_list.append(xref)

        if 'constructComponents' in construct_record:
            for component in construct_record.get('constructComponents'):
                # already upper-cased here; the .upper() calls below are redundant
                component_relation = component.get(
                    'componentRelation').upper()
                component_symbol = component.get('componentSymbol')
                component_id = component.get('componentID')

                if component_id is not None:
                    # component resolvable to a gene id
                    component_detail = {
                        "componentRelation": component_relation.upper(),
                        "componentSymbol": component_symbol,
                        "componentID": component_id,
                        "constructID": construct_record.get('primaryId')
                    }
                    component_details.append(component_detail)
                else:
                    # symbol-only component (no gene id known)
                    component_detail = {
                        "componentRelation": component_relation.upper(),
                        "componentSymbol": component_symbol,
                        "constructID": construct_record.get('primaryId')
                    }
                    non_bgi_component = {
                        "componentSymbol": component_symbol
                    }
                    non_bgi_components.append(non_bgi_component)
                    component_no_gene_details.append(component_detail)

        if 'synonyms' in construct_record:
            for syn in construct_record.get('synonyms'):
                construct_synonym = {
                    "data_id": construct_record.get('primaryId'),
                    "synonym": syn.strip()
                }
                construct_synonyms.append(construct_synonym)

        if 'secondaryIds' in construct_record:
            for secondary_id in construct_record.get('secondaryIds'):
                construct_secondary_id = {
                    "data_id": construct_record.get('primaryId'),
                    "secondary_id": secondary_id
                }
                construct_secondary_ids.append(construct_secondary_id)

        # Flush a full batch and reset every accumulator.
        if counter == batch_size:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components,
                component_details, component_no_gene_details
            ]
            constructs = []
            construct_secondary_ids = []
            construct_synonyms = []
            cross_reference_list = []
            non_bgi_components = []
            component_details = []
            component_no_gene_details = []
            counter = 0

    # Final partial batch, if any records remain.
    if counter > 0:
        yield [
            constructs, construct_secondary_ids, construct_synonyms,
            cross_reference_list, non_bgi_components,
            component_details, component_no_gene_details
        ]
def get_generators(self, disease_data, batch_size, data_provider):
    """Create generators of disease-annotation records for batched loading.

    Sorts each annotation into a gene, allele, or AGM list based on the
    record's objectRelation.objectType, and yields nine parallel lists
    every ``batch_size`` records plus one final partial batch.

    :param disease_data: parsed submission JSON with 'metaData' and 'data' keys
    :param batch_size: number of records to accumulate before each yield
    :param data_provider: immediately overwritten below from the file's
        metaData crossReference id — the incoming argument is never used
    """
    counter = 0
    disease_association_type = None
    gene_list_to_yield = []
    allele_list_to_yield = []
    agm_list_to_yield = []
    evidence_code_list_to_yield = []
    withs = []
    pge_list_to_yield = []
    xrefs = []
    data_provider_object = disease_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    # NOTE(review): shadows the data_provider parameter with the metadata value.
    data_provider = data_provider_cross_ref.get('id')

    for disease_record in disease_data['data']:
        publication_mod_id = ""
        pub_med_id = ""
        pub_mod_url = None
        pub_med_url = None
        pge_key = ''

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                disease_record.get('objectId'))
            if is_it_test_entry is False:
                continue

        # Unique key built by ordered concatenation: objectId + DOid +
        # association type, later extended with negation and 'with' ids.
        disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \
            disease_record['objectRelation'].get("associationType").upper()

        counter = counter + 1
        disease_object_type = disease_record['objectRelation'].get(
            "objectType")
        primary_id = disease_record.get('objectId')
        do_id = disease_record.get('DOid')

        if 'evidence' in disease_record:
            # NOTE(review): pecj_primary_key is only (re)assigned when
            # 'evidence' is present; a record without it would reuse the
            # previous iteration's key or raise NameError on the first
            # record — presumably 'evidence' is required. TODO confirm.
            pecj_primary_key = str(uuid.uuid4())
            evidence = disease_record.get('evidence')
            if 'publication' in evidence:
                publication = evidence.get('publication')
                if publication.get('publicationId').startswith('PMID:'):
                    pub_med_id = publication.get('publicationId')
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_url = ETLHelper.get_complete_pub_url(
                        local_pub_med_id, pub_med_id)
                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        publication_mod_id = pub_xref.get('id')
                        local_pub_mod_id = publication_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            local_pub_mod_id, publication_mod_id)
                else:
                    # non-PMID publication id: treat it as the MOD pub id
                    publication_mod_id = publication.get('publicationId')
                    local_pub_mod_id = publication_mod_id.split(":")[1]
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        local_pub_mod_id, publication_mod_id)

            if 'evidenceCodes' in disease_record['evidence']:
                for ecode in disease_record['evidence'].get(
                        'evidenceCodes'):
                    ecode_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "ecode": ecode
                    }
                    evidence_code_list_to_yield.append(ecode_map)

        negation = ''
        if 'objectRelation' in disease_record:
            disease_association_type = disease_record[
                'objectRelation'].get("associationType").upper()

            if 'negation' in disease_record:
                # this capitalization is purposeful
                if disease_association_type == 'IS_IMPLICATED_IN':
                    disease_association_type = 'IS_NOT_IMPLICATED_IN'
                if disease_association_type == 'IS_MODEL_OF':
                    disease_association_type = 'IS_NOT_MODEL_OF'
                if disease_association_type == 'IS_MARKER_FOR':
                    disease_association_type = 'IS_NOT_MARKER_FOR'
                negation = 'NOT'
                disease_unique_key = disease_unique_key + negation

            # Collected but never yielded from this method — presumably
            # consumed elsewhere or vestigial. TODO confirm.
            additional_genetic_components = []
            if 'additionalGeneticComponents' in disease_record[
                    'objectRelation']:
                for component in disease_record['objectRelation'][
                        'additionalGeneticComponents']:
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentId')
                    component_url = component.get(
                        'componentUrl') + component_id
                    additional_genetic_components.append({
                        "id": component_id,
                        "componentUrl": component_url,
                        "componentSymbol": component_symbol
                    })

        if 'with' in disease_record:
            with_record = disease_record.get('with')
            # First pass extends the unique key with ALL 'with' ids so
            # every with_map below carries the fully-extended key.
            for rec in with_record:
                disease_unique_key = disease_unique_key + rec
            for rec in with_record:
                with_map = {
                    "diseaseUniqueKey": disease_unique_key,
                    "withD": rec
                }
                withs.append(with_map)

        if 'primaryGeneticEntityIDs' in disease_record:
            pge_ids = disease_record.get('primaryGeneticEntityIDs')
            for pge in pge_ids:
                pge_key = pge_key + pge
                pge_map = {
                    "pecjPrimaryKey": pecj_primary_key,
                    "pgeId": pge
                }
                pge_list_to_yield.append(pge_map)

        if 'dataProvider' in disease_record:
            for dp in disease_record['dataProvider']:
                annotation_type = dp.get('type')
                xref = dp.get('crossReference')
                cross_ref_id = xref.get('id')
                pages = xref.get('pages')

                if ":" in cross_ref_id:
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref_id.split(":")[0]
                else:
                    local_crossref_id = ""
                    prefix = cross_ref_id

                if annotation_type is None:
                    annotation_type = 'curated'

                if pages is not None and len(pages) > 0:
                    for page in pages:
                        # Special-case display names for RGD/HUMAN
                        # provided DOID/OMIM cross references.
                        if (data_provider == 'RGD'
                                or data_provider == 'HUMAN') \
                                and prefix == 'DOID':
                            display_name = 'RGD'
                        elif (data_provider == 'RGD'
                                or data_provider == 'HUMAN') \
                                and prefix == 'OMIM':
                            display_name = 'OMIM'
                        else:
                            display_name = cross_ref_id.split(":")[0]
                            if display_name == 'DOID':
                                display_name = data_provider

                        mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map,
                            prefix, page)
                        passing_xref = ETLHelper.get_xref_dict(
                            local_crossref_id, prefix, page, page,
                            display_name, mod_global_cross_ref_id,
                            cross_ref_id + page + annotation_type)
                        passing_xref['dataId'] = disease_unique_key

                        # 'loaded' annotations are flagged distinctly
                        # from curated ones (string booleans).
                        if 'loaded' in annotation_type:
                            passing_xref['loadedDB'] = 'true'
                            passing_xref['curatedDB'] = 'false'
                        else:
                            passing_xref['curatedDB'] = 'true'
                            passing_xref['loadedDB'] = 'false'

                        xrefs.append(passing_xref)

        # NOTE(review): rebinds the loop variable disease_record to the
        # output dict; safe only because the raw record is not read after
        # this point in the iteration.
        disease_record = {
            "diseaseUniqueKey": disease_unique_key,
            "doId": do_id,
            "primaryId": primary_id,
            "pecjPrimaryKey": pecj_primary_key,
            "relationshipType": disease_association_type.upper(),
            "dataProvider": data_provider,
            "dateAssigned": disease_record.get("dateAssigned"),
            "pubPrimaryKey": publication_mod_id + pub_med_id,
            "pubModId": publication_mod_id,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModUrl": pub_mod_url,
            "negation": negation
        }

        if disease_object_type == 'gene':
            gene_list_to_yield.append(disease_record)
        elif disease_object_type == 'allele':
            allele_list_to_yield.append(disease_record)
        else:
            agm_list_to_yield.append(disease_record)

        if counter == batch_size:
            # pge_list_to_yield appears three times — presumably consumed
            # once each by the gene/allele/AGM queries. TODO confirm.
            yield [
                allele_list_to_yield, gene_list_to_yield,
                agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                pge_list_to_yield, withs, evidence_code_list_to_yield,
                xrefs
            ]
            agm_list_to_yield = []
            allele_list_to_yield = []
            gene_list_to_yield = []
            evidence_code_list_to_yield = []
            pge_list_to_yield = []
            xrefs = []
            withs = []
            counter = 0

    if counter > 0:
        yield [
            allele_list_to_yield, gene_list_to_yield, agm_list_to_yield,
            pge_list_to_yield, pge_list_to_yield, pge_list_to_yield,
            withs, evidence_code_list_to_yield, xrefs
        ]
def get_generators(self, sqtr_data, data_provider, batch_size):
    """Create generators of sequence-targeting-reagent (SqTR) records.

    Accumulates four parallel lists (reagents, secondary ids, synonyms,
    target-gene links) and yields them every ``batch_size`` records, plus
    one final partial batch.

    :param sqtr_data: parsed submission JSON with 'metaData' and 'data' keys
    :param data_provider: immediately overwritten below from the file's
        metaData crossReference id — the incoming argument is never used
    :param batch_size: number of records to accumulate before each yield
    """
    data_providers = []
    sqtrs = []
    sqtr_synonyms = []
    sqtr_secondary_ids = []
    mod_global_cross_ref_url = ""
    tgs = []
    counter = 0
    date_produced = sqtr_data['metaData']['dateProduced']
    data_provider_object = sqtr_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    # Shadows the data_provider parameter with the metadata value.
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_SqTR"

    # Build one xref per metadata page; data_provider_cross_ref_set is
    # populated but never read again in this method.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider, cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    for sqtr_record in sqtr_data['data']:
        counter = counter + 1
        global_id = sqtr_record['primaryId']
        # assumes primaryId is always prefixed ("PREFIX:local") — TODO confirm
        local_id = global_id.split(":")[1]

        # In test mode, silently skip records that are not test entries
        # (and undo the counter increment so batches stay full).
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        if sqtr_record.get('secondaryIds') is not None:
            for sid in sqtr_record.get('secondaryIds'):
                sqtr_secondary_id_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "secondaryId": sid
                }
                sqtr_secondary_ids.append(sqtr_secondary_id_dataset)

        if sqtr_record.get('synonyms') is not None:
            for syn in sqtr_record.get('synonyms'):
                syn_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "synonym": syn
                }
                sqtr_synonyms.append(syn_dataset)

        if sqtr_record.get('targetGeneIds') is not None:
            for target_gene_id in sqtr_record.get('targetGeneIds'):
                tg_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "geneId": target_gene_id
                }
                tgs.append(tg_dataset)

        if 'crossReferences' in sqtr_record:
            # BUGFIX: previously iterated sqtr_record['modCrossReference']
            # while guarding on 'crossReferences', raising KeyError for any
            # record that actually carried the guarded key. Iterate the key
            # we checked, matching the construct/expression generators.
            for cross_ref in sqtr_record['crossReferences']:
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is None or len(pages) == 0:
                    continue
                if 'sequence_targeting_reagent' in pages:
                    page = 'sequence_targeting_reagent'
                    mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix, page)

        sqtr_dataset = {
            "primaryId": sqtr_record.get('primaryId'),
            "name": sqtr_record.get('name'),
            "globalId": global_id,
            "localId": local_id,
            "soTerm": sqtr_record.get('soTermId'),
            "taxonId": sqtr_record.get('taxonId'),
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "modGlobalCrossRefUrl": mod_global_cross_ref_url,
            "dataProvider": data_provider
        }
        sqtrs.append(sqtr_dataset)

        # Flush a full batch and reset every accumulator.
        if counter == batch_size:
            yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
            sqtrs = []
            sqtr_secondary_ids = []
            sqtr_synonyms = []
            tgs = []
            counter = 0

    # Final partial batch, if any records remain.
    if counter > 0:
        yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
def get_generators(self, agm_data, data_provider, batch_size):
    """Create generators of affected-genomic-model (AGM) records.

    Accumulates six parallel lists (AGMs, secondary ids, synonyms, allele
    components, sequence-targeting reagents, background populations) and
    yields them every ``batch_size`` records, plus one final partial batch.

    :param agm_data: parsed submission JSON with 'metaData' and 'data' keys
    :param data_provider: immediately overwritten below from the file's
        metaData crossReference id — the incoming argument is never used
    :param batch_size: number of records to accumulate before each yield
    """
    data_providers = []
    agms = []
    agm_synonyms = []
    agm_secondary_ids = []
    mod_global_cross_ref_url = ""
    components = []
    backgrounds = []
    sqtrs = []
    counter = 0
    date_produced = agm_data['metaData']['dateProduced']
    data_provider_object = agm_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    # Shadows the data_provider parameter with the metadata value.
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_agm"

    # Build one xref per metadata page; data_provider_cross_ref_set is
    # populated but never read again in this method.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider, cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    for agm_record in agm_data['data']:
        counter = counter + 1
        global_id = agm_record['primaryID']
        # assumes primaryID is always prefixed ("PREFIX:local") — TODO confirm
        local_id = global_id.split(":")[1]

        # In test mode, silently skip records that are not test entries
        # (and undo the counter increment so batches stay full).
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        if agm_record.get('secondaryIds') is not None:
            for sid in agm_record.get('secondaryIds'):
                agm_secondary_id_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "secondaryId": sid
                }
                agm_secondary_ids.append(agm_secondary_id_dataset)

        if agm_record.get('synonyms') is not None:
            for syn in agm_record.get('synonyms'):
                syn_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "synonym": syn
                }
                agm_synonyms.append(syn_dataset)

        if 'crossReference' in agm_record:
            cross_ref = agm_record.get('crossReference')
            cross_ref_id = cross_ref.get('id')
            local_crossref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref.get('id').split(":")[0]
            pages = cross_ref.get('pages')

            # some pages collection have 0 elements
            if pages is not None and len(pages) > 0:
                for page in pages:
                    if page in ['Fish', 'genotype', 'strain']:
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map,
                            prefix, page)

        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
            agm_record.get('taxonId'))
        name_text = TextProcessingHelper.cleanhtml(agm_record.get('name'))

        # TODO: make subtype required in submission file.
        subtype = agm_record.get('subtype')
        if subtype is None and data_provider == 'WB':
            subtype = 'strain'
        if subtype is None:
            subtype = 'affected_genomic_model'

        # TODO: name_text
        agm_dataset = {
            "primaryId": agm_record.get('primaryID'),
            "name": agm_record.get('name'),
            "globalId": global_id,
            "localId": local_id,
            "taxonId": agm_record.get('taxonId'),
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "subtype": subtype,
            "modGlobalCrossRefUrl": mod_global_cross_ref_url,
            "dataProvider": data_provider,
            "nameText": name_text,
            "nameWithSpecies": agm_record.get('name') + " ("
                               + short_species_abbreviation + ")",
            "nameTextWithSpecies": name_text + " ("
                                   + short_species_abbreviation + ")",
        }
        agms.append(agm_dataset)

        if agm_record.get('affectedGenomicModelComponents') is not None:
            for component in agm_record.get(
                    'affectedGenomicModelComponents'):
                component_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "componentId": component.get('alleleID'),
                    "zygosityId": component.get('zygosity')
                }
                components.append(component_dataset)

        if agm_record.get('sequenceTargetingReagentIDs') is not None:
            for sqtr in agm_record.get('sequenceTargetingReagentIDs'):
                sqtr_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "sqtrId": sqtr
                }
                sqtrs.append(sqtr_dataset)

        if agm_record.get('parentalPopulationIDs') is not None:
            for background in agm_record.get('parentalPopulationIDs'):
                background_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "backgroundId": background
                }
                backgrounds.append(background_dataset)

        # Flush a full batch and reset every accumulator.
        if counter == batch_size:
            yield [
                agms, agm_secondary_ids, agm_synonyms,
                components, sqtrs, backgrounds
            ]
            agms = []
            agm_secondary_ids = []
            agm_synonyms = []
            components = []
            backgrounds = []
            # BUGFIX: sqtrs was the only accumulator not cleared after a
            # batch, so earlier batches' reagent links were re-yielded in
            # every subsequent batch.
            sqtrs = []
            counter = 0

    # Final partial batch, if any records remain.
    if counter > 0:
        yield [
            agms, agm_secondary_ids, agm_synonyms,
            components, sqtrs, backgrounds
        ]
def get_generators(self, phenotype_data, batch_size):
    """Create generators of phenotype-annotation records for batched loading.

    Yields five parallel lists every ``batch_size`` records (the phenotype
    list three times and the primary-genetic-entity list twice — presumably
    consumed by separate downstream queries; TODO confirm), plus one final
    partial batch.

    :param phenotype_data: parsed submission JSON with 'metaData' and 'data' keys
    :param batch_size: number of records to accumulate before each yield
    """
    list_to_yield = []
    pge_list_to_yield = []
    date_produced = phenotype_data['metaData']['dateProduced']
    data_providers = []
    data_provider_object = phenotype_data['metaData']['dataProvider']
    counter = 0
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    pge_key = ''
    load_key = date_produced + data_provider + "_phenotype"

    # Build one xref per metadata page; data_provider_cross_ref_set is
    # populated but never read again in this method.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, ETL.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider, cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    for pheno in phenotype_data['data']:
        pecj_primary_key = str(uuid.uuid4())
        counter = counter + 1
        pub_med_id = None
        pub_mod_id = None
        pub_med_url = None
        pub_mod_url = None
        primary_id = pheno.get('objectId')
        # NOTE(review): .strip() below assumes phenotypeStatement is always
        # present (non-None) — TODO confirm against the schema.
        phenotype_statement = pheno.get('phenotypeStatement')

        # In test mode, silently skip records that are not test entries
        # (and undo the counter increment so batches stay full).
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                primary_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        # assumes 'evidence' is always present — TODO confirm
        evidence = pheno.get('evidence')

        if 'publicationId' in evidence:
            if evidence.get('publicationId').startswith('PMID:'):
                pub_med_id = evidence['publicationId']
                local_pub_med_id = pub_med_id.split(":")[1]
                pub_med_prefix = pub_med_id.split(":")[0]
                pub_med_url = ETLHelper.get_no_page_complete_url(
                    local_pub_med_id, self.xref_url_map, pub_med_prefix,
                    primary_id)
                # NOTE(review): pub_med_id was just assigned above and
                # cannot be None here — dead check kept as-is.
                if pub_med_id is None:
                    pub_med_id = ""

                if 'crossReference' in evidence:
                    pub_xref = evidence.get('crossReference')
                    pub_mod_id = pub_xref.get('id')
                    # NOTE(review): split happens before the None check —
                    # would raise AttributeError if the xref has no id.
                    pub_mod_local_id = pub_mod_id.split(":")[1]
                    if pub_mod_id is not None:
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)
            else:
                # non-PMID publication id: treat it as the MOD pub id
                pub_mod_id = evidence.get('publicationId')
                if pub_mod_id is not None:
                    pub_mod_local_id = pub_mod_id.split(":")[1]
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        pub_mod_local_id, pub_mod_id)

            if pub_mod_id is None:
                pub_mod_id = ""

        # Normalize missing ids to empty strings for key concatenation.
        if pub_med_id is None:
            pub_med_id = ""

        if pub_mod_id is None:
            pub_mod_id = ""

        date_assigned = pheno.get('dateAssigned')

        # NOTE(review): after the normalization above both ids are at
        # least "", never None, so this warning can never fire.
        if pub_mod_id is None and pub_med_id is None:
            self.logger.info("%s is missing pubMed and pubMod id",
                             primary_id)

        if 'primaryGeneticEntityIDs' in pheno:
            pge_ids = pheno.get('primaryGeneticEntityIDs')
            for pge in pge_ids:
                pge_key = pge_key + pge
                pge_map = {
                    "pecjPrimaryKey": pecj_primary_key,
                    "pgeId": pge
                }
                pge_list_to_yield.append(pge_map)

        phenotype = {
            "primaryId": primary_id,
            "phenotypeUniqueKey": primary_id + phenotype_statement.strip(),
            "phenotypeStatement": phenotype_statement.strip(),
            "dateAssigned": date_assigned,
            "loadKey": load_key,
            "type": "gene",
            "dataProviders": data_providers,
            "dataProvider": data_provider,
            "dateProduced": date_produced,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModId": pub_mod_id,
            "pubModUrl": pub_mod_url,
            "pubPrimaryKey": pub_med_id + pub_mod_id,
            "pecjPrimaryKey": pecj_primary_key
        }
        list_to_yield.append(phenotype)

        # Flush a full batch and reset the accumulators.
        if counter == batch_size:
            yield [
                list_to_yield, list_to_yield, list_to_yield,
                pge_list_to_yield, pge_list_to_yield
            ]
            list_to_yield = []
            pge_list_to_yield = []
            counter = 0

    # Final partial batch, if any records remain.
    if counter > 0:
        yield [
            list_to_yield, list_to_yield, list_to_yield,
            pge_list_to_yield, pge_list_to_yield
        ]
def get_generators(self, expression_file, batch_size):
    """Create generators of expression records streamed from a JSON file.

    Streams ``data`` items from *expression_file* with ijson (so the whole
    file is never held in memory), builds seventeen parallel lists keyed
    by two composite keys — ``expression_unique_key`` (gene + assay +
    stage + anatomy terms) and ``expression_entity_unique_key`` (anatomy
    terms + statement) — and yields all seventeen every ``batch_size``
    records plus one final partial batch.

    :param expression_file: path to the expression submission JSON file
    :param batch_size: number of records to accumulate before each yield
    """
    self.logger.debug("made it to the expression generator")

    counter = 0
    cross_references = []
    bio_entities = []
    bio_join_entities = []
    bio_entity_gene_aos = []
    pubs = []
    ao_expressions = []
    cc_expressions = []
    ao_qualifiers = []
    ao_substructures = []
    ao_ss_qualifiers = []
    cc_qualifiers = []
    ao_cc_expressions = []
    stage_list = []
    stage_uberon_data = []
    uberon_ao_data = []
    uberon_ao_other_data = []
    uberon_stage_other_data = []

    self.logger.debug("streaming json data from %s ...", expression_file)
    with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
        for xpat in ijson.items(file_handle, 'data.item'):
            counter = counter + 1
            pub_med_url = None
            pub_mod_url = None
            pub_med_id = ""
            publication_mod_id = ""
            stage_term_id = ""
            stage_name = ""
            stage_uberon_term_id = ""
            gene_id = xpat.get('geneId')

            # In test mode, silently skip records that are not test
            # entries (and undo the counter increment).
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    gene_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            # assumes 'evidence' is always present — TODO confirm
            evidence = xpat.get('evidence')

            if 'publicationId' in evidence:
                if evidence.get('publicationId').startswith('PMID:'):
                    pub_med_id = evidence.get('publicationId')
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map,
                        pub_med_prefix, gene_id)
                    # NOTE(review): pub_med_id was just assigned and
                    # cannot be None here — dead check kept as-is.
                    if pub_med_id is None:
                        pub_med_id = ""

                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        publication_mod_id = pub_xref.get('id')
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                publication_mod_id)
                else:
                    # non-PMID publication id: treat it as the MOD pub id
                    publication_mod_id = evidence['publicationId']
                    if publication_mod_id is not None:
                        pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                            publication_mod_id)

                if publication_mod_id is None:
                    publication_mod_id = ""

            assay = xpat.get('assay')

            # NOTE(review): the variables bound in this branch are read
            # unconditionally below — presumably 'whereExpressed' is a
            # required field; a record without it would reuse values from
            # the previous iteration or raise NameError. TODO confirm.
            if 'whereExpressed' in xpat:
                where_expressed = xpat.get('whereExpressed')
                cellular_component_qualifier_term_id = \
                    where_expressed.get('cellularComponentQualifierTermId')
                cellular_component_term_id = where_expressed.get(
                    'cellularComponentTermId')
                anatomical_structure_term_id = where_expressed.get(
                    'anatomicalStructureTermId')
                anatomical_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalStructureQualifierTermId')
                anatomical_sub_structure_term_id = \
                    where_expressed.get('anatomicalSubStructureTermId')
                anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalSubStructureQualifierTermId')
                where_expressed_statement = where_expressed.get(
                    'whereExpressedStatement')

            when_expressed_stage = xpat.get('whenExpressed')

            if 'stageTermId' in when_expressed_stage:
                stage_term_id = when_expressed_stage.get('stageTermId')
            if 'stageName' in when_expressed_stage:
                stage_name = when_expressed_stage.get('stageName')

            # TODO: making unique BioEntityGeneExpressionJoin nodes
            # and ExpressionBioEntity nodes is tedious.
            # TODO: Lets get the DQMs to fix this.
            # Composite keys built by ordered concatenation; the order of
            # the appends below is significant and must not change.
            expression_unique_key = gene_id + assay + stage_name
            expression_entity_unique_key = ""

            if anatomical_structure_term_id is not None:
                expression_unique_key += anatomical_structure_term_id
                expression_entity_unique_key = anatomical_structure_term_id

            if anatomical_structure_qualifier_term_id is not None:
                expression_unique_key += \
                    anatomical_structure_qualifier_term_id
                expression_entity_unique_key += \
                    anatomical_structure_qualifier_term_id

            if cellular_component_term_id is not None:
                expression_unique_key += cellular_component_term_id
                expression_entity_unique_key += cellular_component_term_id

            if cellular_component_qualifier_term_id is not None:
                expression_unique_key += \
                    cellular_component_qualifier_term_id
                expression_entity_unique_key += \
                    cellular_component_qualifier_term_id

            # NOTE(review): unlike its siblings, the sub-structure term is
            # only added to the ei key, not the ebe key — TODO confirm
            # this asymmetry is intentional.
            if anatomical_sub_structure_term_id is not None:
                expression_unique_key += anatomical_sub_structure_term_id

            if anatomical_sub_structure_qualifier_term_id is not None:
                expression_unique_key += \
                    anatomical_sub_structure_qualifier_term_id
                expression_entity_unique_key \
                    += anatomical_sub_structure_qualifier_term_id

            expression_entity_unique_key += where_expressed_statement
            expression_unique_key += where_expressed_statement

            if where_expressed.get(
                    'anatomicalStructureUberonSlimTermIds') is not None:
                for uberon_structure_term_object in \
                        where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                    structure_uberon_term_id = \
                        uberon_structure_term_object.get('uberonTerm')
                    if structure_uberon_term_id is not None \
                            and structure_uberon_term_id != 'Other':
                        structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "aoUberonId": structure_uberon_term_id
                        }
                        uberon_ao_data.append(structure_uberon_term)
                    elif structure_uberon_term_id is not None \
                            and structure_uberon_term_id == 'Other':
                        # 'Other' is a sentinel slim term handled by a
                        # separate downstream query.
                        other_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key
                        }
                        uberon_ao_other_data.append(
                            other_structure_uberon_term)

            if where_expressed.get(
                    'anatomicalSubStructureUberonSlimTermIds') is not None:
                for uberon_sub_structure_term_object in \
                        where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                    sub_structure_uberon_term_id = \
                        uberon_sub_structure_term_object.get('uberonTerm')
                    if sub_structure_uberon_term_id is not None \
                            and sub_structure_uberon_term_id != 'Other':
                        sub_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "aoUberonId": sub_structure_uberon_term_id
                        }
                        uberon_ao_data.append(sub_structure_uberon_term)
                    elif sub_structure_uberon_term_id is not None \
                            and sub_structure_uberon_term_id == 'Other':
                        other_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key
                        }
                        uberon_ao_other_data.append(
                            other_structure_uberon_term)

            if cellular_component_term_id is None:
                cellular_component_term_id = ""

            if when_expressed_stage.get('stageUberonSlimTerm') is not None:
                stage_uberon_term_object = when_expressed_stage.get(
                    'stageUberonSlimTerm')
                stage_uberon_term_id = stage_uberon_term_object.get(
                    "uberonTerm")
                # "post embryonic, pre-adult" is a sentinel handled by a
                # separate 'other' list.
                if stage_uberon_term_id is not None \
                        and stage_uberon_term_id != "post embryonic, pre-adult":
                    stage_uberon = {
                        "uberonStageId": stage_uberon_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    stage_uberon_data.append(stage_uberon)
                if stage_uberon_term_id == "post embryonic, pre-adult":
                    stage_uberon_other = {
                        "ei_uuid": expression_unique_key
                    }
                    uberon_stage_other_data.append(stage_uberon_other)

            if stage_term_id is None or stage_name == 'N/A':
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""

            if stage_name is not None:
                stage = {
                    "stageTermId": stage_term_id,
                    "stageName": stage_name,
                    "ei_uuid": expression_unique_key
                }
                stage_list.append(stage)
            else:
                stage_uberon_term_id = ""

            if 'crossReference' in xpat:
                cross_ref = xpat.get('crossReference')
                cross_ref_id = cross_ref.get('id')
                local_cross_ref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page == 'gene/expression/annotation/detail':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                local_cross_ref_id, self.xref_url_map,
                                prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_cross_ref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['ei_uuid'] = expression_unique_key
                            cross_references.append(xref)

            bio_entity = {
                "ebe_uuid": expression_entity_unique_key,
                "whereExpressedStatement": where_expressed_statement
            }
            bio_entities.append(bio_entity)

            bio_join_entity = {
                "ei_uuid": expression_unique_key,
                "assay": assay
            }
            bio_join_entities.append(bio_join_entity)

            bio_entity_gene_ao = {
                "geneId": gene_id,
                "ebe_uuid": expression_entity_unique_key,
                "anatomicalStructureTermId": anatomical_structure_term_id,
                "ei_uuid": expression_unique_key
            }
            bio_entity_gene_aos.append(bio_entity_gene_ao)

            pub = {
                "ei_uuid": expression_unique_key,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url
            }
            pubs.append(pub)

            ao_expression = {
                "geneId": gene_id,
                "whenExpressedStage": when_expressed_stage,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "uuid": str(uuid.uuid4()),
                "assay": assay,
                "anatomicalStructureTermId": anatomical_structure_term_id,
                "whereExpressedStatement": where_expressed_statement,
                "ei_uuid": expression_unique_key,
                "ebe_uuid": expression_entity_unique_key
            }
            ao_expressions.append(ao_expression)

            if cellular_component_qualifier_term_id is not None:
                cc_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "cellularComponentQualifierTermId":
                        cellular_component_qualifier_term_id
                }
                cc_qualifiers.append(cc_qualifier)

            if anatomical_structure_term_id is None:
                anatomical_structure_term_id = ""

            cc_expression = {
                "geneId": gene_id,
                "whenExpressedStage": when_expressed_stage,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "assay": assay,
                "whereExpressedStatement": where_expressed_statement,
                "cellularComponentTermId": cellular_component_term_id,
                "ei_uuid": expression_unique_key,
                "ebe_uuid": expression_entity_unique_key
            }
            cc_expressions.append(cc_expression)

            if anatomical_structure_qualifier_term_id is not None:
                ao_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalStructureQualifierTermId":
                        anatomical_structure_qualifier_term_id
                }
                ao_qualifiers.append(ao_qualifier)

            if anatomical_sub_structure_term_id is not None:
                ao_substructure = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalSubStructureTermId":
                        anatomical_sub_structure_term_id
                }
                ao_substructures.append(ao_substructure)

            if anatomical_sub_structure_qualifier_term_id is not None:
                ao_ss_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalSubStructureQualifierTermId":
                        anatomical_sub_structure_qualifier_term_id
                }
                ao_ss_qualifiers.append(ao_ss_qualifier)

            if where_expressed_statement is None:
                where_expressed_statement = ""

            # Records carrying BOTH an anatomy term and a cellular
            # component term get a combined AO/CC row as well.
            if anatomical_structure_term_id is not None \
                    and anatomical_structure_term_id != "" \
                    and cellular_component_term_id is not None \
                    and cellular_component_term_id != "":
                ao_cc_expression = {
                    "geneId": gene_id,
                    "whenExpressedStage": when_expressed_stage,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url,
                    "pubPrimaryKey": pub_med_id + publication_mod_id,
                    "uuid": str(uuid.uuid4()),
                    "stageTermId": stage_term_id,
                    "stageName": stage_name,
                    "stageUberonTermId": stage_uberon_term_id,
                    "assay": assay,
                    "cellularComponentTermId": cellular_component_term_id,
                    "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                    "whereExpressedStatement": where_expressed_statement,
                    "ei_uuid": expression_unique_key,
                    "ebe_uuid": expression_entity_unique_key
                }
                ao_cc_expressions.append(ao_cc_expression)

            # Flush a full batch and reset every accumulator.
            if counter == batch_size:
                yield [
                    bio_entities,
                    bio_entity_gene_aos,
                    bio_join_entities,
                    ao_expressions,
                    cc_expressions,
                    ao_cc_expressions,
                    ao_qualifiers,
                    ao_substructures,
                    ao_ss_qualifiers,
                    cc_qualifiers,
                    stage_list,
                    stage_uberon_data,
                    uberon_ao_data,
                    uberon_ao_other_data,
                    uberon_stage_other_data,
                    cross_references,
                    pubs
                ]
                bio_entities = []
                bio_join_entities = []
                ao_expressions = []
                cc_expressions = []
                ao_qualifiers = []
                ao_substructures = []
                ao_ss_qualifiers = []
                cc_qualifiers = []
                ao_cc_expressions = []
                stage_list = []
                uberon_stage_other_data = []
                stage_uberon_data = []
                uberon_ao_other_data = []
                uberon_ao_data = []
                cross_references = []
                bio_entity_gene_aos = []
                pubs = []
                counter = 0

    # Final partial batch, if any records remain.
    if counter > 0:
        yield [
            bio_entities,
            bio_entity_gene_aos,
            bio_join_entities,
            ao_expressions,
            cc_expressions,
            ao_cc_expressions,
            ao_qualifiers,
            ao_substructures,
            ao_ss_qualifiers,
            cc_qualifiers,
            stage_list,
            stage_uberon_data,
            uberon_ao_data,
            uberon_ao_other_data,
            uberon_stage_other_data,
            cross_references,
            pubs
        ]
def get_generators(self, allele_data, batch_size):
    """Create generators of allele record batches from an allele JSON payload.

    Walks ``allele_data['data']`` and yields lists of seven collections, in
    this order:
    [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
     alleles_no_construct_no_gene, allele_secondary_ids, allele_synonyms,
     cross_reference_list]

    A batch is yielded every ``batch_size`` records and once more at the end
    for any remainder.

    :param allele_data: parsed allele submission JSON (``metaData`` + ``data``)
    :param batch_size: number of records per yielded batch
    """
    data_providers = []
    release = ""
    alleles_no_construct_no_gene = []
    alleles_construct_gene = []
    alleles_no_construct = []
    alleles_no_gene = []
    allele_synonyms = []
    allele_secondary_ids = []
    cross_reference_list = []
    counter = 0
    date_produced = allele_data['metaData']['dateProduced']

    data_provider_object = allele_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_ALLELE"

    # TODO: get SGD to fix their files.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider, cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            # lazy %-formatting so the message is only built if emitted
            logger.info("data provider: %s", data_provider)

    if 'release' in allele_data['metaData']:
        release = allele_data['metaData']['release']

    for allele_record in allele_data['data']:
        counter = counter + 1
        global_id = allele_record['primaryId']

        # fixing parsing error on this end while MGI fixes on their end.
        if global_id == 'MGI:3826848':
            description = allele_record.get('description')[:-2]
        else:
            description = allele_record.get('description')

        local_id = global_id.split(":")[1]
        mod_global_cross_ref_id = ""

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        gene_id = ''
        construct_id = ''
        association_type = ''
        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
            allele_record.get('taxonId'))
        symbol_text = TextProcessingHelper.cleanhtml(
            allele_record.get('symbol'))

        def base_allele_dataset():
            # Fields shared by every allele dataset variant; gene/construct
            # ids are added by the caller depending on the relation found.
            # Reads the current values of association_type / description etc.
            # from the enclosing per-record scope.
            return {
                "symbol": allele_record.get('symbol'),
                "primaryId": allele_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "taxonId": allele_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "symbolWithSpecies": allele_record.get('symbol')
                                     + " (" + short_species_abbreviation + ")",
                "symbolTextWithSpecies": symbol_text
                                         + " (" + short_species_abbreviation + ")",
                "symbolText": symbol_text,
                "alleleDescription": description,
                "associationType": association_type
            }

        if allele_record.get('alleleObjectRelations') is not None:
            for relation in allele_record.get('alleleObjectRelations'):
                object_relation = relation.get('objectRelation')
                association_type = object_relation.get('associationType')
                if object_relation.get('gene') is not None:
                    gene_id = object_relation.get('gene')
                if object_relation.get('construct') is not None:
                    construct_id = object_relation.get('construct')

                # Route the record into one of four buckets depending on
                # which of gene/construct the relation carries.
                dataset = base_allele_dataset()
                if gene_id != '' and construct_id != '':
                    dataset["geneId"] = gene_id
                    dataset["constructId"] = construct_id
                    alleles_construct_gene.append(dataset)
                elif construct_id != '' and gene_id == '':
                    dataset["constructId"] = construct_id
                    alleles_no_gene.append(dataset)
                elif gene_id != '' and construct_id == '':
                    dataset["geneId"] = gene_id
                    alleles_no_construct.append(dataset)
                else:  # neither gene nor construct on this relation
                    alleles_no_construct_no_gene.append(dataset)
        else:
            # No object relations at all: treat as no-gene/no-construct.
            alleles_no_construct_no_gene.append(base_allele_dataset())

        if 'crossReferences' in allele_record:
            for cross_ref in allele_record['crossReferences']:
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page in ('allele', 'allele/references',
                                    'transgene', 'construct',
                                    'transgene/references',
                                    'construct/references'):
                            mod_global_cross_ref_id = \
                                ETLHelper.get_page_complete_url(
                                    local_crossref_id, self.xref_url_map,
                                    prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['dataId'] = global_id
                            cross_reference_list.append(xref)

        if 'synonyms' in allele_record:
            for syn in allele_record.get('synonyms'):
                allele_synonym = {
                    "data_id": allele_record.get('primaryId'),
                    "synonym": syn.strip()
                }
                allele_synonyms.append(allele_synonym)

        if 'secondaryIds' in allele_record:
            for secondary_id in allele_record.get('secondaryIds'):
                allele_secondary_id = {
                    "data_id": allele_record.get('primaryId'),
                    "secondary_id": secondary_id
                }
                allele_secondary_ids.append(allele_secondary_id)

        if counter == batch_size:
            yield [alleles_no_construct, alleles_construct_gene,
                   alleles_no_gene, alleles_no_construct_no_gene,
                   allele_secondary_ids, allele_synonyms,
                   cross_reference_list]
            alleles_no_construct = []
            alleles_construct_gene = []
            alleles_no_gene = []
            alleles_no_construct_no_gene = []
            allele_secondary_ids = []
            allele_synonyms = []
            cross_reference_list = []
            counter = 0

    if counter > 0:
        yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
               alleles_no_construct_no_gene, allele_secondary_ids,
               allele_synonyms, cross_reference_list]
def get_generators(self, variant_data, batch_size):
    """Create generators of variant record batches from a variant JSON payload.

    Walks ``variant_data['data']`` and yields lists of four collections, in
    this order:
    [variants, variant_genomic_locations, variant_so_terms, cross_references]

    A batch is yielded every ``batch_size`` records and once more at the end
    for any remainder.

    :param variant_data: parsed variant submission JSON (``metaData`` + ``data``)
    :param batch_size: number of records per yielded batch
    """
    data_providers = []
    release = ""
    variants = []
    variant_genomic_locations = []
    variant_so_terms = []
    cross_references = []
    counter = 0
    date_produced = variant_data['metaData']['dateProduced']
    data_provider_object = variant_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_VARIATION"

    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                self.xref_url_map,
                data_provider,
                data_provider_page)

            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider,
                    data_provider,
                    data_provider_page,
                    data_provider_page,
                    data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))

            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    if 'release' in variant_data['metaData']:
        release = variant_data['metaData']['release']

    # Cache one AssemblySequenceHelper per assembly so the underlying
    # sequence files are opened only once.
    assemblies = {}
    for allele_record in variant_data['data']:
        chromosome = allele_record["chromosome"]
        if chromosome.startswith("chr"):
            chromosome_str = chromosome[3:]
        else:
            chromosome_str = chromosome

        assembly = allele_record["assembly"]

        if assembly not in assemblies:
            self.logger.info(assembly)
            context_info = ContextInfo()
            data_manager = DataFileManager(
                context_info.config_file_location)
            assemblies[assembly] = AssemblySequenceHelper(
                assembly, data_manager)

        so_term_id = allele_record.get('type')
        genomic_reference_sequence = allele_record.get(
            'genomicReferenceSequence')
        genomic_variant_sequence = allele_record.get(
            'genomicVariantSequence')

        if genomic_reference_sequence == 'N/A':
            genomic_reference_sequence = ""
        if genomic_variant_sequence == 'N/A':
            genomic_variant_sequence = ""

        padding_left = ""
        padding_right = ""
        if allele_record.get('start') != "" and allele_record.get(
                'end') != "":

            # not insertion
            if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                genomic_reference_sequence = assemblies[
                    assembly].get_sequence(chromosome_str,
                                           allele_record.get('start'),
                                           allele_record.get('end'))

            # Normalize so start <= end regardless of submission order.
            if allele_record.get('start') < allele_record.get('end'):
                start = allele_record.get('start')
                end = allele_record.get('end')
            else:
                start = allele_record.get('end')
                end = allele_record.get('start')

            padding_width = 500
            if so_term_id != "SO:0000667":  # not insertion
                start = start - 1
                end = end + 1

            left_padding_start = start - padding_width
            if left_padding_start < 1:
                left_padding_start = 1

            padding_left = assemblies[assembly].get_sequence(
                chromosome_str, left_padding_start, start)
            right_padding_end = end + padding_width
            padding_right = assemblies[assembly].get_sequence(
                chromosome_str, end, right_padding_end)

        counter = counter + 1
        global_id = allele_record.get('alleleId')
        mod_global_cross_ref_id = ""
        # NOTE: cross_references is intentionally NOT reset here; resetting
        # it per record (as the previous version did) discarded all but the
        # last record's cross references from each yielded batch.

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        cross_ref_primary_id = allele_record.get(
            'sequenceOfReferenceAccessionNumber')
        # Guard BEFORE splitting: the previous version split first and only
        # then checked for None, crashing on records without an accession.
        if cross_ref_primary_id is not None:
            local_cross_ref_id = cross_ref_primary_id.split(":")[1]
            prefix = cross_ref_primary_id.split(":")[0]
            cross_ref_complete_url = ETLHelper.get_no_page_complete_url(
                local_cross_ref_id, ETL.xref_url_map, prefix, global_id)
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id,
                prefix,
                "variant_sequence_of_reference",
                "sequence_of_reference_accession_number",
                global_id,
                cross_ref_complete_url,
                cross_ref_primary_id + "variant_sequence_of_reference")
            xref_map['dataId'] = global_id
            cross_references.append(xref_map)

        if genomic_reference_sequence is not None:
            if len(genomic_reference_sequence) > 1000 and (
                    allele_record.get('type') in ('SO:1000002',
                                                  'SO:1000008')):
                self.logger.debug("%s genomicReferenceSequence",
                                  allele_record.get('alleleId'))

        if genomic_variant_sequence is not None:
            if len(genomic_variant_sequence) > 1000 and (
                    allele_record.get('type') in ('SO:1000002',
                                                  'SO:1000008')):
                self.logger.debug("%s genomicVariantSequence",
                                  allele_record.get('alleleId'))

        hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
            allele_record.get('sequenceOfReferenceAccessionNumber'),
            allele_record.get('type'),
            allele_record.get('start'),
            allele_record.get('end'),
            genomic_reference_sequence,
            genomic_variant_sequence,
            allele_record.get('assembly'),
            chromosome_str)

        # Parenthesization fixed: the previous version compared
        # "(x is not None and len(x)) > 30000", applying > to the boolean.
        if (genomic_reference_sequence is not None
                and len(genomic_reference_sequence) > 30000) \
                or (genomic_variant_sequence is not None
                    and len(genomic_variant_sequence) > 30000):
            self.logger.debug(
                "%s has too long of a sequence potentionally",
                allele_record.get('alleleId'))

        # TODO: fix typo in MGI Submission for this variant so
        # that it doesn't list a 40K bp point mutation.
        if allele_record.get('alleleId') != 'MGI:6113870':

            variant_dataset = {
                "hgvs_nomenclature": hgvs_nomenclature,
                "genomicReferenceSequence": genomic_reference_sequence,
                "genomicVariantSequence": genomic_variant_sequence,
                "paddingLeft": padding_left,
                "paddingRight": padding_right,
                "alleleId": allele_record.get('alleleId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "dataProvider": data_provider,
                "variantHGVSSynonym": hgvs_synonym
            }

            variant_genomic_location_dataset = {
                "variantId": hgvs_nomenclature,
                "assembly": allele_record.get('assembly'),
                "chromosome": chromosome_str,
                "start": allele_record.get('start'),
                "end": allele_record.get('end'),
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider
            }

            variant_so_term = {
                "variantId": hgvs_nomenclature,
                "soTermId": allele_record.get('type')
            }

            variant_so_terms.append(variant_so_term)
            variant_genomic_locations.append(
                variant_genomic_location_dataset)
            variants.append(variant_dataset)

        if counter == batch_size:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
            variants = []
            variant_genomic_locations = []
            variant_so_terms = []
            cross_references = []
            # Reset the batch counter (missing in the previous version, so
            # every record after the first batch accumulated until the end).
            counter = 0

    if counter > 0:
        yield [
            variants, variant_genomic_locations, variant_so_terms,
            cross_references
        ]