def get_generators(self, agm_data, data_provider, batch_size):
    """Yield batches of affected-genomic-model (AGM) rows from a submission.

    Iterates ``agm_data['data']`` and, for each record that is kept, appends
    rows to six parallel lists.  After ``batch_size`` kept records the lists
    are yielded together and replaced with fresh empty lists; any remaining
    partial batch is yielded once the input is exhausted.

    Args:
        agm_data: Mapping with a ``metaData`` entry (``dateProduced`` and
            ``dataProvider.crossReference`` are read) and a ``data`` list of
            AGM records.
        data_provider: Overwritten immediately with the ``id`` of the
            metadata cross reference; the passed-in value is never read.
            Kept so the signature stays unchanged.
        batch_size: Number of kept records per yielded batch.

    Yields:
        list: ``[agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
        backgrounds]`` where every element is a list of dicts.

    Raises:
        KeyError: If ``metaData``, ``dateProduced``, ``dataProvider``,
            ``data`` or a record's ``primaryID`` is missing.
        IndexError: If a ``primaryID`` or cross-reference id has no ``:``.
    """
    # Cross-reference pages whose URL is stored as the AGM's MOD link.
    mod_link_pages = ('Fish', 'genotype', 'strain')

    data_providers = []
    agms = []
    agm_synonyms = []
    agm_secondary_ids = []
    components = []
    backgrounds = []
    sqtrs = []
    counter = 0

    meta_data = agm_data['metaData']
    date_produced = meta_data['dateProduced']
    data_provider_cross_ref = meta_data['dataProvider'].get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    # Filled below but never read again inside this function.
    data_provider_cross_ref_set = []

    load_key = date_produced + data_provider + "_agm"

    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                self.xref_url_map,
                data_provider,
                data_provider_page)

            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider,
                data_provider,
                data_provider_page,
                data_provider_page,
                data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))

            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    for agm_record in agm_data['data']:
        counter = counter + 1
        global_id = agm_record['primaryID']
        local_id = global_id.split(":")[1]

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
            if is_it_test_entry is False:
                # Skipped records must not count towards the batch size.
                counter = counter - 1
                continue

        secondary_ids = agm_record.get('secondaryIds')
        if secondary_ids is not None:
            for sid in secondary_ids:
                agm_secondary_ids.append({
                    "primaryId": global_id,
                    "secondaryId": sid,
                })

        synonyms = agm_record.get('synonyms')
        if synonyms is not None:
            for syn in synonyms:
                agm_synonyms.append({
                    "primaryId": global_id,
                    "synonym": syn,
                })

        # Reset for every record so an AGM without a matching page does not
        # inherit the URL computed for an earlier record.
        mod_global_cross_ref_url = ""
        if 'crossReference' in agm_record:
            cross_ref = agm_record.get('crossReference')
            id_parts = cross_ref.get('id').split(":")
            prefix = id_parts[0]
            local_crossref_id = id_parts[1]
            pages = cross_ref.get('pages')

            # some pages collection have 0 elements
            if pages:
                for page in pages:
                    if page in mod_link_pages:
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix, page)

        taxon_id = agm_record.get('taxonId')
        name = agm_record.get('name')
        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(taxon_id)
        name_text = TextProcessingHelper.cleanhtml(name)

        # TODO: make subtype required in submission file.
        subtype = agm_record.get('subtype')
        if subtype is None and data_provider == 'WB':
            subtype = 'strain'
        if subtype is None:
            subtype = 'affected_genomic_model'

        species_suffix = " (" + short_species_abbreviation + ")"

        # TODO: name_text
        agms.append({
            "primaryId": global_id,
            "name": name,
            "globalId": global_id,
            "localId": local_id,
            "taxonId": taxon_id,
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "subtype": subtype,
            "modGlobalCrossRefUrl": mod_global_cross_ref_url,
            "dataProvider": data_provider,
            "nameText": name_text,
            "nameWithSpecies": name + species_suffix,
            "nameTextWithSpecies": name_text + species_suffix,
        })

        agm_components = agm_record.get('affectedGenomicModelComponents')
        if agm_components is not None:
            for component in agm_components:
                components.append({
                    "primaryId": global_id,
                    "componentId": component.get('alleleID'),
                    "zygosityId": component.get('zygosity'),
                })

        sqtr_ids = agm_record.get('sequenceTargetingReagentIDs')
        if sqtr_ids is not None:
            for sqtr in sqtr_ids:
                sqtrs.append({
                    "primaryId": global_id,
                    "sqtrId": sqtr,
                })

        parental_population_ids = agm_record.get('parentalPopulationIDs')
        if parental_population_ids is not None:
            for background in parental_population_ids:
                backgrounds.append({
                    "primaryId": global_id,
                    "backgroundId": background,
                })

        if counter == batch_size:
            yield [agms, agm_secondary_ids, agm_synonyms, components, sqtrs, backgrounds]
            # Rebind (do not clear) so the lists just handed to the consumer
            # stay intact.  sqtrs is reset with the others; otherwise earlier
            # rows would be emitted again in every following batch.
            agms = []
            agm_secondary_ids = []
            agm_synonyms = []
            components = []
            sqtrs = []
            backgrounds = []
            counter = 0

    if counter > 0:
        yield [agms, agm_secondary_ids, agm_synonyms, components, sqtrs, backgrounds]
def get_generators(self, allele_data, batch_size):
    """Yield batches of allele rows built from a parsed allele submission.

    Iterates ``allele_data['data']`` and sorts every kept allele into one of
    four lists depending on whether a gene and/or a construct id has been
    seen in its ``alleleObjectRelations``.  Synonym, secondary-id and
    cross-reference rows are collected alongside.  After ``batch_size`` kept
    records all seven lists are yielded together and replaced with fresh
    empty lists; a remaining partial batch is yielded at the end.

    Args:
        allele_data: Mapping with a ``metaData`` entry (``dateProduced``,
            ``dataProvider.crossReference`` and optionally ``release`` are
            read) and a ``data`` list of allele records.
        batch_size: Number of kept records per yielded batch.

    Yields:
        list: ``[alleles_no_construct, alleles_construct_gene,
        alleles_no_gene, alleles_no_construct_no_gene, allele_secondary_ids,
        allele_synonyms, cross_reference_list]``; every element is a list of
        dicts.

    Raises:
        KeyError: If ``metaData``, ``dateProduced``, ``dataProvider``,
            ``data`` or a record's ``primaryId`` is missing.
        IndexError: If a ``primaryId`` or cross-reference id has no ``:``.
    """
    # Cross-reference pages for which an xref row is emitted.
    allele_xref_pages = (
        'allele',
        'allele/references',
        'transgene',
        'construct',
        'transgene/references',
        'construct/references',
    )

    data_providers = []
    release = ""
    alleles_no_construct_no_gene = []
    alleles_construct_gene = []
    alleles_no_construct = []
    alleles_no_gene = []
    allele_synonyms = []
    allele_secondary_ids = []
    cross_reference_list = []
    counter = 0

    meta_data = allele_data['metaData']
    date_produced = meta_data['dateProduced']
    data_provider_cross_ref = meta_data['dataProvider'].get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    # Filled below but never read again inside this function.
    data_provider_cross_ref_set = []

    load_key = date_produced + data_provider + "_ALLELE"

    # TODO: get SGD to fix their files.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                self.xref_url_map,
                data_provider,
                data_provider_page)

            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider,
                data_provider,
                data_provider_page,
                data_provider_page,
                data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))

            data_providers.append(data_provider)
            logger.info("data provider: %s", data_provider)

    if 'release' in meta_data:
        release = meta_data['release']

    for allele_record in allele_data['data']:
        counter = counter + 1
        global_id = allele_record['primaryId']

        description = allele_record.get('description')
        # fixing parsing error on this end while MGI fixes on their end.
        # Guarded so a missing description does not raise on the slice.
        if global_id == 'MGI:3826848' and description is not None:
            description = description[:-2]

        local_id = global_id.split(":")[1]
        # NOTE(review): the rows below are built before the cross-reference
        # loop assigns this variable, so "modGlobalCrossRefId" is always the
        # empty string in the emitted allele rows - confirm that is intended.
        mod_global_cross_ref_id = ""

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
            if is_it_test_entry is False:
                # Skipped records must not count towards the batch size.
                counter = counter - 1
                continue

        gene_id = ''
        construct_id = ''
        association_type = ''

        symbol = allele_record.get('symbol')
        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
            allele_record.get('taxonId'))
        symbol_text = TextProcessingHelper.cleanhtml(symbol)
        species_suffix = " (" + short_species_abbreviation + ")"

        # Fields shared by every allele row, whichever list it ends up in.
        base_dataset = {
            "symbol": symbol,
            "primaryId": global_id,
            "globalId": global_id,
            "localId": local_id,
            "taxonId": allele_record.get('taxonId'),
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "release": release,
            "modGlobalCrossRefId": mod_global_cross_ref_id,
            "dataProvider": data_provider,
            "symbolWithSpecies": symbol + species_suffix,
            "symbolTextWithSpecies": symbol_text + species_suffix,
            "symbolText": symbol_text,
            "alleleDescription": description,
        }

        relations = allele_record.get('alleleObjectRelations')
        if relations:
            # NOTE(review): one row is emitted per relation, and gene_id /
            # construct_id are not cleared between relations of the same
            # allele - confirm against the original indentation.
            for relation in relations:
                object_relation = relation.get('objectRelation')
                association_type = object_relation.get('associationType')
                if object_relation.get('gene') is not None:
                    gene_id = object_relation.get('gene')
                if object_relation.get('construct') is not None:
                    construct_id = object_relation.get('construct')

                has_gene = gene_id != ''
                has_construct = construct_id != ''

                allele_dataset = dict(base_dataset)
                allele_dataset["uuid"] = str(uuid.uuid4())
                allele_dataset["associationType"] = association_type
                if has_gene:
                    allele_dataset["geneId"] = gene_id
                if has_construct:
                    allele_dataset["constructId"] = construct_id

                if has_gene and has_construct:
                    alleles_construct_gene.append(allele_dataset)
                elif has_construct:
                    alleles_no_gene.append(allele_dataset)
                elif has_gene:
                    alleles_no_construct.append(allele_dataset)
                else:
                    alleles_no_construct_no_gene.append(allele_dataset)
        else:
            # No relations (key missing, None, or an empty list): the allele
            # is still emitted, without gene or construct.
            allele_dataset = dict(base_dataset)
            allele_dataset["uuid"] = str(uuid.uuid4())
            allele_dataset["associationType"] = association_type
            alleles_no_construct_no_gene.append(allele_dataset)

        for cross_ref in allele_record.get('crossReferences') or []:
            cross_ref_id = cross_ref.get('id')
            id_parts = cross_ref_id.split(":")
            prefix = id_parts[0]
            local_crossref_id = id_parts[1]
            pages = cross_ref.get('pages')

            # some pages collection have 0 elements
            if pages:
                for page in pages:
                    if page in allele_xref_pages:
                        mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix, page)
                        xref = ETLHelper.get_xref_dict(
                            local_crossref_id,
                            prefix,
                            page,
                            page,
                            cross_ref_id,
                            mod_global_cross_ref_id,
                            cross_ref_id + page)
                        xref['dataId'] = global_id
                        cross_reference_list.append(xref)

        for syn in allele_record.get('synonyms') or []:
            allele_synonyms.append({
                "data_id": global_id,
                "synonym": syn.strip(),
            })

        for secondary_id in allele_record.get('secondaryIds') or []:
            allele_secondary_ids.append({
                "data_id": global_id,
                "secondary_id": secondary_id,
            })

        if counter == batch_size:
            yield [alleles_no_construct,
                   alleles_construct_gene,
                   alleles_no_gene,
                   alleles_no_construct_no_gene,
                   allele_secondary_ids,
                   allele_synonyms,
                   cross_reference_list]
            # Rebind (do not clear) so the lists just handed to the consumer
            # stay intact.
            alleles_no_construct = []
            alleles_construct_gene = []
            alleles_no_gene = []
            alleles_no_construct_no_gene = []
            allele_secondary_ids = []
            allele_synonyms = []
            cross_reference_list = []
            counter = 0

    if counter > 0:
        yield [alleles_no_construct,
               alleles_construct_gene,
               alleles_no_gene,
               alleles_no_construct_no_gene,
               allele_secondary_ids,
               allele_synonyms,
               cross_reference_list]