def get_generators(self, agm_data, data_provider, batch_size):
        """Yield batches of affected-genomic-model (AGM) rows from a submission.

        Args:
            agm_data: Parsed submission dict. ``agm_data['metaData']`` must
                contain ``dateProduced`` and ``dataProvider`` (with a
                ``crossReference`` entry); ``agm_data['data']`` is the
                iterable of AGM records.
            data_provider: Overwritten with the cross-reference ``id`` read
                from the submission metadata; the passed-in value is never
                used. Kept so the call signature stays unchanged.
            batch_size: Number of accepted records after which a batch is
                yielded.

        Yields:
            A six-element list ``[agms, agm_secondary_ids, agm_synonyms,
            components, sqtrs, backgrounds]``; each element is a list of row
            dicts collected since the previous yield. A final, possibly
            shorter, batch is yielded when records remain at the end.

        Raises:
            KeyError: If a required ``metaData`` key, ``data``, or a record's
                ``primaryID`` is missing.
            IndexError: If a ``primaryID`` or cross-reference id has no ``:``.
        """

        data_providers = []
        agms = []
        agm_synonyms = []
        agm_secondary_ids = []
        components = []
        backgrounds = []
        sqtrs = []

        counter = 0
        date_produced = agm_data['metaData']['dateProduced']

        data_provider_object = agm_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        # NOTE(review): collected here but never yielded by this generator.
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_agm"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                    data_provider,
                    data_provider,
                    data_provider_page,
                    data_provider_page,
                    data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))

                # One entry is appended per page, as in the original loader.
                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        for agm_record in agm_data['data']:
            global_id = agm_record['primaryID']
            local_id = global_id.split(":")[1]

            # When running against test data, records the test object does
            # not recognise are skipped and do not count towards the batch.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(
                        global_id) is False:
                    continue
            counter += 1

            secondary_ids = agm_record.get('secondaryIds')
            if secondary_ids is not None:
                agm_secondary_ids.extend(
                    {"primaryId": global_id, "secondaryId": sid}
                    for sid in secondary_ids)

            synonyms = agm_record.get('synonyms')
            if synonyms is not None:
                agm_synonyms.extend(
                    {"primaryId": global_id, "synonym": syn}
                    for syn in synonyms)

            # Reset for every record so a record without a matching
            # cross-reference page does not inherit the previous record's URL.
            mod_global_cross_ref_url = ""

            if 'crossReference' in agm_record:
                cross_ref = agm_record.get('crossReference')
                id_parts = cross_ref.get('id').split(":")
                local_crossref_id = id_parts[1]
                prefix = id_parts[0]

                # Some pages collections are missing or have 0 elements.
                for page in cross_ref.get('pages') or ():
                    if page in ('Fish', 'genotype', 'strain'):
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix,
                            page)

            name = agm_record.get('name')
            taxon_id = agm_record.get('taxonId')
            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
                taxon_id)
            name_text = TextProcessingHelper.cleanhtml(name)

            # TODO: make subtype required in submission file.
            subtype = agm_record.get('subtype')
            if subtype is None and data_provider == 'WB':
                subtype = 'strain'
            if subtype is None:
                subtype = 'affected_genomic_model'

            # TODO: name_text
            agms.append({
                "primaryId": global_id,
                "name": name,
                "globalId": global_id,
                "localId": local_id,
                "taxonId": taxon_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "subtype": subtype,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider,
                "nameText": name_text,
                "nameWithSpecies":
                name + " (" + short_species_abbreviation + ")",
                "nameTextWithSpecies":
                name_text + " (" + short_species_abbreviation + ")",
            })

            model_components = agm_record.get('affectedGenomicModelComponents')
            if model_components is not None:
                components.extend(
                    {
                        "primaryId": global_id,
                        "componentId": component.get('alleleID'),
                        "zygosityId": component.get('zygosity')
                    }
                    for component in model_components)

            sqtr_ids = agm_record.get('sequenceTargetingReagentIDs')
            if sqtr_ids is not None:
                sqtrs.extend(
                    {"primaryId": global_id, "sqtrId": sqtr}
                    for sqtr in sqtr_ids)

            background_ids = agm_record.get('parentalPopulationIDs')
            if background_ids is not None:
                backgrounds.extend(
                    {"primaryId": global_id, "backgroundId": background}
                    for background in background_ids)

            if counter == batch_size:
                yield [
                    agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                    backgrounds
                ]
                # Start fresh lists for the next batch. sqtrs must be reset
                # too, otherwise its rows are re-emitted in every later batch.
                agms = []
                agm_secondary_ids = []
                agm_synonyms = []
                components = []
                sqtrs = []
                backgrounds = []
                counter = 0

        if counter > 0:
            yield [
                agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                backgrounds
            ]
# Example #2
# 0
    def get_generators(self, allele_data, batch_size):
        """Stream allele rows out of a submission in fixed-size batches.

        Reads ``dateProduced``, ``dataProvider`` and the optional ``release``
        from ``allele_data['metaData']``, then walks ``allele_data['data']``.
        Each allele produces one row per entry in ``alleleObjectRelations``
        (or a single row when that key is absent/None), sorted into one of
        four lists depending on whether a gene id and/or construct id has
        been seen for that allele so far.

        Yields a seven-element list every ``batch_size`` accepted records,
        plus a trailing partial batch:
        ``[gene-only rows, gene+construct rows, construct-only rows,
        rows with neither, secondary-id rows, synonym rows, xref rows]``.
        """

        meta = allele_data['metaData']
        date_produced = meta['dateProduced']
        provider_xref = meta['dataProvider'].get('crossReference')
        data_provider = provider_xref.get('id')
        provider_pages = provider_xref.get('pages')

        loadKey = date_produced + data_provider + "_ALLELE"

        data_providers = []
        provider_xref_rows = []

        # TODO: get SGD to fix their files.

        if provider_pages is not None:
            for provider_page in provider_pages:
                provider_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider, provider_page)
                provider_xref_rows.append(
                    ETLHelper.get_xref_dict(data_provider, data_provider, provider_page,
                                            provider_page, data_provider, provider_url,
                                            data_provider + provider_page))
                data_providers.append(data_provider)
                logger.info("data provider: " + data_provider)

        release = ""
        if 'release' in allele_data['metaData']:
            release = allele_data['metaData']['release']

        def make_row(record, global_id, local_id, species, symbol_text,
                     description, xref_id, association_type,
                     gene_id='', construct_id=''):
            # Builds one allele row; geneId / constructId are only present
            # when a non-empty id is supplied, in the same key positions the
            # hand-written literals used.
            row = {"symbol": record.get('symbol')}
            if gene_id != '':
                row["geneId"] = gene_id
            row.update({
                "primaryId": record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "taxonId": record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": loadKey,
                "release": release,
                "modGlobalCrossRefId": xref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "symbolWithSpecies": record.get('symbol') + " (" + species + ")",
                "symbolTextWithSpecies": symbol_text + " (" + species + ")",
                "symbolText": symbol_text,
                "alleleDescription": description,
            })
            if construct_id != '':
                row["constructId"] = construct_id
            row["associationType"] = association_type
            return row

        gene_only_rows = []
        gene_and_construct_rows = []
        construct_only_rows = []
        bare_rows = []
        synonym_rows = []
        secondary_id_rows = []
        xref_rows = []
        accepted = 0

        xref_pages = ('allele', 'allele/references', 'transgene', 'construct',
                      'transgene/references', 'construct/references')

        for allele_record in allele_data['data']:
            global_id = allele_record['primaryId']

            # fixing parsing error on this end while MGI fixes on their end.
            description = allele_record.get('description')
            if global_id == 'MGI:3826848':
                description = description[:-2]

            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            # In test mode, ids unknown to the test object are skipped and
            # do not count towards the batch size.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(global_id) is False:
                    continue
            accepted += 1

            gene_id = ''
            construct_id = ''
            association_type = ''

            species = ETLHelper.get_short_species_abbreviation(allele_record.get('taxonId'))
            symbol_text = TextProcessingHelper.cleanhtml(allele_record.get('symbol'))

            relations = allele_record.get('alleleObjectRelations')
            if relations is None:
                bare_rows.append(
                    make_row(allele_record, global_id, local_id, species, symbol_text,
                             description, mod_global_cross_ref_id, association_type))
            else:
                for relation in relations:
                    association_type = relation.get('objectRelation').get('associationType')
                    # gene_id / construct_id persist across the relations of
                    # one allele; each relation only overwrites what it has.
                    if relation.get('objectRelation').get('gene') is not None:
                        gene_id = relation.get('objectRelation').get('gene')
                    if relation.get('objectRelation').get('construct') is not None:
                        construct_id = relation.get('objectRelation').get('construct')

                    has_gene = gene_id != ''
                    has_construct = construct_id != ''
                    if has_gene and has_construct:
                        bucket = gene_and_construct_rows
                    elif has_construct:
                        bucket = construct_only_rows
                    elif has_gene:
                        bucket = gene_only_rows
                    else:
                        bucket = bare_rows

                    bucket.append(
                        make_row(allele_record, global_id, local_id, species, symbol_text,
                                 description, mod_global_cross_ref_id, association_type,
                                 gene_id=gene_id, construct_id=construct_id))

            if 'crossReferences' in allele_record:
                for cross_ref in allele_record['crossReferences']:
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages is None or len(pages) == 0:
                        continue

                    for page in pages:
                        if page not in xref_pages:
                            continue
                        mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix, page)
                        xref = ETLHelper.get_xref_dict(local_crossref_id, prefix, page, page,
                                                       cross_ref_id, mod_global_cross_ref_id,
                                                       cross_ref_id + page)
                        xref['dataId'] = global_id
                        xref_rows.append(xref)

            if 'synonyms' in allele_record:
                for synonym in allele_record.get('synonyms'):
                    synonym_rows.append({
                        "data_id": allele_record.get('primaryId'),
                        "synonym": synonym.strip()
                    })

            if 'secondaryIds' in allele_record:
                for secondary_id in allele_record.get('secondaryIds'):
                    secondary_id_rows.append({
                        "data_id": allele_record.get('primaryId'),
                        "secondary_id": secondary_id
                    })

            if accepted == batch_size:
                yield [gene_only_rows, gene_and_construct_rows, construct_only_rows, bare_rows,
                       secondary_id_rows, synonym_rows, xref_rows]
                gene_only_rows = []
                gene_and_construct_rows = []
                construct_only_rows = []
                bare_rows = []
                secondary_id_rows = []
                synonym_rows = []
                xref_rows = []
                accepted = 0

        if accepted > 0:
            yield [gene_only_rows, gene_and_construct_rows, construct_only_rows, bare_rows,
                   secondary_id_rows, synonym_rows, xref_rows]