Пример #1
0
    def _process_sub_type(self, sub_type):
        """Load phenotype data for one sub type and transact it into Neo4j.

        Reads the sub type's JSON file, builds query/CSV specifications for
        gene, allele, AGM and PGE phenotype annotations, writes the CSVs and
        queues the Neo4j query batch. Returns early when the file yields no
        data.
        """
        data_provider = sub_type.get_data_provider()

        self.logger.info("Loading Phenotype Data: %s", data_provider)
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Phenotype Data: %s", data_provider)
        if data is None:
            self.logger.warning("No Data found for %s skipping", data_provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # Bug fix: the batch size was previously read from the commit-size
        # accessor; use the dedicated generator batch-size setting, as the
        # other loaders in this file do.
        batch_size = self.data_type_config.get_generator_batch_size()
        self.logger.info("subtype: " + data_provider)

        # (template, commit size, csv filename) -- extra entries are ignored
        # by process_query_params.
        query_template_list = [
                [self.execute_gene_query_template, commit_size,
                 "phenotype_gene_data_" + data_provider + ".csv"],
                [self.execute_allele_query_template, commit_size,
                 "phenotype_allele_data_" + data_provider + ".csv"],
                [self.execute_agm_query_template, commit_size,
                 "phenotype_agm_data_" + data_provider + ".csv"],
                [self.execute_pges_allele_query_template, commit_size,
                 "phenotype_pges_allele_data_" + data_provider + ".csv"],
                [self.execute_pges_agm_query_template, commit_size,
                 "phenotype_pges_agm_data_" + data_provider + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Phenotype-{}: ".format(data_provider))
    def _process_sub_type(self, sub_type):
        """Load transcript, exon and genomic-location data for one sub type."""
        provider = sub_type.get_data_provider()
        self.logger.info("Loading Transcript Data: %s", provider)
        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()
        file_path = sub_type.get_filepath()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_specs = [
            [self.transcript_alternate_id_query_template, neo_commit_size,
             "transcript_gff3ID_data_" + provider + ".csv"],
            [self.transcript_query_template, neo_commit_size,
             "transcript_data_" + provider + ".csv"],
            [self.chromosomes_query_template, neo_commit_size,
             "transcript_data_chromosome_" + provider + ".csv"],
            [self.genomic_locations_query_template, neo_commit_size,
             "transcript_genomic_locations_" + provider + ".csv"],
            [self.exon_query_template, neo_commit_size,
             "exon_data_" + provider + ".csv"],
            [self.exon_genomic_locations_template, neo_commit_size,
             "exon_genomic_location_data_" + provider + ".csv"],
        ]

        # Obtain the generator
        source_generators = self.get_generators(file_path, gen_batch_size)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
Пример #3
0
    def _load_and_process_data(self):
        """Load DO ontology terms and their relationships into Neo4j."""
        file_path = self.data_type_config.get_single_filepath()

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        source_generators = self.get_generators(file_path, gen_batch_size)

        template_specs = [
            [self.do_query_template, neo_commit_size, "do_term_data.csv"],
            [self.doterm_isas_query_template, neo_commit_size,
             "do_isas_data.csv"],
            [self.doterm_synonyms_query_template, neo_commit_size,
             "do_synonyms_data.csv"],
            [self.xrefs_query_template, neo_commit_size, "do_xrefs_data.csv"],
            [self.doterm_alt_ids_query_template, neo_commit_size,
             "do_alt_ids_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages("DO-?: ")
Пример #4
0
    def _process_sub_type(self, sub_type):
        """Build the is_a/part_of closure for one ontology data provider."""
        provider = sub_type.get_data_provider()
        self.logger.info(provider)
        # DOID sub types are stored under the DO namespace.
        if provider == 'DOID':
            provider = 'DO'

        self.logger.debug("Starting isa_partof_ Closure for: %s",
                          provider)

        closure_specs = [
            [self.insert_isa_partof_closure_query_template, "100000",
             "isa_partof_closure_" + provider + ".csv", provider,
             provider],
        ]

        source_generators = self.get_closure_terms(provider)

        queries_with_files = self.process_query_params(closure_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)

        self.error_messages("Closure-{}: ".format(provider))
        self.logger.debug("Finished isa_partof Closure for: %s", provider)
Пример #5
0
    def _process_sub_type(self, sub_type):
        """Load ECO mapping ontology data for one sub type and transact it."""
        provider = sub_type.get_data_provider()
        self.logger.info("Loading ECOMAP Ontology Data: %s",
                         provider)

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        file_path = sub_type.get_filepath()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_specs = [
            [self.eco_query_template, neo_commit_size,
             "ecomap_data_" + provider + ".csv"],
        ]

        # Obtain the generator
        source_generators = self.get_generators(file_path, gen_batch_size)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)

        self.error_messages("Ecomap-{}: ".format(provider))
        self.logger.info("Finished Loading ECOMAP Data: %s",
                         provider)
Пример #6
0
    def _load_and_process_data(self):
        """Load GEO cross-reference data for every configured sub type.

        For each sub type, URL-encodes the species name for its data
        provider, generates the GEO xref rows, writes them to CSV and queues
        the Neo4j query batch.
        """
        # Loop-invariant configuration is read once, not once per sub type.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        # NOTE(review): batch size deliberately overrides the configured
        # generator batch size (was a commented-out accessor call) --
        # confirm the fixed value is still required.
        batch_size = 100000

        for sub_type in self.data_type_config.get_sub_type_objects():
            data_provider = sub_type.get_data_provider()
            species_encoded = urllib.parse.quote_plus(
                ETLHelper.species_lookup_by_data_provider(data_provider))

            generators = self.get_generators(sub_type, batch_size,
                                             species_encoded)

            query_template_list = [
                [self.geo_xref_query_template, commit_size,
                 "geo_xref_data_" + data_provider + ".csv"],
            ]

            query_and_file_list = self.process_query_params(
                query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _process_sub_type(self, sub_type):
        """Load sequence targeting reagent (AGM) data for one sub type."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Sequence Targeting Reagent Data: %s",
                         provider)
        file_path = sub_type.get_filepath()
        self.logger.info(file_path)
        data = JSONFile().get_data(file_path)
        self.logger.info(
            "Finished Loading Sequence Targeting Reagent Data: %s",
            provider)

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                provider)
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_specs = [
            [self.agm_query_template, neo_commit_size,
             "agm_data_" + provider + ".csv"],
            [self.agm_secondary_ids_query_template, neo_commit_size,
             "agm_secondary_ids_" + provider + ".csv"],
            [self.agm_synonyms_query_template, neo_commit_size,
             "agm_synonyms_" + provider + ".csv"],
            [self.agm_components_query_template, neo_commit_size,
             "agm_components_" + provider + ".csv"],
            [self.agm_sqtrs_query_template, neo_commit_size,
             "agm_sqtrs_" + provider + ".csv"],
            [self.agm_backgrounds_query_template, neo_commit_size,
             "agm_backgrounds_" + provider + ".csv"],
        ]

        # Obtain the generator
        source_generators = self.get_generators(data, provider,
                                                gen_batch_size)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages("AGM-{}: ".format(provider))
Пример #8
0
    def _load_and_process_data(self):
        """Load molecular-interaction (MI) ontology terms into Neo4j."""
        file_path = self.data_type_config.get_single_filepath()
        source_generators = self.get_generators(file_path)

        template_specs = [
            [self.main_query_template, 10000, "mi_term_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
Пример #9
0
    def _load_and_process_data(self):
        """Load the Alliance species list from the schemas repository."""
        # Species definitions are fetched straight from the agr_schemas repo.
        yaml_url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
        source_generators = self.get_generators(yaml_url)

        template_specs = [
            [self.main_query_template, 10000, "species_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
Пример #10
0
    def _process_sub_type(self, sub_type):
        """Load HTP metadata dataset data for one sub type and transact it.

        Reads the sub type's JSON file, builds dataset/tag/publication/xref/
        secondary-id query specifications, writes the CSVs and queues the
        Neo4j query batch. Skips the sub type when the file yields no data.
        """
        data_provider = sub_type.get_data_provider()

        # Lazy %-style logging args replace eager "%" string formatting.
        logger.info("Loading HTP metadata Data: %s", data_provider)
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata Data: %s", data_provider)

        if data is None:
            # logger.warn is deprecated; warning() is the supported spelling.
            logger.warning("No Data found for %s skipping", data_provider)
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
             "htp_metadataset_" + data_provider + ".csv"],
            [HTPMetaDatasetETL.htp_category_tags_query_template, commit_size,
             "htp_metadataset_tags_" + data_provider + ".csv"],
            [HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
             "htp_metadataset_publications_" + data_provider + ".csv"],
            [HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
             "htp_metadataset_xrefs_" + data_provider + ".csv"],
            [HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
             "htp_metadataset_secondaryIds_" + data_provider + ".csv"],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _load_and_process_data(self):
        """Queue the molecular-interaction xref queries for execution.

        NOTE(review): no CSV is written here -- "mol_int_xref.csv" is
        presumably produced by an earlier load step; confirm against the
        interaction loader.
        """
        neo_commit_size = self.data_type_config.get_neo4j_commit_size()

        template_specs = [
            [self.query_xref_query_template, neo_commit_size,
             "mol_int_xref.csv"],
            [self.xrefs_relationships_query_template, neo_commit_size,
             "mol_int_xref.csv"],
        ]

        queries = self.process_query_params(template_specs)
        Neo4jTransactor.execute_query_batch(queries)
    def _load_and_process_data(self):
        """Attach ribbon terms to expression entities that lack them."""
        self.logger.info("Starting Expression Ribbon Data")

        template_specs = [
            [self.insert_ribonless_ebes_query_template, "30000",
             "expression_ribbonless_ebes" + ".csv"],
        ]

        source_generators = self.get_ribbon_terms()

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)

        self.logger.info("Finished Expression Ribbon Data")
Пример #13
0
    def _process_sub_type(self, sub_type):
        """Load a generic ontology: terms, relations, synonyms and alt ids."""
        ont_type = sub_type.get_data_provider()
        self.logger.info("Loading Generic Ontology Data: %s",
                         ont_type)
        file_path = sub_type.get_filepath()

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        # Terms and synonyms use larger fixed commit sizes than the
        # configured default.
        template_specs = [
            [self.generic_ontology_term_query_template, 600000,
             "generic_ontology_term_" + ont_type + ".csv", ont_type],
            [self.generic_ontology_isas_query_template, neo_commit_size,
             "generic_ontology_isas_" + ont_type + ".csv", ont_type,
             ont_type],
            [self.generic_ontology_partofs_query_template, neo_commit_size,
             "generic_ontology_partofs_" + ont_type + ".csv", ont_type,
             ont_type],
            [self.generic_ontology_synonyms_query_template, 400000,
             "generic_ontology_synonyms_" + ont_type + ".csv", ont_type],
            [self.generic_ontology_altids_query_template, neo_commit_size,
             "generic_ontology_altids_" + ont_type + ".csv", ont_type],
        ]

        # Obtain the generator
        source_generators = self.get_generators(file_path, gen_batch_size)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)

        self.logger.info("Finished Loading Generic Ontology Data: %s",
                         ont_type)
Пример #14
0
    def _process_sub_type(self, sub_type):
        """Load stub data for one sub type and transact it into Neo4j."""
        # NOTE(review): other loaders call sub_type.get_filepath(); confirm
        # get_single_filepath is intended on the sub type here.
        file_path = sub_type.get_single_filepath()

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        source_generators = self.get_generators(file_path, gen_batch_size)

        template_specs = [
            [self.query_template, neo_commit_size, "stub_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
Пример #15
0
    def _process_sub_type(self, sub_type):
        """Load variation data for one sub type and transact it into Neo4j."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Variation Data: %s",
                         provider)
        file_path = sub_type.get_filepath()
        data = JSONFile().get_data(file_path)
        self.logger.info("Finished Loading Variation Data: %s",
                         provider)

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                provider)
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored

        template_specs = [
            [self.variation_query_template, neo_commit_size,
             "variation_data_" + provider + ".csv"],
            [self.genomic_locations_query_template, neo_commit_size,
             "variant_genomiclocations_" + provider + ".csv"],
            [self.so_terms_query_template, neo_commit_size,
             "variant_so_terms_" + provider + ".csv"],
            [self.xrefs_query_template, neo_commit_size,
             "variant_xrefs_" + provider + ".csv"],
        ]

        source_generators = self.get_generators(data, gen_batch_size)
        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages("Var-{}: ".format(provider))
Пример #16
0
    def _process_sub_type(self, sub_type):
        """Load VEP transcript-consequence data for one sub type."""
        provider = sub_type.get_data_provider()
        self.logger.info("Loading VEP Data: %s", provider)
        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        file_path = sub_type.get_filepath()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_specs = [
            [self.vep_transcript_query_template, neo_commit_size,
             "vep_transcript_data_" + provider + ".csv"],
        ]

        # Obtain the generator
        source_generators = self.get_generators(file_path)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
Пример #17
0
    def _load_and_process_data(self):
        """Run _process_sub_type for each sub type in its own process.

        Worker processes append query items to a shared Manager list; the
        collected queries are executed as one batch after every worker has
        finished.
        """
        thread_pool = []

        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        # Materialize the Manager proxy list locally (replaces the manual
        # append loop).
        queries = list(query_tracking_list)

        Neo4jTransactor.execute_query_batch(queries)
    def _process_sub_type(self, subtype):
        """Create gene-disease annotations inferred through orthology."""
        self.logger.info("Starting Gene Disease Ortho Data: %s", subtype)

        template_specs = [
            [self.insert_gene_disease_ortho_query_template, "10000",
             "gene_disease_by_orthology.csv"],
        ]

        self.logger.info("gene disease ortho pub created")

        source_generators = self.retrieve_gene_disease_ortho()

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)

        self.logger.info("Finished Gene Disease Ortho Data")
Пример #19
0
    def _process_sub_type(self, sub_type):
        """Load disease annotation data for one sub type and transact it."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Disease Data: %s", provider)
        file_path = sub_type.get_filepath()
        data = JSONFile().get_data(file_path)
        self.logger.info("Finished Loading Disease Data: %s", provider)

        if data is None:
            self.logger.warning("No Data found for %s skipping", provider)
            return

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_specs = [
            [self.execute_allele_query_template, neo_commit_size,
             "disease_allele_data_" + provider + ".csv"],
            [self.execute_gene_query_template, neo_commit_size,
             "disease_gene_data_" + provider + ".csv"],
            [self.execute_agms_query_template, neo_commit_size,
             "disease_agms_data_" + provider + ".csv"],
            [self.execute_pges_gene_query_template, neo_commit_size,
             "disease_pges_gene_data_" + provider + ".csv"],
            [self.execute_pges_allele_query_template, neo_commit_size,
             "disease_pges_allele_data_" + provider + ".csv"],
            [self.execute_pges_agm_query_template, neo_commit_size,
             "disease_pges_agms_data_" + provider + ".csv"],
            [self.execute_withs_query_template, neo_commit_size,
             "disease_withs_data_" + provider + ".csv"],
            [self.execute_ecode_query_template, neo_commit_size,
             "disease_evidence_code_data_" + provider + ".csv"],
            [self.execute_annotation_xrefs_query_template, neo_commit_size,
             "disease_annotation_xrefs_data_" + provider + ".csv"],
        ]

        # Obtain the generator
        source_generators = self.get_generators(data, gen_batch_size, provider)

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages("Disease-{}: ".format(provider))
        self.logger.info("Finished Loading Disease Data: %s", provider)
Пример #20
0
    def _load_and_process_data(self):
        """Load GO terms and their inter-term relationships into Neo4j."""
        file_path = self.data_type_config.get_single_filepath()

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        source_generators = self.get_generators(file_path, gen_batch_size)

        template_specs = [
            [self.main_query_template, neo_commit_size, "go_term_data.csv"],
            [self.goterm_isas_query_template, neo_commit_size,
             "go_isas_data.csv"],
            [self.goterm_partofs_query_template, neo_commit_size,
             "go_partofs_data.csv"],
            [self.goterm_synonyms_query_template, neo_commit_size,
             "go_synonym_data.csv"],
            [self.goterm_regulates_query_template, neo_commit_size,
             "go_regulates_data.csv"],
            [self.goterm_negatively_regulates_query_template, neo_commit_size,
             "go_negatively_regulates_data.csv"],
            [self.goterm_positively_regulates_query_template, neo_commit_size,
             "go_positively_regulates_data.csv"],
            [self.goterm_secondary_query_template, neo_commit_size,
             "goterm_secondary_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages()
Пример #21
0
    def _load_and_process_data(self):
        """Load orthology data with one worker process per sub type.

        Workers append (query, filename) items to a shared tracking list.
        File-specific queries run in randomized provider-pair batches;
        algorithm queries run once at the end.
        """
        # Replaces a manual append loop with a comprehension.
        sub_types = [st.get_data_provider()
                     for st in self.data_type_config.get_sub_type_objects()]

        thread_pool = []

        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, sub_types,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        # Materialize the Manager proxy list locally.
        queries = list(query_tracking_list)

        # Algorithm queries are independent of provider pairs and run last.
        algo_queries = [item for item in queries if "algorithm" in item[1]]

        main_list = self.get_randomized_list(sub_types)

        for file_set in main_list:
            for pair in file_set:
                for item in queries:
                    if pair[0] + "_" + pair[1] in item[1]:
                        self.logger.debug("Pair: %s Item: %s", pair, item[1])
                        Neo4jTransactor.execute_query_batch([item])

            Neo4jTransactor().wait_for_queues()

        Neo4jTransactor.execute_query_batch(algo_queries)
        self.error_messages()
    def _load_and_process_data(self):
        """Load molecular interaction data and its cross-references."""
        # filepath = self.data_type_config.get_single_filepath()
        # Temporary fix for 3.0 release.
        file_path = 'tmp/alliance_molecular_interactions.tsv'

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        source_generators = self.get_generators(file_path, gen_batch_size)

        template_specs = [
            [self.main_query_template, neo_commit_size, "mol_int_data.csv"],
            [self.xref_query_template, neo_commit_size, "mol_int_xref.csv"],
            [self.mod_xref_query_template, neo_commit_size,
             "mol_int_mod_xref.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
        """Add Expression Atlas cross-references for one data provider."""
        data_provider = sub_type.get_data_provider()
        atlas_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        neo_commit_size = self.data_type_config.get_neo4j_commit_size()
        gen_batch_size = self.data_type_config.get_generator_batch_size()

        source_generators = self.get_generators(atlas_pages,
                                                data_provider,
                                                gen_batch_size)

        template_specs = [
            [self.add_expression_atlas_crossreferences_query_template,
             neo_commit_size,
             "expression_atlas_" + data_provider + "_data.csv"],
        ]

        queries_with_files = self.process_query_params(template_specs)
        CSVTransactor.save_file_static(source_generators, queries_with_files)
        Neo4jTransactor.execute_query_batch(queries_with_files)
        self.error_messages("ExpAtlas-{}: ".format(data_provider))
Пример #24
0
    def run_loader(self):
        """Main function for running loader.

        Downloads and validates all configured data files, starts the file
        and Neo4j transactor threads, creates indices (unless loading from
        pickle), runs the ETL groups, then reports per-ETL timing and total
        elapsed time.
        """
        if self.args.verbose:
            # logger.warn is deprecated; warning() is the supported spelling.
            self.logger.warning('DEBUG mode enabled!')
            time.sleep(3)

        data_manager = DataFileManager(self.context_info.config_file_location)
        file_transactor = FileTransactor()

        file_transactor.start_threads(
            data_manager.get_file_transactor_thread_settings())

        data_manager.download_and_validate()
        self.logger.debug("finished downloading now doing thread")

        file_transactor.check_for_thread_errors()
        self.logger.debug("finished threads waiting for queues")

        file_transactor.wait_for_queues()
        self.logger.debug("finished queues waiting for shutdown")
        file_transactor.shutdown()

        neo_transactor = Neo4jTransactor()
        neo_transactor.start_threads(
            data_manager.get_neo_transactor_thread_settings())

        self.logger.debug("finished starting neo threads ")

        if not self.context_info.env["USING_PICKLE"]:
            self.logger.info("Creating indices.")
            Neo4jHelper.create_indices()

        etl_time_tracker_list = self.run_etl_groups(self.logger, data_manager,
                                                    neo_transactor)

        neo_transactor.shutdown()

        elapsed_time = time.time() - self.start_time

        for time_item in etl_time_tracker_list:
            self.logger.info(time_item)

        # Lazy logging args replace the eager "%" formatting.
        self.logger.info('Loader finished. Elapsed time: %s',
                         time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    def _process_sub_type(self, sub_type):
        """Load HTP metadata dataset-sample data for one sub type into Neo4j.

        Reads the sub type's JSON payload, builds one (query template,
        commit size, CSV filename) triple per relationship kind, streams
        the generated rows to CSV files, and executes the batched queries.
        """
        data_provider = sub_type.get_data_provider()

        logger.info("Loading HTP metadata sample data: %s", data_provider)
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s",
                    data_provider)

        if data is None:
            # Nothing to load for this provider; skip instead of failing.
            logger.warning("No Data found for %s skipping", data_provider)
            return

        # This order is the same as the lists yielded from the
        # get_generators function.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # Each (template, suffix) pair expands to
        # [template, commit_size, "htp_metadataset_sample_<suffix>_<provider>.csv"];
        # items beyond the first three would be ignored by process_query_params.
        template_specs = [
            (HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
             "samples"),
            (HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template,
             "bioentities"),
            (HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
             "secondaryIds"),
            (HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
             "datasets"),
            (HTPMetaDatasetSampleETL.htp_stages_query_template,
             "stages"),
            (HTPMetaDatasetSampleETL.ao_terms_query_template,
             "aoterms"),
            (HTPMetaDatasetSampleETL.ao_substructures_query_template,
             "ao_substructures"),
            (HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
             "ao_qualifiers"),
            (HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
             "ao_ss_qualifiers"),
            # NOTE: the original filename here lacked the "_" separator
            # before the provider ("...ccterms<PROVIDER>.csv"); normalized
            # to match every other entry.
            (HTPMetaDatasetSampleETL.cc_term_query_template,
             "ccterms"),
            (HTPMetaDatasetSampleETL.ccq_expression_query_template,
             "ccqterms"),
            (HTPMetaDatasetSampleETL.uberon_ao_query_template,
             "uberon_ao"),
            (HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
             "uberon_ao_other"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
             "agms"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template,
             "agmstext"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template,
             "assemblies"),
        ]
        query_list = [
            [template, commit_size,
             "htp_metadataset_sample_" + suffix + "_" + data_provider + ".csv"]
            for template, suffix in template_specs
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Пример #26
0
    def _process_sub_type(self, sub_type):
        """Load construct data for one sub type and write it into Neo4j.

        Reads the sub type's JSON file, pairs each construct query
        template with its commit size and CSV filename, then runs the
        generated rows through the CSV and Neo4j transactors.
        """
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Construct Data: %s", provider)
        construct_data = JSONFile().get_data(sub_type.get_filepath())
        self.logger.info("Finished Loading Construct Data: %s", provider)

        if construct_data is None:
            # No payload for this provider — nothing to do.
            self.logger.warning("No Data found for %s skipping", provider)
            return

        # This order is the same as the lists yielded from the
        # get_generators function.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # Each pair expands to (template, commit_size, csv_filename);
        # anything past the first three items is ignored downstream.
        file_specs = [
            (ConstructETL.construct_query_template,
             "Construct_data_"),
            (ConstructETL.construct_secondary_ids_query_template,
             "Construct_secondary_ids_"),
            (ConstructETL.construct_synonyms_query_template,
             "Construct_synonyms_"),
            (ConstructETL.construct_xrefs_query_template,
             "Construct_xrefs_"),
            (ConstructETL.non_bgi_component_query_template,
             "Construct_non_bgi_component_"),
            (ConstructETL.construct_gene_component_query_template,
             "Construct_components_gene"),
            (ConstructETL.construct_no_gene_component_query_template,
             "Construct_components_no_gene"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in file_specs
        ]

        # Obtain the generator
        generators = self.get_generators(construct_data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Пример #27
0
    def _process_sub_type(self, sub_type):
        """Load allele data for one sub type and write it into Neo4j.

        Reads the sub type's JSON file, defines one (query template,
        commit size, CSV filename) triple per allele relationship
        variant, runs the generators through the CSV and Neo4j
        transactors, and reports any accumulated errors.
        """
        data_provider = sub_type.get_data_provider()

        logger.info("Loading Allele Data: %s", data_provider)
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)

        if data is None:
            # Nothing to load for this provider; skip instead of failing.
            logger.warning("No Data found for %s skipping", data_provider)
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [AlleleETL.allele_gene_no_construct_query_template, commit_size,
             "allele_gene_no_construct_data_" + data_provider + ".csv"],
            [AlleleETL.allele_construct_gene_query_template, commit_size,
             "allele_construct_gene_data_" + data_provider + ".csv"],
            [AlleleETL.allele_construct_no_gene_query_template, commit_size,
             "allele_construct_no_gene_data_" + data_provider + ".csv"],
            [AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
             "allele_no_gene_no_construct_data_" + data_provider + ".csv"],
            [AlleleETL.allele_secondaryids_template, commit_size,
             "allele_secondaryids_" + data_provider + ".csv"],
            [AlleleETL.allele_synonyms_template, commit_size,
             "allele_synonyms_" + data_provider + ".csv"],
            [AlleleETL.allele_xrefs_template, commit_size,
             "allele_xrefs_" + data_provider + ".csv"],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Allele-{}: ".format(data_provider))
        logger.info("Finished Loading Allele Data: %s", data_provider)
    def _load_and_process_data(self):
        """Generate automated gene descriptions and load them into Neo4j.

        Builds a genedescriptions DataManager with the GO and DO
        ontologies, then, for every configured data provider (MOD),
        loads GO annotations from the downloaded GAF file and disease
        (plus, where available, expression) annotations from the
        database, generates the descriptions, writes them through the
        CSV/Neo4j transactors, and saves per-provider report files.
        """
        # create gene descriptions data manager and load common data
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        #go_onto_config = data_manager.get_config('GO')
        go_annot_config = data_manager.get_config('GAF')
        #do_onto_config = data_manager.get_config('DOID')
        # Index the GAF sub-types by data provider for the per-MOD lookup below.
        go_annot_sub_dict = {sub.get_data_provider(): sub for sub in go_annot_config.get_sub_type_objects()}
        this_dir = os.path.split(__file__)[0]
        # The genedescriptions config file lives two directories above this module.
        gd_config = GenedescConfigParser(os.path.join(this_dir,
                                                      os.pardir,
                                                      os.pardir,
                                                      "gene_descriptions.yml"))
        gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                     ontology=self.get_ontology(data_type=DataType.GO),
                                     config=gd_config)
        gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                     ontology=self.get_ontology(data_type=DataType.DO),
                                     config=gd_config)
        # generate descriptions for each MOD
        for prvdr in [sub_type.get_data_provider().upper() \
                      for sub_type in self.data_type_config.get_sub_type_objects()]:
            # Deep-copy the config so provider-specific tweaks don't leak
            # into the next iteration.
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                # WB only: omit child terms from expression sentences when
                # their parent term is already present.
                gd_config_mod_specific.config["expression_sentences_options"][
                    "remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)
            # NOTE(review): HUMAN annotations appear to be stored under the
            # RGD provider in the database — confirm before changing.
            data_provider = prvdr if prvdr != "HUMAN" else "RGD"
            json_desc_writer = DescriptionsWriter()
            # GAF file was previously downloaded into ./tmp by the pipeline.
            go_annot_path = "file://" + os.path.join(os.getcwd(),
                                                     "tmp",
                                                     go_annot_sub_dict[prvdr].file_to_download)
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO, associations_url=go_annot_path,
                associations_cache_path=os.path.join(os.getcwd(),
                                                     "tmp",
                                                     "gd_cache",
                                                     "go_annot_" + prvdr + ".gaf"),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(associations_type=DataType.DO,
                                             associations=self.get_disease_annotations_from_db(
                                                 data_provider=data_provider,
                                                 gd_data_manager=gd_data_manager,
                                                 logger=self.logger),
                                             config=gd_config_mod_specific)
            # Expression-based sentences only for providers that have
            # expression data configured.
            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                             ontology=self.get_ontology(data_type=DataType.EXPR,
                                                                        provider=prvdr),
                                             config=gd_config_mod_specific)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=self.get_expression_annotations_from_db(data_provider=data_provider,
                                                                         gd_data_manager=gd_data_manager,
                                                                         logger=self.logger),
                    config=gd_config_mod_specific)
            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(prvdr,
                                             gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size,
                 "genedescriptions_data_" + prvdr + ".csv"]
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)