Exemplo n.º 1
0
    def _process_sub_type(self, sub_type):
        """Load phenotype data for a single sub type (data provider).

        Reads the sub type's already-downloaded JSON file, builds the
        CSV/query template pairs for gene, allele, AGM and PGE phenotype
        relationships, writes the CSVs and queues the Neo4j queries for
        batched execution.
        """
        self.logger.info("Loading Phenotype Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Phenotype Data: %s", sub_type.get_data_provider())
        if data is None:
            self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # Bug fix: batch_size previously reused get_neo4j_commit_size();
        # the generator batch size has its own config accessor, which every
        # sibling loader in this file uses.
        batch_size = self.data_type_config.get_generator_batch_size()
        data_provider = sub_type.get_data_provider()
        # Lazy %-style args: the message is only built if INFO is enabled.
        self.logger.info("subtype: %s", data_provider)

        query_template_list = [
                [self.execute_gene_query_template, commit_size,
                 "phenotype_gene_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_allele_query_template, commit_size,
                 "phenotype_allele_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_agm_query_template, commit_size,
                 "phenotype_agm_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_pges_allele_query_template, commit_size,
                 "phenotype_pges_allele_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_pges_agm_query_template, commit_size,
                 "phenotype_pges_agm_data_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Phenotype-{}: ".format(sub_type.get_data_provider()))
Exemplo n.º 2
0
 def load_genes_mod(self, batch_size, testObject, bgiName, loadFile):
     """Fetch a MOD data dump from S3, extract it, parse its BGI gene
     file and return the gene-list generator."""
     download_dir = "tmp"
     S3File("mod-datadumps", loadFile, download_dir).download()
     TARFile(download_dir, loadFile).extract_all()
     raw_gene_data = JSONFile().get_data(download_dir + bgiName, 'BGI')
     parsed_lists = BGIExt().get_data(raw_gene_data, batch_size, testObject)
     return self.yield_gene_lists(parsed_lists)
    def _process_sub_type(self, sub_type):
        """Load AGM data for a single data provider: parse the JSON file,
        emit one CSV per relationship type and queue the matching Neo4j
        queries for batched execution."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Sequence Targeting Reagent Data: %s",
                         provider)
        filepath = sub_type.get_filepath()
        self.logger.info(filepath)
        data = JSONFile().get_data(filepath)
        self.logger.info(
            "Finished Loading Sequence Targeting Reagent Data: %s",
            provider)

        # Nothing to do when the file yielded no data.
        if data is None:
            self.logger.warning("No Data found for %s skipping", provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, csv-name prefix) pairs; order must match the lists
        # yielded by get_generators. Entries beyond the first three list
        # positions would be ignored downstream.
        file_specs = [
            (self.agm_query_template, "agm_data_"),
            (self.agm_secondary_ids_query_template, "agm_secondary_ids_"),
            (self.agm_synonyms_query_template, "agm_synonyms_"),
            (self.agm_components_query_template, "agm_components_"),
            (self.agm_sqtrs_query_template, "agm_sqtrs_"),
            (self.agm_backgrounds_query_template, "agm_backgrounds_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in file_specs
        ]

        # Obtain the generator
        generators = self.get_generators(data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("AGM-{}: ".format(provider))
Exemplo n.º 4
0
    def load_allele_objects_mod(self, batch_size, testObject, alleleName,
                                loadFile):
        """Fetch and extract a MOD data dump, then parse its allele file
        and return the resulting allele dictionary."""
        work_dir = "tmp"
        S3File("mod-datadumps", loadFile, work_dir).download()
        TARFile(work_dir, loadFile).extract_all()
        raw_alleles = JSONFile().get_data(work_dir + alleleName, 'allele')
        return AlleleExt().get_alleles(raw_alleles, batch_size, testObject)
Exemplo n.º 5
0
    def load_disease_allele_objects_mod(self, batch_size, testObject,
                                        diseaseName, loadFile, graph):
        """Fetch and extract a MOD data dump, then parse its allele
        disease annotations and return the resulting dictionary.

        NOTE(review): testObject is accepted but unused here, mirroring
        the original signature so callers are unaffected.
        """
        work_dir = "tmp"
        S3File("mod-datadumps", loadFile, work_dir).download()
        TARFile(work_dir, loadFile).extract_all()
        raw_disease = JSONFile().get_data(work_dir + diseaseName, 'disease')
        return DiseaseAlleleExt().get_allele_disease_data(raw_disease,
                                                          batch_size, graph)
Exemplo n.º 6
0
    def _process_sub_type(self, sub_type):
        """Load HTP metadata (dataset-level) for one data provider.

        Reads the sub type's JSON file, builds the dataset / category tag /
        publication / xref / secondary-id CSV and query template pairs,
        writes the CSVs and queues the Neo4j queries for batched execution.
        """
        # Lazy %-args: message built only when the log level is enabled
        # (previously eager "%" formatting).
        logger.info("Loading HTP metadata Data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata Data: %s",
                    sub_type.get_data_provider())

        if data is None:
            # Bug fix: Logger.warn() is deprecated; warning() is canonical.
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
                "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_category_tags_query_template,
                commit_size,
                "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
                "htp_metadataset_publications_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
                "htp_metadataset_xrefs_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
                "htp_metadataset_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Exemplo n.º 7
0
    def _process_sub_type(self, sub_type):
        """Load variant data for one data provider: parse the JSON file,
        emit CSVs for variants, genomic locations, SO terms and xrefs,
        and queue the matching Neo4j queries for batched execution."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Variation Data: %s", provider)
        data = JSONFile().get_data(sub_type.get_filepath())
        self.logger.info("Finished Loading Variation Data: %s", provider)

        # Skip providers whose file produced no data.
        if data is None:
            self.logger.warning("No Data found for %s skipping", provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, csv-name prefix) pairs in the same order as the lists
        # yielded by get_generators; extra list positions beyond the first
        # three would be ignored downstream.
        file_specs = [
            (self.variation_query_template, "variation_data_"),
            (self.genomic_locations_query_template,
             "variant_genomiclocations_"),
            (self.so_terms_query_template, "variant_so_terms_"),
            (self.xrefs_query_template, "variant_xrefs_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in file_specs
        ]

        generators = self.get_generators(data, batch_size)
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Var-{}: ".format(provider))
Exemplo n.º 8
0
    def _process_sub_type(self, sub_type):
        """Load disease annotation data for one data provider.

        Parses the JSON file, emits one CSV per annotation relationship
        (allele, gene, AGM, PGEs, withs, evidence codes, xrefs) and queues
        the matching Neo4j queries for batched execution.
        """
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Disease Data: %s", provider)
        data = JSONFile().get_data(sub_type.get_filepath())
        self.logger.info("Finished Loading Disease Data: %s", provider)

        if data is None:
            self.logger.warning("No Data found for %s skipping", provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, csv-name prefix) pairs; the order must match the lists
        # yielded by get_generators. Extra positions would be ignored.
        file_specs = [
            (self.execute_allele_query_template, "disease_allele_data_"),
            (self.execute_gene_query_template, "disease_gene_data_"),
            (self.execute_agms_query_template, "disease_agms_data_"),
            (self.execute_pges_gene_query_template, "disease_pges_gene_data_"),
            (self.execute_pges_allele_query_template,
             "disease_pges_allele_data_"),
            (self.execute_pges_agm_query_template, "disease_pges_agms_data_"),
            (self.execute_withs_query_template, "disease_withs_data_"),
            (self.execute_ecode_query_template, "disease_evidence_code_data_"),
            (self.execute_annotation_xrefs_query_template,
             "disease_annotation_xrefs_data_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in file_specs
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size, provider)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Disease-{}: ".format(provider))
        self.logger.info("Finished Loading Disease Data: %s", provider)
    def _process_sub_type(self, sub_type):
        """Load HTP metadata sample data for one data provider.

        Reads the sub type's JSON file, builds the per-relationship CSV /
        query template pairs (samples, bio entities, secondary ids, stages,
        AO/CC terms, UBERON mappings, AGMs, assemblies) and queues the
        Neo4j queries for batched execution.
        """
        # Lazy %-args: message built only when the log level is enabled
        # (previously eager "%" formatting).
        logger.info("Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())

        if data is None:
            # Bug fix: Logger.warn() is deprecated; warning() is canonical.
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
                commit_size, "htp_metadataset_sample_samples_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_bio_entity_expression_query_template, commit_size,
                "htp_metadataset_sample_bioentities_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
                commit_size, "htp_metadataset_sample_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
                commit_size, "htp_metadataset_sample_datasets_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
                "htp_metadataset_sample_stages_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
                "htp_metadataset_sample_aoterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_substructures_query_template,
                commit_size, "htp_metadataset_sample_ao_substructures_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_ss_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
                # Bug fix: the trailing "_" was missing, producing filenames
                # like "htp_metadataset_sample_ccterms<PROVIDER>.csv" unlike
                # every other entry in this list.
                "htp_metadataset_sample_ccterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ccq_expression_query_template,
                commit_size, "htp_metadataset_sample_ccqterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
                "htp_metadataset_sample_uberon_ao_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
                commit_size, "htp_metadataset_sample_uberon_ao_other_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
                commit_size, "htp_metadataset_sample_agms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_agmtext_query_template, commit_size,
                "htp_metadataset_sample_agmstext_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_assemblies_query_template, commit_size,
                "htp_metadataset_sample_assemblies_" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Exemplo n.º 10
0
    def _process_sub_type(self, sub_type, query_tracking_list):
        """Load BGI (basic gene information) data for one data provider.

        Parses the JSON file, writes one CSV per gene relationship type and
        appends the resulting query/file pairs to query_tracking_list for
        later execution (queries are not executed here).
        """
        provider = sub_type.get_data_provider()

        self.logger.info("Loading BGI Data: %s", provider)
        filepath = sub_type.get_filepath()
        # A missing input file is fatal for BGI: abort the whole run.
        if filepath is None:
            self.logger.error("Can't find input file for %s", sub_type)
            sys.exit()

        data = JSONFile().get_data(filepath)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, commit size, csv-name prefix) triples, in the same
        # order as the lists yielded by get_generators. The synonyms entry
        # deliberately uses a larger commit size than the configured one.
        file_specs = [
            (self.gene_metadata_query_template, commit_size,
             "gene_metadata_"),
            (self.gene_query_template, commit_size, "gene_data_"),
            (self.basic_gene_load_relations_query_template, commit_size,
             "gene_data_load_"),
            (self.basic_gene_species_relations_query_template, commit_size,
             "gene_data_species_"),
            (self.so_terms_query_template, commit_size, "gene_so_terms_"),
            (self.chromosomes_query_template, commit_size,
             "gene_chromosomes_"),
            (self.gene_secondary_ids_query_template, commit_size,
             "gene_secondary_ids_"),
            (self.genomic_locations_query_template, commit_size,
             "gene_genomic_locations_"),
            (self.xrefs_query_template, commit_size,
             "gene_cross_references_"),
            (self.xrefs_relationships_query_template, commit_size,
             "gene_cross_references_relationships_"),
            (self.gene_synonyms_query_template, 600000, "gene_synonyms_"),
        ]
        query_template_list = [
            [template, size, prefix + provider + ".csv"]
            for template, size, prefix in file_specs
        ]

        # Obtain the generator
        generators = self.get_generators(data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        query_tracking_list.extend(query_and_file_list)

        self.error_messages("BGI-{}: ".format(provider))
        self.logger.info("Finished Loading BGI Data: %s", provider)
Exemplo n.º 11
0
    def _process_sub_type(self, sub_type):
        """Load construct data for one data provider: parse the JSON file,
        emit one CSV per construct relationship and queue the matching
        Neo4j queries for batched execution."""
        provider = sub_type.get_data_provider()

        self.logger.info("Loading Construct Data: %s", provider)
        data = JSONFile().get_data(sub_type.get_filepath())
        self.logger.info("Finished Loading Construct Data: %s", provider)

        # Skip providers whose file produced no data.
        if data is None:
            self.logger.warning("No Data found for %s skipping", provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, csv-name prefix) pairs in the same order as the lists
        # yielded by get_generators; extra list positions beyond the first
        # three would be ignored downstream.
        file_specs = [
            (ConstructETL.construct_query_template, "Construct_data_"),
            (ConstructETL.construct_secondary_ids_query_template,
             "Construct_secondary_ids_"),
            (ConstructETL.construct_synonyms_query_template,
             "Construct_synonyms_"),
            (ConstructETL.construct_xrefs_query_template,
             "Construct_xrefs_"),
            (ConstructETL.non_bgi_component_query_template,
             "Construct_non_bgi_component_"),
            (ConstructETL.construct_gene_component_query_template,
             "Construct_components_gene"),
            (ConstructETL.construct_no_gene_component_query_template,
             "Construct_components_no_gene"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in file_specs
        ]

        # Obtain the generator
        generators = self.get_generators(data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Exemplo n.º 12
0
    def __init__(self, config_file_loc):
        """Load the loader config and validation YAML files, fetch the FMS
        snapshot for the configured release, merge in locally-declared data
        files, then process the config.

        :param config_file_loc: path to the loader config YAML file.
        """
        self.context_info = ContextInfo()

        # Load config yaml. Bug fix: use "with" so the file handle is
        # closed (previously both YAML files were opened and leaked).
        logger.debug('Loading config file: %s', config_file_loc)
        with open(config_file_loc, 'r') as config_file:
            self.config_data = yaml.load(config_file, Loader=yaml.SafeLoader)
        logger.debug("Config Data: %s", self.config_data)

        # Load validation yaml.
        validation_yaml_file_loc = os.path.abspath('src/config/validation.yml')
        logger.debug('Loading validation schema: %s',
                     validation_yaml_file_loc)
        with open(validation_yaml_file_loc, 'r') as validation_schema_file:
            self.validation_schema = yaml.load(validation_schema_file,
                                               Loader=yaml.SafeLoader)

        # Assign values for thread counts.
        self.FileTransactorThreads = self.config_data['FileTransactorThreads']

        # Loading a JSON blurb from a file as a placeholder for submission system query.
        other_file_meta_data = os.path.abspath(
            'src/config/local_submission.json')
        self.non_submission_system_data = JSONFile().get_data(
            other_file_meta_data)
        urllib3.disable_warnings()
        self.http = urllib3.PoolManager()

        # use the recently created snapshot
        api_url = self.context_info.env[
            "FMS_API_URL"] + '/api/snapshot/release/' + self.context_info.env[
                "ALLIANCE_RELEASE"]
        logger.info(api_url)

        submission_data = self.http.request('GET', api_url)

        # A non-200 response means there is nothing to load: abort.
        if submission_data.status != 200:
            logger.error("Status: %s", submission_data.status)
            logger.error("No Data came from API: %s", api_url)
            sys.exit(-1)

        self.snapshot_submission_system_data = json.loads(
            submission_data.data.decode('UTF-8'))
        logger.debug(self.snapshot_submission_system_data)

        # Merge locally-declared data files into the snapshot's list so
        # downstream processing treats them like submitted files.
        for data_file in self.non_submission_system_data['snapShot'][
                'dataFiles']:
            self.snapshot_submission_system_data['snapShot'][
                'dataFiles'].append(data_file)

        logger.debug(self.snapshot_submission_system_data)

        # List used for MOD and data type objects.
        self.master_data_dictionary = {}

        # Dictionary for transformed submission system data.
        self.transformed_submission_system_data = {}

        # process config file during initialization
        self.process_config()
Exemplo n.º 13
0
    def _process_sub_type(self, sub_type):
        """Load allele data for one data provider.

        Reads the sub type's JSON file, builds the CSV/query template pairs
        for each allele relationship variant (with/without gene and
        construct, secondary ids, synonyms, xrefs), writes the CSVs and
        queues the Neo4j queries for batched execution.
        """
        # Lazy %-args: message built only when the log level is enabled
        # (previously eager "%" formatting).
        logger.info("Loading Allele Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)

        if data is None:
            # Bug fix: Logger.warn() is deprecated; warning() is canonical.
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                AlleleETL.allele_gene_no_construct_query_template, commit_size,
                "allele_gene_no_construct_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_construct_gene_query_template, commit_size,
                "allele_construct_gene_data_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                AlleleETL.allele_construct_no_gene_query_template, commit_size,
                "allele_construct_no_gene_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_no_gene_no_construct_query_template,
                commit_size, "allele_no_gene_no_construct_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_secondaryids_template, commit_size,
                "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_synonyms_template, commit_size,
                "allele_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_xrefs_template, commit_size,
                "allele_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Allele-{}: ".format(sub_type.get_data_provider()))
        logger.info("Finished Loading Allele Data: %s",
                    sub_type.get_data_provider())