def _process_sub_type(self, sub_type):
    """Load phenotype data for one sub type into Neo4j.

    Reads the sub type's JSON file, builds one CSV/query pair per
    phenotype relationship kind (gene, allele, AGM, and the two PGE
    joins), writes the CSVs, and executes the queries.
    """
    data_provider = sub_type.get_data_provider()
    self.logger.info("Loading Phenotype Data: %s", data_provider)
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Phenotype Data: %s", data_provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", data_provider)
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    # FIX: batch_size was previously taken from get_neo4j_commit_size();
    # every sibling loader derives it from get_generator_batch_size().
    batch_size = self.data_type_config.get_generator_batch_size()

    # Lazy %-args instead of eager string concatenation.
    self.logger.info("subtype: %s", data_provider)

    # (template, commit_size, csv_filename) — extra members are ignored
    # by process_query_params.
    query_template_list = [
        [self.execute_gene_query_template, commit_size,
         "phenotype_gene_data_" + data_provider + ".csv"],
        [self.execute_allele_query_template, commit_size,
         "phenotype_allele_data_" + data_provider + ".csv"],
        [self.execute_agm_query_template, commit_size,
         "phenotype_agm_data_" + data_provider + ".csv"],
        [self.execute_pges_allele_query_template, commit_size,
         "phenotype_pges_allele_data_" + data_provider + ".csv"],
        [self.execute_pges_agm_query_template, commit_size,
         "phenotype_pges_agm_data_" + data_provider + ".csv"]
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Phenotype-{}: ".format(data_provider))
def _process_sub_type(self, sub_type):
    """Load transcript, exon, and related location data for one sub type."""
    dp = sub_type.get_data_provider()
    self.logger.info("Loading Transcript Data: %s", dp)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    filepath = sub_type.get_filepath()

    # Each entry expands to (template, commit_size, csv_filename);
    # anything beyond the first three members would be ignored.
    spec = (
        (self.transcript_alternate_id_query_template, "transcript_gff3ID_data_"),
        (self.transcript_query_template, "transcript_data_"),
        (self.chromosomes_query_template, "transcript_data_chromosome_"),
        (self.genomic_locations_query_template, "transcript_genomic_locations_"),
        (self.exon_query_template, "exon_data_"),
        (self.exon_genomic_locations_template, "exon_genomic_location_data_"),
    )
    query_template_list = [
        [template, commit_size, prefix + dp + ".csv"]
        for template, prefix in spec
    ]

    # Generator over the GFF3 file, consumed by the CSV writer below.
    generators = self.get_generators(filepath, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _load_and_process_data(self):
    """Load DO ontology terms, relations, synonyms, xrefs, and alt ids."""
    filepath = self.data_type_config.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    term_generators = self.get_generators(filepath, batch_size)

    # One output CSV per DO query template.
    templates_and_files = (
        (self.do_query_template, "do_term_data.csv"),
        (self.doterm_isas_query_template, "do_isas_data.csv"),
        (self.doterm_synonyms_query_template, "do_synonyms_data.csv"),
        (self.xrefs_query_template, "do_xrefs_data.csv"),
        (self.doterm_alt_ids_query_template, "do_alt_ids_data.csv"),
    )
    query_template_list = [
        [template, commit_size, filename]
        for template, filename in templates_and_files
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(term_generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("DO-?: ")
def _process_sub_type(self, sub_type):
    """Compute and load the isa/part-of closure for one data provider."""
    provider = sub_type.get_data_provider()
    self.logger.info(provider)

    # DOID submissions are stored under the 'DO' provider name.
    if provider == 'DOID':
        provider = 'DO'

    self.logger.debug("Starting isa_partof_ Closure for: %s", provider)

    # (template, commit_size, csv_filename, param, param) — the trailing
    # provider values are substituted into the query template.
    query_list = [
        [self.insert_isa_partof_closure_query_template,
         "100000",
         "isa_partof_closure_" + provider + ".csv",
         provider,
         provider],
    ]

    generators = self.get_closure_terms(provider)
    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Closure-{}: ".format(provider))
    self.logger.debug("Finished isa_partof Closure for: %s", provider)
def _process_sub_type(self, sub_type):
    """Load ECOMAP ontology mapping data for one sub type into Neo4j."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading ECOMAP Ontology Data: %s", provider)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    filepath = sub_type.get_filepath()

    # Single query/file pair; extra tuple members would be ignored.
    query_template_list = [
        [self.eco_query_template,
         commit_size,
         "ecomap_data_" + provider + ".csv"],
    ]

    # Row generator over the ecomap file.
    generators = self.get_generators(filepath, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Ecomap-{}: ".format(provider))
    self.logger.info("Finished Loading ECOMAP Data: %s", provider)
def _load_and_process_data(self):
    """Create and load GEO cross-reference data for every sub type."""
    for sub_type in self.data_type_config.get_sub_type_objects():
        provider = sub_type.get_data_provider()
        # URL-encode the species name for use in the GEO query.
        species_encoded = urllib.parse.quote_plus(
            ETLHelper.species_lookup_by_data_provider(provider))

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # Batch size is pinned here rather than read from the config
        # (the config-based call was deliberately disabled upstream).
        batch_size = 100000

        generators = self.get_generators(sub_type, batch_size, species_encoded)

        query_template_list = [
            [self.geo_xref_query_template,
             commit_size,
             "geo_xref_data_" + provider + ".csv"],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load AGM (affected genomic model) data for one sub type into Neo4j.

    NOTE(review): the log messages say "Sequence Targeting Reagent Data"
    but every filename, template, and the error prefix refer to AGM —
    confirm which label is correct before renaming anything.
    """
    self.logger.info("Loading Sequence Targeting Reagent Data: %s",
                     sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    self.logger.info(filepath)
    data = JSONFile().get_data(filepath)
    self.logger.info(
        "Finished Loading Sequence Targeting Reagent Data: %s",
        sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping",
                            sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [
            self.agm_query_template, commit_size,
            "agm_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.agm_secondary_ids_query_template, commit_size,
            "agm_secondary_ids_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.agm_synonyms_query_template, commit_size,
            "agm_synonyms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.agm_components_query_template, commit_size,
            "agm_components_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.agm_sqtrs_query_template, commit_size,
            "agm_sqtrs_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.agm_backgrounds_query_template, commit_size,
            "agm_backgrounds_" + sub_type.get_data_provider() + ".csv"
        ]
    ]

    # Obtain the generator
    generators = self.get_generators(data, sub_type.get_data_provider(),
                                     batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("AGM-{}: ".format(sub_type.get_data_provider()))
def _load_and_process_data(self):
    """Load MI (molecular interaction) ontology terms into Neo4j."""
    filepath = self.data_type_config.get_single_filepath()
    term_generators = self.get_generators(filepath)

    # Single query/file pair with a fixed commit size of 10000.
    queries = self.process_query_params(
        [[self.main_query_template, 10000, "mi_term_data.csv"]])

    CSVTransactor.save_file_static(term_generators, queries)
    Neo4jTransactor.execute_query_batch(queries)
def _load_and_process_data(self):
    """Load species metadata from the Alliance schema repository."""
    # Species definitions come straight from the published schema YAML.
    filepath = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
    species_generators = self.get_generators(filepath)

    # Single query/file pair with a fixed commit size of 10000.
    queries = self.process_query_params(
        [[self.main_query_template, 10000, "species_data.csv"]])

    CSVTransactor.save_file_static(species_generators, queries)
    Neo4jTransactor.execute_query_batch(queries)
def _process_sub_type(self, sub_type):
    """Load HTP (high-throughput) dataset metadata for one sub type.

    Reads the sub type's JSON file, builds one CSV/query pair per
    metadata aspect (dataset, tags, publications, xrefs, secondary ids),
    writes the CSVs, and executes the queries in Neo4j.
    """
    # FIX: use lazy %-args instead of eager "%" interpolation so the
    # message is only formatted when the log level is enabled.
    logger.info("Loading HTP metadata Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata Data: %s",
                sub_type.get_data_provider())

    if data is None:
        # FIX: Logger.warn is deprecated; warning() is the supported name.
        logger.warning("No Data found for %s skipping",
                       sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [
            HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
            "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetETL.htp_category_tags_query_template, commit_size,
            "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
            "htp_metadataset_publications_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
            "htp_metadataset_xrefs_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
            "htp_metadataset_secondaryIds_" + sub_type.get_data_provider() + ".csv"
        ],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _load_and_process_data(self):
    """Execute the molecular-interaction xref queries against Neo4j.

    No CSVs are generated here; the queries are run against the
    previously written mol_int_xref.csv file.
    """
    commit_size = self.data_type_config.get_neo4j_commit_size()

    # NOTE(review): both entries reference the same mol_int_xref.csv —
    # presumably intentional (two passes over one file); confirm.
    query_template_list = [
        [self.query_xref_query_template,
         commit_size,
         "mol_int_xref.csv"],
        [self.xrefs_relationships_query_template,
         commit_size,
         "mol_int_xref.csv"],
    ]

    query_list = self.process_query_params(query_template_list)
    Neo4jTransactor.execute_query_batch(query_list)
def _load_and_process_data(self):
    """Attach ribbonless expression entities to the expression ribbon."""
    self.logger.info("Starting Expression Ribbon Data")

    # Single query/file pair; commit size is a fixed string here.
    ribbon_queries = self.process_query_params([
        [self.insert_ribonless_ebes_query_template,
         "30000",
         "expression_ribbonless_ebes" + ".csv"],
    ])

    ribbon_generators = self.get_ribbon_terms()
    CSVTransactor.save_file_static(ribbon_generators, ribbon_queries)
    Neo4jTransactor.execute_query_batch(ribbon_queries)

    self.logger.info("Finished Expression Ribbon Data")
def _process_sub_type(self, sub_type):
    """Load a generic ontology (terms, isa/partof relations, synonyms,
    alt ids) for one sub type into Neo4j.

    The data provider name doubles as the ontology node label, so it is
    appended to each query template's parameter list.
    """
    self.logger.info("Loading Generic Ontology Data: %s",
                     sub_type.get_data_provider())
    filepath = sub_type.get_filepath()

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    ont_type = sub_type.get_data_provider()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [
            # Terms use a larger, hard-coded commit size than the config value.
            self.generic_ontology_term_query_template, 600000,
            "generic_ontology_term_" + ont_type + ".csv", ont_type
        ],
        [
            # Relation queries take the ontology label twice (source and target).
            self.generic_ontology_isas_query_template, commit_size,
            "generic_ontology_isas_" + ont_type + ".csv", ont_type, ont_type
        ],
        [
            self.generic_ontology_partofs_query_template, commit_size,
            "generic_ontology_partofs_" + ont_type + ".csv", ont_type, ont_type
        ],
        [
            # Synonyms also use a hard-coded commit size.
            self.generic_ontology_synonyms_query_template, 400000,
            "generic_ontology_synonyms_" + ont_type + ".csv", ont_type
        ],
        [
            self.generic_ontology_altids_query_template, commit_size,
            "generic_ontology_altids_" + ont_type + ".csv", ont_type
        ],
    ]

    # Obtain the generator
    generators = self.get_generators(filepath, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.logger.info("Finished Loading Generic Ontology Data: %s",
                     sub_type.get_data_provider())
def _process_sub_type(self, sub_type):
    """Stub loader: write one CSV from the sub type's file and run its query.

    NOTE(review): this calls sub_type.get_single_filepath(), whereas the
    other loaders read the single filepath from data_type_config — verify
    the sub_type object actually provides this method.
    """
    filepath = sub_type.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    stub_generators = self.get_generators(filepath, batch_size)

    queries = self.process_query_params([
        [self.query_template, commit_size, "stub_data.csv"],
    ])

    CSVTransactor.save_file_static(stub_generators, queries)
    Neo4jTransactor.execute_query_batch(queries)
def _process_sub_type(self, sub_type):
    """Load variant data (variants, locations, SO terms, xrefs) for one sub type."""
    dp = sub_type.get_data_provider()
    self.logger.info("Loading Variation Data: %s", dp)
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Variation Data: %s", dp)

    if data is None:
        self.logger.warning("No Data found for %s skipping", dp)
        return

    # This order matches the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Each entry expands to (template, commit_size, csv_filename).
    spec = (
        (self.variation_query_template, "variation_data_"),
        (self.genomic_locations_query_template, "variant_genomiclocations_"),
        (self.so_terms_query_template, "variant_so_terms_"),
        (self.xrefs_query_template, "variant_xrefs_"),
    )
    query_template_list = [
        [template, commit_size, prefix + dp + ".csv"]
        for template, prefix in spec
    ]

    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Var-{}: ".format(dp))
def _process_sub_type(self, sub_type):
    """Load VEP transcript-consequence data for one sub type into Neo4j."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading VEP Data: %s", provider)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    filepath = sub_type.get_filepath()

    # Single query/file pair; extra members would be ignored.
    queries = self.process_query_params([[
        self.vep_transcript_query_template,
        commit_size,
        "vep_transcript_data_" + provider + ".csv",
    ]])

    # Note: this generator takes no batch size, unlike most loaders.
    vep_generators = self.get_generators(filepath)

    CSVTransactor.save_file_static(vep_generators, queries)
    Neo4jTransactor.execute_query_batch(queries)
def _load_and_process_data(self):
    """Process every sub type in a separate OS process, collecting the
    queries each one queues, then execute them all in a single batch."""
    # Manager-backed list so child processes can append queries safely.
    tracked_queries = multiprocessing.Manager().list()

    workers = []
    for sub_type in self.data_type_config.get_sub_type_objects():
        worker = multiprocessing.Process(target=self._process_sub_type,
                                         args=(sub_type, tracked_queries))
        worker.start()
        workers.append(worker)

    ETL.wait_for_threads(workers)

    # Copy out of the proxy list before handing to the transactor.
    Neo4jTransactor.execute_query_batch(list(tracked_queries))
def _process_sub_type(self, subtype):
    """Create gene-disease annotations inferred via orthology and load them."""
    self.logger.info("Starting Gene Disease Ortho Data: %s", subtype)

    # Single query/file pair; commit size is a fixed string here.
    ortho_queries = self.process_query_params([
        [self.insert_gene_disease_ortho_query_template,
         "10000",
         "gene_disease_by_orthology.csv"],
    ])

    self.logger.info("gene disease ortho pub created")

    ortho_generators = self.retrieve_gene_disease_ortho()
    CSVTransactor.save_file_static(ortho_generators, ortho_queries)
    Neo4jTransactor.execute_query_batch(ortho_queries)

    self.logger.info("Finished Gene Disease Ortho Data")
def _process_sub_type(self, sub_type):
    """Load disease-annotation data for one sub type into Neo4j.

    Builds one CSV/query pair per annotation aspect (allele, gene, AGM,
    the three PGE joins, WITH fields, evidence codes, annotation xrefs).
    """
    self.logger.info("Loading Disease Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    # NOTE(review): "Finished Loading" here refers to reading the file;
    # the same message is logged again after the Neo4j load below.
    self.logger.info("Finished Loading Disease Data: %s",
                     sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping",
                            sub_type.get_data_provider())
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [self.execute_allele_query_template, commit_size,
         "disease_allele_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_gene_query_template, commit_size,
         "disease_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_agms_query_template, commit_size,
         "disease_agms_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_pges_gene_query_template, commit_size,
         "disease_pges_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_pges_allele_query_template, commit_size,
         "disease_pges_allele_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_pges_agm_query_template, commit_size,
         "disease_pges_agms_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_withs_query_template, commit_size,
         "disease_withs_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_ecode_query_template, commit_size,
         "disease_evidence_code_data_" + sub_type.get_data_provider() + ".csv"],
        [self.execute_annotation_xrefs_query_template, commit_size,
         "disease_annotation_xrefs_data_" + sub_type.get_data_provider() + ".csv"]
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size,
                                     sub_type.get_data_provider())

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)

    self.error_messages("Disease-{}: ".format(sub_type.get_data_provider()))
    self.logger.info("Finished Loading Disease Data: %s",
                     sub_type.get_data_provider())
def _load_and_process_data(self):
    """Load GO terms plus their relations, synonyms, and secondary ids."""
    filepath = self.data_type_config.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    term_generators = self.get_generators(filepath, batch_size)

    # One output CSV per GO query template.
    templates_and_files = (
        (self.main_query_template, "go_term_data.csv"),
        (self.goterm_isas_query_template, "go_isas_data.csv"),
        (self.goterm_partofs_query_template, "go_partofs_data.csv"),
        (self.goterm_synonyms_query_template, "go_synonym_data.csv"),
        (self.goterm_regulates_query_template, "go_regulates_data.csv"),
        (self.goterm_negatively_regulates_query_template,
         "go_negatively_regulates_data.csv"),
        (self.goterm_positively_regulates_query_template,
         "go_positively_regulates_data.csv"),
        (self.goterm_secondary_query_template, "goterm_secondary_data.csv"),
    )
    query_template_list = [
        [template, commit_size, filename]
        for template, filename in templates_and_files
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(term_generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages()
def _load_and_process_data(self):
    """Fan out sub-type processing across processes, then execute the
    collected queries in a deliberately randomized pairwise order.

    Queries whose filename mentions "algorithm" are held back and run
    last in one batch; the rest are executed one at a time following
    the provider-pair schedule from get_randomized_list, waiting for
    the Neo4j queues to drain between pair sets.
    """
    # Provider names for all sub types; passed to each worker and used
    # to build the randomized pair schedule below.
    sub_types = []
    for sub_type in self.data_type_config.get_sub_type_objects():
        sub_types.append(sub_type.get_data_provider())

    thread_pool = []
    # Manager-backed list so child processes can append queries safely.
    query_tracking_list = multiprocessing.Manager().list()
    for sub_type in self.data_type_config.get_sub_type_objects():
        process = multiprocessing.Process(target=self._process_sub_type,
                                          args=(sub_type, sub_types,
                                                query_tracking_list))
        process.start()
        thread_pool.append(process)

    ETL.wait_for_threads(thread_pool)

    # Copy out of the proxy list.
    queries = []
    for item in query_tracking_list:
        queries.append(item)

    # Separate out "algorithm" queries; they run last, in one batch.
    algo_queries = []
    for item in queries:
        if "algorithm" in item[1]:
            algo_queries.append(item)

    main_list = self.get_randomized_list(sub_types)
    for file_set in main_list:
        for pair in file_set:
            # Match queries whose filename (item[1]) embeds this
            # provider pair, e.g. "<a>_<b>".
            for item in queries:
                if pair[0] + "_" + pair[1] in item[1]:
                    self.logger.debug("Pair: %s Item: %s", pair, item[1])
                    Neo4jTransactor.execute_query_batch([item])
        # Drain the queues before starting the next pair set so the
        # pair ordering actually serializes conflicting writes.
        Neo4jTransactor().wait_for_queues()

    Neo4jTransactor.execute_query_batch(algo_queries)
    self.error_messages()
def _load_and_process_data(self):
    """Load molecular-interaction data plus its cross-references."""
    # filepath = self.data_type_config.get_single_filepath()
    # Temporary fix for 3.0 release: read the pre-staged TSV directly.
    filepath = 'tmp/alliance_molecular_interactions.tsv'

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    interaction_generators = self.get_generators(filepath, batch_size)

    # One output CSV per interaction query template.
    templates_and_files = (
        (self.main_query_template, "mol_int_data.csv"),
        (self.xref_query_template, "mol_int_xref.csv"),
        (self.mod_xref_query_template, "mol_int_mod_xref.csv"),
    )
    query_template_list = [
        [template, commit_size, filename]
        for template, filename in templates_and_files
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(interaction_generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
    """Attach Expression Atlas cross-references to genes for one provider."""
    data_provider = sub_type.get_data_provider()

    # Resolve which genes have Expression Atlas pages for this provider.
    expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
        sub_type, data_provider, ensg_to_gene_primary_id_map)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    atlas_generators = self.get_generators(expression_atlas_gene_pages,
                                           data_provider,
                                           batch_size)

    queries = self.process_query_params([
        [self.add_expression_atlas_crossreferences_query_template,
         commit_size,
         "expression_atlas_" + data_provider + "_data.csv"],
    ])

    CSVTransactor.save_file_static(atlas_generators, queries)
    Neo4jTransactor.execute_query_batch(queries)
    self.error_messages("ExpAtlas-{}: ".format(sub_type.get_data_provider()))
def run_loader(self):
    """Main function for running loader.

    Downloads and validates source files, starts the Neo4j transactor
    threads, optionally creates indices, runs the ETL groups, and logs
    per-step and total elapsed times.
    """
    if self.args.verbose:
        # FIX: Logger.warn is deprecated; warning() is the supported name.
        self.logger.warning('DEBUG mode enabled!')
        time.sleep(3)

    data_manager = DataFileManager(self.context_info.config_file_location)

    file_transactor = FileTransactor()
    file_transactor.start_threads(
        data_manager.get_file_transactor_thread_settings())

    data_manager.download_and_validate()
    self.logger.debug("finished downloading now doing thread")
    file_transactor.check_for_thread_errors()
    self.logger.debug("finished threads waiting for queues")
    file_transactor.wait_for_queues()
    self.logger.debug("finished queues waiting for shutdown")
    file_transactor.shutdown()

    neo_transactor = Neo4jTransactor()
    neo_transactor.start_threads(
        data_manager.get_neo_transactor_thread_settings())
    self.logger.debug("finished starting neo threads ")

    # Indices are skipped when loading from a pickled database image.
    if not self.context_info.env["USING_PICKLE"]:
        self.logger.info("Creating indices.")
        Neo4jHelper.create_indices()

    etl_time_tracker_list = self.run_etl_groups(self.logger,
                                                data_manager,
                                                neo_transactor)

    neo_transactor.shutdown()

    elapsed_time = time.time() - self.start_time
    for time_item in etl_time_tracker_list:
        self.logger.info(time_item)

    # FIX: lazy %-args instead of eager "%" interpolation.
    self.logger.info('Loader finished. Elapsed time: %s',
                     time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
def _process_sub_type(self, sub_type):
    """Load HTP dataset-sample data for one sub type into Neo4j.

    Builds one CSV/query pair per sample aspect (samples, bio entities,
    secondary ids, dataset joins, stages, AO/CC terms and qualifiers,
    UBERON mappings, AGMs, assemblies), writes the CSVs, and executes
    the queries.
    """
    # FIX: lazy %-args instead of eager "%" interpolation.
    logger.info("Loading HTP metadata sample data: %s",
                sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata sample data: %s",
                sub_type.get_data_provider())

    if data is None:
        # FIX: Logger.warn is deprecated; warning() is the supported name.
        logger.warning("No Data found for %s skipping",
                       sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [
            HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
            commit_size,
            "htp_metadataset_sample_samples_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template,
            commit_size,
            "htp_metadataset_sample_bioentities_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
            commit_size,
            "htp_metadataset_sample_secondaryIds_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
            commit_size,
            "htp_metadataset_sample_datasets_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_stages_query_template,
            commit_size,
            "htp_metadataset_sample_stages_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.ao_terms_query_template,
            commit_size,
            "htp_metadataset_sample_aoterms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.ao_substructures_query_template,
            commit_size,
            "htp_metadataset_sample_ao_substructures_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
            commit_size,
            "htp_metadataset_sample_ao_qualifiers_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
            commit_size,
            "htp_metadataset_sample_ao_ss_qualifiers_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.cc_term_query_template,
            commit_size,
            "htp_metadataset_sample_ccterms" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.ccq_expression_query_template,
            commit_size,
            "htp_metadataset_sample_ccqterms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.uberon_ao_query_template,
            commit_size,
            "htp_metadataset_sample_uberon_ao_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
            commit_size,
            "htp_metadataset_sample_uberon_ao_other_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
            commit_size,
            "htp_metadataset_sample_agms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template,
            commit_size,
            "htp_metadataset_sample_agmstext_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template,
            commit_size,
            "htp_metadataset_sample_assemblies_" + sub_type.get_data_provider() + ".csv"
        ]
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load construct data for one sub type into Neo4j.

    Builds one CSV/query pair per construct aspect (constructs,
    secondary ids, synonyms, xrefs, and the three component joins).
    """
    self.logger.info("Loading Construct Data: %s",
                     sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    # NOTE(review): "Finished Loading" here refers to reading the file,
    # not the Neo4j load below.
    self.logger.info("Finished Loading Construct Data: %s",
                     sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping",
                            sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [
            ConstructETL.construct_query_template, commit_size,
            "Construct_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            ConstructETL.construct_secondary_ids_query_template, commit_size,
            "Construct_secondary_ids_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            ConstructETL.construct_synonyms_query_template, commit_size,
            "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            ConstructETL.construct_xrefs_query_template, commit_size,
            "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            ConstructETL.non_bgi_component_query_template, commit_size,
            "Construct_non_bgi_component_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            # NOTE(review): no underscore before the provider in these two
            # filenames — preserved as-is; confirm whether intentional.
            ConstructETL.construct_gene_component_query_template, commit_size,
            "Construct_components_gene" + sub_type.get_data_provider() + ".csv"
        ],
        [
            ConstructETL.construct_no_gene_component_query_template, commit_size,
            "Construct_components_no_gene" + sub_type.get_data_provider() + ".csv"
        ]
    ]

    # Obtain the generator
    generators = self.get_generators(data, sub_type.get_data_provider(),
                                     batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load allele data for one sub type into Neo4j.

    Builds one CSV/query pair per allele relationship shape (the four
    gene/construct combinations, secondary ids, synonyms, xrefs),
    writes the CSVs, and executes the queries.
    """
    # FIX: lazy %-args instead of eager "%" interpolation.
    logger.info("Loading Allele Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)

    if data is None:
        # FIX: Logger.warn is deprecated; warning() is the supported name.
        logger.warning("No Data found for %s skipping",
                       sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [
            AlleleETL.allele_gene_no_construct_query_template, commit_size,
            "allele_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_construct_gene_query_template, commit_size,
            "allele_construct_gene_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_construct_no_gene_query_template, commit_size,
            "allele_construct_no_gene_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
            "allele_no_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_secondaryids_template, commit_size,
            "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_synonyms_template, commit_size,
            "allele_synonyms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            AlleleETL.allele_xrefs_template, commit_size,
            "allele_xrefs_" + sub_type.get_data_provider() + ".csv"
        ],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Allele-{}: ".format(sub_type.get_data_provider()))

    logger.info("Finished Loading Allele Data: %s",
                sub_type.get_data_provider())
def _load_and_process_data(self):
    """Generate automated gene descriptions per MOD and load them.

    Sets up a genedescriptions DataManager with the GO and DO ontologies,
    then for each data provider loads GO/DO (and, where available,
    expression) annotations, generates descriptions, writes them to Neo4j,
    and saves the per-provider report files.
    """
    # create gene descriptions data manager and load common data
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    #go_onto_config = data_manager.get_config('GO')
    go_annot_config = data_manager.get_config('GAF')
    #do_onto_config = data_manager.get_config('DOID')
    # Map provider name -> GAF sub-type config so each MOD's annotation
    # file can be located below.
    go_annot_sub_dict = {sub.get_data_provider(): sub
                         for sub in go_annot_config.get_sub_type_objects()}
    this_dir = os.path.split(__file__)[0]
    gd_config = GenedescConfigParser(
        os.path.join(this_dir, os.pardir, os.pardir,
                     "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"])
    gd_data_manager.set_ontology(
        ontology_type=DataType.GO,
        ontology=self.get_ontology(data_type=DataType.GO),
        config=gd_config)
    gd_data_manager.set_ontology(
        ontology_type=DataType.DO,
        ontology=self.get_ontology(data_type=DataType.DO),
        config=gd_config)

    # generate descriptions for each MOD
    for prvdr in [sub_type.get_data_provider().upper()
                  for sub_type in self.data_type_config.get_sub_type_objects()]:
        # Per-MOD copy so WB's tweak below doesn't leak to other MODs.
        gd_config_mod_specific = copy.deepcopy(gd_config)
        if prvdr == "WB":
            gd_config_mod_specific.config["expression_sentences_options"][
                "remove_children_if_parent_is_present"] = True
        self.logger.info("Generating gene descriptions for %s", prvdr)
        # Human annotations are sourced under the RGD provider.
        data_provider = prvdr if prvdr != "HUMAN" else "RGD"
        json_desc_writer = DescriptionsWriter()
        go_annot_path = "file://" + os.path.join(
            os.getcwd(), "tmp",
            go_annot_sub_dict[prvdr].file_to_download)
        gd_data_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=go_annot_path,
            associations_cache_path=os.path.join(
                os.getcwd(), "tmp", "gd_cache",
                "go_annot_" + prvdr + ".gaf"),
            config=gd_config_mod_specific)
        gd_data_manager.set_associations(
            associations_type=DataType.DO,
            associations=self.get_disease_annotations_from_db(
                data_provider=data_provider,
                gd_data_manager=gd_data_manager,
                logger=self.logger),
            config=gd_config_mod_specific)
        # Expression-based sentences only for MODs with expression data.
        if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
            gd_data_manager.set_ontology(
                ontology_type=DataType.EXPR,
                ontology=self.get_ontology(data_type=DataType.EXPR,
                                           provider=prvdr),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(
                associations_type=DataType.EXPR,
                associations=self.get_expression_annotations_from_db(
                    data_provider=data_provider,
                    gd_data_manager=gd_data_manager,
                    logger=self.logger),
                config=gd_config_mod_specific)
        commit_size = self.data_type_config.get_neo4j_commit_size()
        generators = self.get_generators(prvdr, gd_data_manager,
                                         gd_config_mod_specific,
                                         json_desc_writer)
        query_template_list = [
            [self.gene_descriptions_query_template, commit_size,
             "genedescriptions_data_" + prvdr + ".csv"]
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.save_descriptions_report_files(
            data_provider=prvdr,
            json_desc_writer=json_desc_writer,
            context_info=context_info,
            gd_data_manager=gd_data_manager)