def _process_sub_type(self, sub_type):
    """Load phenotype data for a single sub type.

    Reads the sub type's JSON file, builds the per-provider CSV/query
    plan (gene, allele, AGM, and PGE annotations), streams generator
    output to CSV, and queues the Neo4j query batch.

    BUGFIX: ``batch_size`` previously reused ``get_neo4j_commit_size()``
    (copy/paste); it now uses ``get_generator_batch_size()`` like every
    other loader in this file.
    """
    self.logger.info("Loading Phenotype Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Phenotype Data: %s", sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    data_provider = sub_type.get_data_provider()
    # Lazy %-args instead of eager concatenation for the log call.
    self.logger.info("subtype: %s", data_provider)

    # (template, commit size, csv file) — additional params are ignored.
    query_template_list = [
        [self.execute_gene_query_template, commit_size,
         "phenotype_gene_data_" + data_provider + ".csv"],
        [self.execute_allele_query_template, commit_size,
         "phenotype_allele_data_" + data_provider + ".csv"],
        [self.execute_agm_query_template, commit_size,
         "phenotype_agm_data_" + data_provider + ".csv"],
        [self.execute_pges_allele_query_template, commit_size,
         "phenotype_pges_allele_data_" + data_provider + ".csv"],
        [self.execute_pges_agm_query_template, commit_size,
         "phenotype_pges_agm_data_" + data_provider + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Phenotype-{}: ".format(sub_type.get_data_provider()))
def _process_sub_type(self, sub_type):
    """Run the transcript/exon load for one sub type.

    Builds the CSV/query plan for transcript alternate IDs, transcripts,
    chromosomes, genomic locations, and exons, then streams generator
    output to CSV and queues the Neo4j batch.
    """
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Transcript Data: %s", provider)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    filepath = sub_type.get_filepath()

    # Each entry is (template, commit size, csv file); extras are ignored.
    query_template_list = [
        [self.transcript_alternate_id_query_template, commit_size,
         "transcript_gff3ID_data_" + provider + ".csv"],
        [self.transcript_query_template, commit_size,
         "transcript_data_" + provider + ".csv"],
        [self.chromosomes_query_template, commit_size,
         "transcript_data_chromosome_" + provider + ".csv"],
        [self.genomic_locations_query_template, commit_size,
         "transcript_genomic_locations_" + provider + ".csv"],
        [self.exon_query_template, commit_size,
         "exon_data_" + provider + ".csv"],
        [self.exon_genomic_locations_template, commit_size,
         "exon_genomic_location_data_" + provider + ".csv"],
    ]

    generators = self.get_generators(filepath, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type, query_tracking_list):
    """Load GO annotation (GAF) data for one sub type.

    Opens the downloaded GAF file, streams it through the generators
    into a CSV, and appends the resulting query/file pairs to
    ``query_tracking_list`` for later execution by the caller.

    BUGFIX: the input file handle was opened and never closed (resource
    leak); it is now managed with a ``with`` block that stays open while
    the generators are consumed.
    """
    self.logger.info("Loading GOAnnot Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_file_to_download()
    filepath = os.path.join('tmp/', filepath)
    self.logger.info("goannot path: %s", filepath)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    with open(filepath, "r") as file:
        self.logger.info("Finished Loading GOAnnot Data: %s", sub_type.get_data_provider())

        # This order is the same as the lists yielded from get_generators.
        generators = self.get_generators(
            file,
            ETLHelper.go_annot_prefix_lookup(sub_type.get_data_provider()),
            batch_size)

        query_template_list = [
            [
                self.main_query_template, commit_size,
                "go_annot_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        # The file must remain open here: save_file_static drives the
        # generators that read from it.
        CSVTransactor.save_file_static(generators, query_and_file_list)

    for item in query_and_file_list:
        query_tracking_list.append(item)
def _process_sub_type(self, sub_type):
    """Build the is_a/part_of transitive closure for one ontology provider."""
    data_provider = sub_type.get_data_provider()
    self.logger.info(data_provider)

    # Internally the disease ontology is keyed 'DO', not 'DOID'.
    if data_provider == 'DOID':
        data_provider = 'DO'

    self.logger.debug("Starting isa_partof_ Closure for: %s", data_provider)

    query_list = [
        [
            self.insert_isa_partof_closure_query_template,
            "100000",
            "isa_partof_closure_" + data_provider + ".csv",
            data_provider,
            data_provider,
        ],
    ]

    generators = self.get_closure_terms(data_provider)
    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Closure-{}: ".format(data_provider))
    self.logger.debug("Finished isa_partof Closure for: %s", data_provider)
def _process_sub_type(self, sub_type):
    """Load ECO mapping ontology data for a single sub type."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading ECOMAP Ontology Data: %s", provider)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    filepath = sub_type.get_filepath()

    # (template, commit size, csv file) — further params are ignored.
    query_template_list = [
        [self.eco_query_template, commit_size,
         "ecomap_data_" + provider + ".csv"],
    ]

    generators = self.get_generators(filepath, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Ecomap-{}: ".format(provider))
    self.logger.info("Finished Loading ECOMAP Data: %s", provider)
def _load_and_process_data(self):
    """Load the DO ontology: terms, is_a edges, synonyms, xrefs, alt IDs."""
    filepath = self.data_type_config.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    generators = self.get_generators(filepath, batch_size)

    query_template_list = [
        [self.do_query_template, commit_size, "do_term_data.csv"],
        [self.doterm_isas_query_template, commit_size, "do_isas_data.csv"],
        [self.doterm_synonyms_query_template, commit_size, "do_synonyms_data.csv"],
        [self.xrefs_query_template, commit_size, "do_xrefs_data.csv"],
        [self.doterm_alt_ids_query_template, commit_size, "do_alt_ids_data.csv"],
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("DO-?: ")
def _load_and_process_data(self):
    """Create GEO cross references for every configured sub type."""
    for sub_type in self.data_type_config.get_sub_type_objects():
        provider = sub_type.get_data_provider()
        species_encoded = urllib.parse.quote_plus(
            ETLHelper.species_lookup_by_data_provider(provider))
        commit_size = self.data_type_config.get_neo4j_commit_size()
        # Batch size is pinned rather than taken from the config:
        # batch_size = self.data_type_config.get_generator_batch_size()
        batch_size = 100000

        generators = self.get_generators(sub_type, batch_size, species_encoded)
        query_template_list = [
            [self.geo_xref_query_template, commit_size,
             "geo_xref_data_" + provider + ".csv"],
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load AGM / sequence targeting reagent data for one sub type."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Sequence Targeting Reagent Data: %s", provider)
    filepath = sub_type.get_filepath()
    self.logger.info(filepath)
    data = JSONFile().get_data(filepath)
    self.logger.info(
        "Finished Loading Sequence Targeting Reagent Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    # This order matches the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, commit size, csv file) — extra params are ignored.
    query_template_list = [
        [self.agm_query_template, commit_size,
         "agm_data_" + provider + ".csv"],
        [self.agm_secondary_ids_query_template, commit_size,
         "agm_secondary_ids_" + provider + ".csv"],
        [self.agm_synonyms_query_template, commit_size,
         "agm_synonyms_" + provider + ".csv"],
        [self.agm_components_query_template, commit_size,
         "agm_components_" + provider + ".csv"],
        [self.agm_sqtrs_query_template, commit_size,
         "agm_sqtrs_" + provider + ".csv"],
        [self.agm_backgrounds_query_template, commit_size,
         "agm_backgrounds_" + provider + ".csv"],
    ]

    generators = self.get_generators(data, provider, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("AGM-{}: ".format(provider))
def _load_and_process_data(self):
    """Load MI ontology terms from the configured single file."""
    filepath = self.data_type_config.get_single_filepath()
    generators = self.get_generators(filepath)

    query_template_list = [
        [self.main_query_template, 10000, "mi_term_data.csv"],
    ]
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _load_and_process_data(self):
    """Load species records from the agr_schemas species.yaml on GitHub."""
    filepath = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
    generators = self.get_generators(filepath)

    query_template_list = [
        [self.main_query_template, 10000, "species_data.csv"],
    ]
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load HTP meta dataset data for one sub type.

    FIXES: ``logger.warn`` is deprecated (replaced with
    ``logger.warning``), and log calls now use lazy %-style arguments
    instead of eager string interpolation.
    """
    logger.info("Loading HTP metadata Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata Data: %s", sub_type.get_data_provider())

    if data is None:
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, param1, param2) — other entries are ignored.
    query_list = [
        [HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
         "htp_metadataset_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_category_tags_query_template, commit_size,
         "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
         "htp_metadataset_publications_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
         "htp_metadataset_xrefs_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
         "htp_metadataset_secondaryIds_" + sub_type.get_data_provider() + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _load_and_process_data(self):
    """Attach ribbonless expression bio-entities (Expression Ribbon data)."""
    self.logger.info("Starting Expression Ribbon Data")

    query_template_list = [
        [self.insert_ribonless_ebes_query_template, "30000",
         "expression_ribbonless_ebes" + ".csv"],
    ]
    generators = self.get_ribbon_terms()
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)

    self.logger.info("Finished Expression Ribbon Data")
def _process_sub_type(self, sub_type):
    """Load a generic ontology (terms, is_a, part_of, synonyms, alt IDs)."""
    self.logger.info("Loading Generic Ontology Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()

    # This order matches the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    ont_type = sub_type.get_data_provider()

    # (template, commit size, csv file, extra params...) — terms and
    # synonyms use larger fixed commit sizes than the configured one.
    query_template_list = [
        [self.generic_ontology_term_query_template, 600000,
         "generic_ontology_term_" + ont_type + ".csv", ont_type],
        [self.generic_ontology_isas_query_template, commit_size,
         "generic_ontology_isas_" + ont_type + ".csv", ont_type, ont_type],
        [self.generic_ontology_partofs_query_template, commit_size,
         "generic_ontology_partofs_" + ont_type + ".csv", ont_type, ont_type],
        [self.generic_ontology_synonyms_query_template, 400000,
         "generic_ontology_synonyms_" + ont_type + ".csv", ont_type],
        [self.generic_ontology_altids_query_template, commit_size,
         "generic_ontology_altids_" + ont_type + ".csv", ont_type],
    ]

    generators = self.get_generators(filepath, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.logger.info("Finished Loading Generic Ontology Data: %s",
                     sub_type.get_data_provider())
def _process_sub_type(self, sub_type, sub_types, query_tracking_list):
    """Load orthology data for one MOD against every other MOD.

    Appends the resulting query/file pairs to ``query_tracking_list``
    for the caller to execute later.
    """
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Orthology Data: %s", provider)
    filepath = sub_type.get_filepath()
    # data = JSONFile().get_data(filepath)
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    generators = self.get_generators(filepath, provider, sub_types, batch_size)

    # One pairwise query per *other* MOD, then the algorithm-status queries.
    query_template_list = [
        [self.main_query_template, "100000",
         "orthology_data_" + provider + "_" + mod_sub_type + ".csv"]
        for mod_sub_type in sub_types
        if mod_sub_type != provider
    ]
    query_template_list.append([
        self.matched_algorithm_query_template, commit_size,
        "orthology_matched_algorithm_data_{}.csv".format(provider)
    ])
    query_template_list.append([
        self.not_matched_algorithm_query_template, commit_size,
        "orthology_not_matched_algorithm_data_" + provider + ".csv"
    ])
    query_template_list.append([
        self.not_called_algorithm_query_template, commit_size,
        "orthology_not_called_algorithm_data_" + provider + ".csv"
    ])

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    for item in query_and_file_list:
        query_tracking_list.append(item)
    self.error_messages("Ortho-{}: ".format(provider))
    self.logger.info("Finished Loading Orthology Data: %s", provider)
def _process_sub_type(self, sub_type):
    """Stub/template loader: run the single stub query for a sub type."""
    filepath = sub_type.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    generators = self.get_generators(filepath, batch_size)

    query_template_list = [
        [self.query_template, commit_size, "stub_data.csv"],
    ]
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load variant data (variants, locations, SO terms, xrefs)."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Variation Data: %s", provider)
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Variation Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    # This order matches the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, commit size, csv file) — extra params are ignored.
    query_template_list = [
        [self.variation_query_template, commit_size,
         "variation_data_" + provider + ".csv"],
        [self.genomic_locations_query_template, commit_size,
         "variant_genomiclocations_" + provider + ".csv"],
        [self.so_terms_query_template, commit_size,
         "variant_so_terms_" + provider + ".csv"],
        [self.xrefs_query_template, commit_size,
         "variant_xrefs_" + provider + ".csv"],
    ]

    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Var-{}: ".format(provider))
def _process_sub_type(self, sub_type):
    """Load VEP transcript consequence data for one sub type."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading VEP Data: %s", provider)
    commit_size = self.data_type_config.get_neo4j_commit_size()
    filepath = sub_type.get_filepath()

    # (template, commit size, csv file) — extra params are ignored.
    query_template_list = [
        [self.vep_transcript_query_template, commit_size,
         "vep_transcript_data_" + provider + ".csv"],
    ]

    generators = self.get_generators(filepath)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, subtype):
    """Infer gene-disease annotations via orthology and load them."""
    self.logger.info("Starting Gene Disease Ortho Data: %s", subtype)

    query_template_list = [
        [self.insert_gene_disease_ortho_query_template, "10000",
         "gene_disease_by_orthology.csv"],
    ]
    self.logger.info("gene disease ortho pub created")

    generators = self.retrieve_gene_disease_ortho()
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.logger.info("Finished Gene Disease Ortho Data")
def _process_sub_type(self, sub_type):
    """Load disease annotation data for one sub type.

    Covers allele/gene/AGM annotations, primary-genetic-entity (PGE)
    links, 'with' evidence, evidence codes, and annotation xrefs.
    """
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Disease Data: %s", provider)
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Disease Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, commit size, csv file) — extra params are ignored.
    query_template_list = [
        [self.execute_allele_query_template, commit_size,
         "disease_allele_data_" + provider + ".csv"],
        [self.execute_gene_query_template, commit_size,
         "disease_gene_data_" + provider + ".csv"],
        [self.execute_agms_query_template, commit_size,
         "disease_agms_data_" + provider + ".csv"],
        [self.execute_pges_gene_query_template, commit_size,
         "disease_pges_gene_data_" + provider + ".csv"],
        [self.execute_pges_allele_query_template, commit_size,
         "disease_pges_allele_data_" + provider + ".csv"],
        [self.execute_pges_agm_query_template, commit_size,
         "disease_pges_agms_data_" + provider + ".csv"],
        [self.execute_withs_query_template, commit_size,
         "disease_withs_data_" + provider + ".csv"],
        [self.execute_ecode_query_template, commit_size,
         "disease_evidence_code_data_" + provider + ".csv"],
        [self.execute_annotation_xrefs_query_template, commit_size,
         "disease_annotation_xrefs_data_" + provider + ".csv"],
    ]

    generators = self.get_generators(data, batch_size, provider)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Disease-{}: ".format(provider))
    self.logger.info("Finished Loading Disease Data: %s", provider)
def _load_and_process_data(self):
    """Load the GO ontology: terms, is_a/part_of, synonyms, regulates
    edges, and secondary IDs.

    CONSISTENCY FIX: ``error_messages`` was called with no prefix,
    unlike every other loader in this file (cf. the DO loader's
    ``"DO-?: "``); it now passes a matching ``"GO-?: "`` prefix.
    """
    filepath = self.data_type_config.get_single_filepath()
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    generators = self.get_generators(filepath, batch_size)

    query_template_list = [
        [self.main_query_template, commit_size, "go_term_data.csv"],
        [self.goterm_isas_query_template, commit_size, "go_isas_data.csv"],
        [self.goterm_partofs_query_template, commit_size, "go_partofs_data.csv"],
        [self.goterm_synonyms_query_template, commit_size, "go_synonym_data.csv"],
        [self.goterm_regulates_query_template, commit_size, "go_regulates_data.csv"],
        [self.goterm_negatively_regulates_query_template, commit_size,
         "go_negatively_regulates_data.csv"],
        [self.goterm_positively_regulates_query_template, commit_size,
         "go_positively_regulates_data.csv"],
        [self.goterm_secondary_query_template, commit_size,
         "goterm_secondary_data.csv"],
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("GO-?: ")
def _load_and_process_data(self):
    """Load molecular interaction data and its (MOD) cross references."""
    # filepath = self.data_type_config.get_single_filepath()
    # Temporary fix for 3.0 release.
    filepath = 'tmp/alliance_molecular_interactions.tsv'
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    generators = self.get_generators(filepath, batch_size)

    query_template_list = [
        [self.main_query_template, commit_size, "mol_int_data.csv"],
        [self.xref_query_template, commit_size, "mol_int_xref.csv"],
        [self.mod_xref_query_template, commit_size, "mol_int_mod_xref.csv"],
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
    """Add Expression Atlas cross references for one data provider."""
    data_provider = sub_type.get_data_provider()
    expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
        sub_type, data_provider, ensg_to_gene_primary_id_map)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()
    generators = self.get_generators(
        expression_atlas_gene_pages, data_provider, batch_size)

    query_template_list = [
        [self.add_expression_atlas_crossreferences_query_template,
         commit_size,
         "expression_atlas_" + data_provider + "_data.csv"],
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("ExpAtlas-{}: ".format(sub_type.get_data_provider()))
def _process_sub_type(self, sub_type):
    """Load allele data for one sub type.

    Covers alleles in all gene/construct combinations plus their
    secondary IDs, synonyms, and cross references.

    FIXES: ``logger.warn`` is deprecated (replaced with
    ``logger.warning``), and log calls now use lazy %-style arguments
    instead of eager string interpolation.
    """
    logger.info("Loading Allele Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)

    if data is None:
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, param1, param2) — other entries are ignored.
    query_list = [
        [AlleleETL.allele_gene_no_construct_query_template, commit_size,
         "allele_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_construct_gene_query_template, commit_size,
         "allele_construct_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_construct_no_gene_query_template, commit_size,
         "allele_construct_no_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
         "allele_no_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_secondaryids_template, commit_size,
         "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_synonyms_template, commit_size,
         "allele_synonyms_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_xrefs_template, commit_size,
         "allele_xrefs_" + sub_type.get_data_provider() + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Allele-{}: ".format(sub_type.get_data_provider()))
    logger.info("Finished Loading Allele Data: %s", sub_type.get_data_provider())
def _process_sub_type(self, sub_type):
    """Load HTP meta dataset sample data for one sub type.

    FIXES: ``logger.warn`` is deprecated (replaced with
    ``logger.warning``), and log calls now use lazy %-style arguments
    instead of eager string interpolation. All CSV filenames are kept
    byte-identical to the originals.
    """
    logger.info("Loading HTP metadata sample data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata sample data: %s", sub_type.get_data_provider())

    if data is None:
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # (template, param1, param2) — other entries are ignored.
    query_list = [
        [HTPMetaDatasetSampleETL.htp_dataset_sample_query_template, commit_size,
         "htp_metadataset_sample_samples_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template, commit_size,
         "htp_metadataset_sample_bioentities_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_secondaryIds_query_template, commit_size,
         "htp_metadataset_sample_secondaryIds_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_join_query_template, commit_size,
         "htp_metadataset_sample_datasets_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
         "htp_metadataset_sample_stages_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
         "htp_metadataset_sample_aoterms_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.ao_substructures_query_template, commit_size,
         "htp_metadataset_sample_ao_substructures_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.ao_qualifiers_query_template, commit_size,
         "htp_metadataset_sample_ao_qualifiers_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template, commit_size,
         "htp_metadataset_sample_ao_ss_qualifiers_" + sub_type.get_data_provider() + ".csv"],
        # NOTE(review): this filename lacks the '_' separator before the
        # provider, unlike its siblings — left as-is to avoid changing
        # output paths; confirm whether intentional.
        [HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
         "htp_metadataset_sample_ccterms" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.ccq_expression_query_template, commit_size,
         "htp_metadataset_sample_ccqterms_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
         "htp_metadataset_sample_uberon_ao_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.uberon_ao_other_query_template, commit_size,
         "htp_metadataset_sample_uberon_ao_other_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template, commit_size,
         "htp_metadataset_sample_agms_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template, commit_size,
         "htp_metadataset_sample_agmstext_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template, commit_size,
         "htp_metadataset_sample_assemblies_" + sub_type.get_data_provider() + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _load_and_process_data(self):
    """Generate and load automated gene descriptions for each MOD.

    Builds a genedescriptions DataManager with the GO and DO ontologies,
    then, per data provider: loads GO/DO (and, where available,
    expression) annotations, generates descriptions, writes them to CSV,
    queues the Neo4j load, and saves the per-MOD report files.
    """
    # create gene descriptions data manager and load common data
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    #go_onto_config = data_manager.get_config('GO')
    go_annot_config = data_manager.get_config('GAF')
    #do_onto_config = data_manager.get_config('DOID')
    # Map each GAF sub type by its data provider for GAF path lookup below.
    go_annot_sub_dict = {sub.get_data_provider(): sub
                         for sub in go_annot_config.get_sub_type_objects()}
    this_dir = os.path.split(__file__)[0]
    # Shared genedescriptions config, two directories above this module.
    gd_config = GenedescConfigParser(os.path.join(this_dir, os.pardir,
                                                  os.pardir,
                                                  "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"])
    gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                 ontology=self.get_ontology(data_type=DataType.GO),
                                 config=gd_config)
    gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                 ontology=self.get_ontology(data_type=DataType.DO),
                                 config=gd_config)
    # generate descriptions for each MOD
    for prvdr in [sub_type.get_data_provider().upper() \
                  for sub_type in self.data_type_config.get_sub_type_objects()]:
        # Per-MOD copy so provider-specific tweaks don't leak across MODs.
        gd_config_mod_specific = copy.deepcopy(gd_config)
        if prvdr == "WB":
            gd_config_mod_specific.config["expression_sentences_options"][
                "remove_children_if_parent_is_present"] = True
        self.logger.info("Generating gene descriptions for %s", prvdr)
        # Human disease annotations are sourced via RGD.
        data_provider = prvdr if prvdr != "HUMAN" else "RGD"
        json_desc_writer = DescriptionsWriter()
        # Local file:// URL of the previously downloaded GAF for this MOD.
        go_annot_path = "file://" + os.path.join(
            os.getcwd(), "tmp", go_annot_sub_dict[prvdr].file_to_download)
        gd_data_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=go_annot_path,
            associations_cache_path=os.path.join(
                os.getcwd(), "tmp", "gd_cache", "go_annot_" + prvdr + ".gaf"),
            config=gd_config_mod_specific)
        # Disease annotations come straight from the graph database.
        gd_data_manager.set_associations(
            associations_type=DataType.DO,
            associations=self.get_disease_annotations_from_db(
                data_provider=data_provider,
                gd_data_manager=gd_data_manager,
                logger=self.logger),
            config=gd_config_mod_specific)
        # Expression data only exists for providers in this map.
        if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
            gd_data_manager.set_ontology(
                ontology_type=DataType.EXPR,
                ontology=self.get_ontology(data_type=DataType.EXPR,
                                           provider=prvdr),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(
                associations_type=DataType.EXPR,
                associations=self.get_expression_annotations_from_db(
                    data_provider=data_provider,
                    gd_data_manager=gd_data_manager,
                    logger=self.logger),
                config=gd_config_mod_specific)
        commit_size = self.data_type_config.get_neo4j_commit_size()
        generators = self.get_generators(prvdr, gd_data_manager,
                                         gd_config_mod_specific,
                                         json_desc_writer)
        query_template_list = [
            [self.gene_descriptions_query_template, commit_size,
             "genedescriptions_data_" + prvdr + ".csv"]
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.save_descriptions_report_files(
            data_provider=prvdr,
            json_desc_writer=json_desc_writer,
            context_info=context_info,
            gd_data_manager=gd_data_manager)
def _process_sub_type(self, sub_type, query_tracking_list):
    """Load BGI (Basic Gene Information) data for one sub-type.

    Parses the sub-type's JSON file, writes one CSV per query template,
    and appends the resulting (query, file) entries to
    ``query_tracking_list`` for later batched execution by the caller.

    Exits the process (non-zero) when the input file cannot be found,
    since BGI data is a prerequisite for the rest of the load.
    """
    provider = sub_type.get_data_provider()
    self.logger.info("Loading BGI Data: %s", provider)
    filepath = sub_type.get_filepath()
    if filepath is None:
        self.logger.error("Can't find input file for %s", sub_type)
        # Bug fix: sys.exit() with no argument exits with status 0
        # (success); a missing input file is a failure, so exit 1.
        sys.exit(1)
    data = JSONFile().get_data(filepath)

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This order is the same as the lists yielded from the get_generators
    # function: gene_metadata, gene_dataset, secondary_ids,
    # genomic_locations, cross_references, synonyms.
    # Each entry must be (template, commit_size, filename); any extra
    # elements are ignored by process_query_params.
    query_template_list = [
        [self.gene_metadata_query_template, commit_size,
         "gene_metadata_" + provider + ".csv"],
        [self.gene_query_template, commit_size,
         "gene_data_" + provider + ".csv"],
        [self.basic_gene_load_relations_query_template, commit_size,
         "gene_data_load_" + provider + ".csv"],
        [self.basic_gene_species_relations_query_template, commit_size,
         "gene_data_species_" + provider + ".csv"],
        [self.so_terms_query_template, commit_size,
         "gene_so_terms_" + provider + ".csv"],
        [self.chromosomes_query_template, commit_size,
         "gene_chromosomes_" + provider + ".csv"],
        [self.gene_secondary_ids_query_template, commit_size,
         "gene_secondary_ids_" + provider + ".csv"],
        [self.genomic_locations_query_template, commit_size,
         "gene_genomic_locations_" + provider + ".csv"],
        [self.xrefs_query_template, commit_size,
         "gene_cross_references_" + provider + ".csv"],
        [self.xrefs_relationships_query_template, commit_size,
         "gene_cross_references_relationships_" + provider + ".csv"],
        # NOTE(review): synonyms deliberately use a large fixed commit
        # size instead of the configured one — presumably a bulk-load
        # tuning choice; confirm before changing.
        [self.gene_synonyms_query_template, 600000,
         "gene_synonyms_" + provider + ".csv"]
    ]

    # Obtain the generator and write the CSVs.
    generators = self.get_generators(data, provider, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    # Queries are executed later by the caller, not here.
    query_tracking_list.extend(query_and_file_list)

    self.error_messages("BGI-{}: ".format(provider))
    self.logger.info("Finished Loading BGI Data: %s", provider)
def _process_sub_type(self, sub_type, query_tracking_list):
    """Load Expression data for one sub-type.

    Writes one CSV per query template from the sub-type's data file and
    appends the resulting (query, file) entries to ``query_tracking_list``
    for later batched execution by the caller. Skips (with a warning)
    when the sub-type has no data file.
    """
    data_provider = sub_type.get_data_provider()
    self.logger.info("Loading Expression Data: %s", data_provider)
    data_file = sub_type.get_filepath()
    if data_file is None:
        self.logger.warning("No Data found for %s skipping", data_provider)
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Each entry must be (template, commit_size, filename); any extra
    # elements are ignored by process_query_params.
    query_template_list = [
        [self.bio_entity_expression_query_template, commit_size,
         "expression_entities_" + data_provider + ".csv"],
        [self.bio_entity_gene_ao_query_template, commit_size,
         "expression_gene_ao_" + data_provider + ".csv"],
        [self.bio_entity_gene_expression_join_query_template, commit_size,
         "expression_entity_joins_" + data_provider + ".csv"],
        [self.ao_expression_query_template, commit_size,
         "expression_ao_expression_" + data_provider + ".csv"]
    ]

    # SGD cellular-component expression uses a provider-specific query.
    if data_provider == 'SGD':
        query_template_list.append(
            [self.sgd_cc_expression_query_template, commit_size,
             "expression_SGD_cc_expression_" + data_provider + ".csv"])
    else:
        query_template_list.append(
            [self.cc_expression_query_template, commit_size,
             "expression_cc_expression_" + data_provider + ".csv"])

    query_template_list += [
        [self.ao_cc_expression_query_template, commit_size,
         "expression_ao_cc_expression_" + data_provider + ".csv"],
        [self.eas_qualified_query_template, commit_size,
         "expression_eas_qualified_" + data_provider + ".csv"],
        [self.eas_substructure_query_template, commit_size,
         "expression_eas_substructure_" + data_provider + ".csv"],
        [self.eass_qualified_query_template, commit_size,
         "expression_eass_qualified_" + data_provider + ".csv"],
        [self.ccq_expression_query_template, commit_size,
         "expression_ccq_expression_" + data_provider + ".csv"],
        [self.stage_expression_query_template, commit_size,
         "expression_stage_expression_" + data_provider + ".csv"],
        [self.uberon_stage_query_template, commit_size,
         "expression_uberon_stage_" + data_provider + ".csv"],
        [self.uberon_ao_query_template, commit_size,
         "expression_uberon_ao_" + data_provider + ".csv"],
        [self.uberon_ao_other_query_template, commit_size,
         "expression_uberon_ao_other_" + data_provider + ".csv"],
        [self.uberon_stage_other_query_template, commit_size,
         "expression_uberon_stage_other_" + data_provider + ".csv"],
        [self.xrefs_query_template, commit_size,
         "expression_cross_references_" + data_provider + ".csv"],
        [self.add_pubs_query_template, commit_size,
         "expression_add_pubs_" + data_provider + ".csv"]
    ]

    # Obtain the generator and write the CSVs.
    generators = self.get_generators(data_file, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    # Queries are executed later by the caller, not here.
    query_tracking_list.extend(query_and_file_list)

    self.logger.info("Finished Loading Expression Data: %s", data_provider)
def _process_sub_type(self, sub_type):
    """Load Construct data for one sub-type.

    Parses the sub-type's JSON file, writes one CSV per query template,
    and executes the queries as a batch. Skips (with a warning) when the
    file yields no data.
    """
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Construct Data: %s", provider)
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    # "Finished Loading" here refers to parsing the input file; Neo4j
    # loading happens below.
    self.logger.info("Finished Loading Construct Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    # This order is the same as the lists yielded from the get_generators
    # function. A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Each entry must be (template, commit_size, filename); any extra
    # elements are ignored by process_query_params.
    query_template_list = [
        [ConstructETL.construct_query_template, commit_size,
         "Construct_data_" + provider + ".csv"],
        [ConstructETL.construct_secondary_ids_query_template, commit_size,
         "Construct_secondary_ids_" + provider + ".csv"],
        [ConstructETL.construct_synonyms_query_template, commit_size,
         "Construct_synonyms_" + provider + ".csv"],
        [ConstructETL.construct_xrefs_query_template, commit_size,
         "Construct_xrefs_" + provider + ".csv"],
        [ConstructETL.non_bgi_component_query_template, commit_size,
         "Construct_non_bgi_component_" + provider + ".csv"],
        # Fix: the two component filenames were missing the "_" separator
        # before the provider code (e.g. "Construct_components_geneMGI.csv"),
        # unlike every other generated CSV name.
        [ConstructETL.construct_gene_component_query_template, commit_size,
         "Construct_components_gene_" + provider + ".csv"],
        [ConstructETL.construct_no_gene_component_query_template, commit_size,
         "Construct_components_no_gene_" + provider + ".csv"]
    ]

    # Obtain the generator, write the CSVs, then run the queries.
    generators = self.get_generators(data, provider, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)