def load_genes_mod(self, batch_size, testObject, bgiName, loadFile):
    path = "tmp"
    S3File("mod-datadumps", loadFile, path).download()
    TARFile(path, loadFile).extract_all()
    gene_data = JSONFile().get_data(path + bgiName, 'BGI')
    gene_lists = BGIExt().get_data(gene_data, batch_size, testObject)
    return self.yield_gene_lists(gene_lists)
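# --- Illustrative usage sketch (not part of the loader) ---
# load_genes_mod() above chains the repo's helpers: download the MOD dump from the
# "mod-datadumps" S3 bucket, untar it, parse the BGI JSON, and batch the records via
# BGIExt; the allele and disease loaders further down follow the same pattern.
# Every name and file name below is a hypothetical placeholder, not a value from the source.
def example_load_genes(gene_etl, test_object):
    batch_size = 5000                     # assumed batch size
    bgi_name = "/EXAMPLE_BGI.json"        # hypothetical JSON member inside the tarball
    load_file = "EXAMPLE_1.0.0.tar.gz"    # hypothetical S3 key under mod-datadumps
    # yield_gene_lists() presumably yields batches of gene dictionaries, batch_size at a time.
    for gene_batch in gene_etl.load_genes_mod(batch_size, test_object, bgi_name, load_file):
        print(len(gene_batch), "genes in this batch")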
def get_data(self):
    path = "tmp"
    S3File("mod-datadumps/SO", "so_1.0.obo", path).download()
    so_data = TXTFile(path + "/so_1.0.obo").get_data()
    so_list = []

    # Walk the OBO file as (current, next) line pairs: an "id:" line for an SO term
    # is expected to be immediately followed by its "name:" line.
    for current_line, next_line in self.get_current_next(so_data):
        so_dataset = {}
        current_line = current_line.strip()
        key = (current_line.split(":")[0]).strip()
        if key == "id":
            value = (":".join(current_line.split(":")[1:])).strip()
            if not value.startswith('SO'):
                continue
            next_key = (next_line.split(":")[0]).strip()
            if next_key == "name":
                next_value = (":".join(next_line.split(":")[1:])).strip()
            else:
                sys.exit("FATAL ERROR: Expected SO name not found for %s" % value)
            so_dataset = {'id': value, 'name': next_value}
            so_list.append(so_dataset)
    return so_list
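# --- Illustrative sketch (assumption): how get_current_next() likely pairs lines ---
# The SO parser above walks the OBO file as (current, next) pairs so an "id:" line can be
# matched with the "name:" line that follows it. A minimal, self-contained version of that
# pairing is shown here using itertools; the real helper may be implemented differently.
from itertools import tee

def pairwise(lines):
    """Yield (current_line, next_line) tuples over an iterable of lines."""
    current, nxt = tee(lines)
    next(nxt, None)
    return zip(current, nxt)

sample = ["id: SO:0000001", "name: region", "id: SO:0000004", "name: interior_coding_exon"]
for current_line, next_line in pairwise(sample):
    if current_line.startswith("id:"):
        print(current_line.split(":", 1)[1].strip(), "->", next_line.split(":", 1)[1].strip())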
def load_allele_objects_mod(self, batch_size, testObject, alleleName, loadFile):
    path = "tmp"
    S3File("mod-datadumps", loadFile, path).download()
    TARFile(path, loadFile).extract_all()
    alleleData = JSONFile().get_data(path + alleleName, 'allele')
    alleleDict = AlleleExt().get_alleles(alleleData, batch_size, testObject)
    return alleleDict
def load_disease_allele_objects_mod(self, batch_size, testObject, diseaseName, loadFile, graph):
    path = "tmp"
    S3File("mod-datadumps", loadFile, path).download()
    TARFile(path, loadFile).extract_all()
    disease_data = JSONFile().get_data(path + diseaseName, 'disease')
    disease_dict = DiseaseAlleleExt().get_allele_disease_data(disease_data, batch_size, graph)
    return disease_dict
def get_data(self):
    # Grab the data (TODO validate).
    # Some of this algorithm is temporary.
    # e.g. Files from the submission system will arrive without the need for unzipping, etc.
    path = 'tmp'
    context_info = ContextInfo()
    if "SAVE_PATH" in context_info.env:
        if context_info.env["SAVE_PATH"]:
            path = context_info.env["SAVE_PATH"]
            if not os.path.exists(path):
                logger.info("Making temp file storage: %s" % path)
                os.makedirs(path)

    if self.filepath is not None:
        if not os.path.isfile(self.filepath):
            logger.debug("File to download: " + self.file_to_download)
            if self.file_to_download.startswith('http'):
                download_filename = os.path.basename(self.filepath)
                logger.debug("Download Name: " + download_filename)
                download_object = Download(path, self.file_to_download, download_filename)
                self.already_downloaded = download_object.get_downloaded_data_new()
            else:
                logger.debug("Downloading JSON File: " + self.file_to_download)
                self.already_downloaded = S3File(self.file_to_download, path).download_new()
            logger.debug("File already downloaded: %s" % self.already_downloaded)

            if self.file_to_download.endswith('tar.gz'):
                logger.debug("Extracting all files: %s" % self.file_to_download)
                tar_object = TARFile(path, self.file_to_download)
                tar_object.extract_all()

            # Check whether the file exists locally.
            # (os.path.isfile() never raises for a missing path, so test its return value.)
            if not os.path.isfile(self.filepath):
                logger.critical('No local copy of the specified file found!')
                logger.critical('Missing copy of %s for sub type: %s from data type: %s'
                                % (self.filepath, self.sub_data_type, self.data_type))
                logger.critical('Please check download functions or data source.')
                sys.exit(-1)
        else:
            logger.debug("File Path already downloaded: %s" % self.filepath)
    else:
        logger.debug("File Path is None not downloading")
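# --- Illustrative sketch (assumption): SAVE_PATH override used by get_data() above ---
# get_data() lets ContextInfo's environment redirect downloads away from the default "tmp"
# directory. This standalone sketch mimics that behaviour with os.environ only; the real
# ContextInfo helper may resolve its env dictionary differently.
import os

def resolve_download_path(default="tmp"):
    path = os.environ.get("SAVE_PATH") or default
    if not os.path.exists(path):
        os.makedirs(path)  # mirrors the loader's "Making temp file storage" step
    return path

print(resolve_download_path())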
def extract_go_annots_mod(self, geneAssociationFile, species, identifierPrefix, testObject):
    path = "tmp"
    S3File("mod-datadumps/GO/ANNOT", geneAssociationFile, path).download()
    go_annot_dict = {}
    go_annot_list = []

    with gzip.open(path + "/" + geneAssociationFile, 'rt') as file:
        reader = csv.reader(file, delimiter='\t')
        for line in reader:
            if line[0].startswith('!'):
                continue
            try:
                gene = identifierPrefix + line[1]
                go_id = line[4]
                dateProduced = line[14]
                dataProvider = line[15]
                if gene in go_annot_dict:
                    go_annot_dict[gene]['go_id'].append(go_id)
                else:
                    go_annot_dict[gene] = {
                        'gene_id': gene,
                        'go_id': [go_id],
                        'species': species,
                        'loadKey': dataProvider + "_" + dateProduced + "_" + "GAF",
                        'dataProvider': dataProvider,
                        'dateProduced': dateProduced
                    }
            except IndexError:
                continue

    # Convert the dictionary into a list of dictionaries for Neo4j.
    # Check for the use of testObject and only return test data if necessary.
    if testObject.using_test_data() is True:
        for annot in go_annot_dict.values():
            if testObject.check_for_test_id_entry(annot['gene_id']) is True:
                go_annot_list.append(annot)
                testObject.add_ontology_ids(annot['go_id'])
        return go_annot_list
    else:
        return list(go_annot_dict.values())
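# --- Illustrative sketch (not part of the loader): how extract_go_annots_mod() groups GAF rows ---
# Each gene accumulates a list of GO ids plus provenance (loadKey = "<dataProvider>_<dateProduced>_GAF").
# The rows and the "EXAMPLE" prefix below are made-up placeholders purely to show the grouping;
# only the column indices (1, 4, 14, 15) mirror what the function above reads.
def make_row(go_id):
    row = [""] * 16
    row[0], row[1], row[4], row[14], row[15] = "EXAMPLE", "0000001", go_id, "20240101", "ExampleMOD"
    return row

rows = [make_row("GO:0005575"), make_row("GO:0003674")]
go_annot_dict = {}
for line in rows:
    gene = "EXAMPLE:" + line[1]
    if gene in go_annot_dict:
        go_annot_dict[gene]['go_id'].append(line[4])
    else:
        go_annot_dict[gene] = {'gene_id': gene,
                               'go_id': [line[4]],
                               'loadKey': line[15] + "_" + line[14] + "_GAF"}
print(go_annot_dict)
# -> one record: gene_id 'EXAMPLE:0000001', go_id ['GO:0005575', 'GO:0003674'], loadKey 'ExampleMOD_20240101_GAF'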
def get_data(self): """get data""" # Grab the data (TODO validate). # Some of this algorithm is temporary. # e.g. Files from the submission system will arrive without the need for unzipping, etc. download_dir = 'tmp' if self.filepath is not None: if not os.path.isfile(self.filepath): self.logger.debug("File to download: %s", self.file_to_download) if self.file_to_download.startswith('http'): download_filename = os.path.basename(self.filepath) self.logger.debug("Download Name: %s", download_filename) download_object = Download(download_dir, self.file_to_download, download_filename) self.already_downloaded = download_object.is_data_downloaded() else: self.logger.debug("Downloading JSON File: %s", self.file_to_download) self.already_downloaded = S3File(self.file_to_download, download_dir).download_new() self.logger.debug("File already downloaded: %s", self.already_downloaded) if self.file_to_download.endswith('tar.gz'): self.logger.debug("Extracting all files: %s", self.file_to_download) tar_object = TARFile(download_dir, self.file_to_download) tar_object.extract_all() # Check whether the file exists locally. if self.filepath is not None: try: os.path.isfile(self.filepath) except (FileNotFoundError, IOError): self.logger.critical('No local copy of the specified file found!') self.logger.critical('Missing copy of %s for sub type: %s %s: %s', self.filepath, "from data_type", self.sub_data_type, self.data_type) self.logger.critical('Please check download functions or data source.') sys.exit(-1) else: self.logger.debug("File Path already downloaded: %s", (self.filepath)) else: self.logger.debug("File Path is None not downloading")
def get_data(self, testObject, filename, prefix):
    path = "tmp"
    S3File("mod-datadumps" + prefix, filename, path).download()
    o_data = TXTFile(path + "/" + filename).get_data()
    parsed_line = parseOBO(o_data)
    list_to_return = []

    # Convert each parsed obo term into a schema-friendly AGR dictionary.
    for line in parsed_line:
        isasWithoutNames = []
        o_syns = line.get('synonym')
        syns = []   # Neo4j needs an empty list rather than None for synonyms.
        xrefs = []  # Likewise for xrefs.
        complete_url = None
        xref_urls = []
        local_id = None
        defLinksProcessed = []
        defText = None
        defLinks = []
        subset = []
        newSubset = None
        definition = ""
        is_obsolete = "false"
        ident = line['id']
        term_prefix = ident.split(":")[0]  # the term's own prefix, e.g. "GO" or "DO"

        if o_syns is not None:
            if isinstance(o_syns, (list, tuple)):
                for syn in o_syns:
                    syn = syn.split("\"")[1].strip()
                    syns.append(syn)
            else:
                syn = o_syns.split("\"")[1].strip()
                syns.append(syn)

        o_xrefs = line.get('xref')
        if o_xrefs is not None:
            if isinstance(o_xrefs, (list, tuple)):
                for xrefId in o_xrefs:
                    if ":" in xrefId:
                        local_id = xrefId.split(":")[1].strip()
                        xref_prefix = xrefId.split(":")[0].strip()
                        complete_url = self.get_complete_url(local_id, xrefId)
                        xrefs.append(xrefId)
                        xref_urls.append({"oid": line['id'],
                                          "xrefId": xrefId,
                                          "local_id": local_id,
                                          "prefix": xref_prefix,
                                          "complete_url": complete_url})
            else:
                if ":" in o_xrefs:
                    local_id = o_xrefs.split(":")[1].strip()
                    xref_prefix = o_xrefs.split(":")[0].strip()
                    xrefs.append(o_xrefs)
                    complete_url = self.get_complete_url(local_id, o_xrefs)
                    xref_urls.append({"oid": line['id'],
                                      "xrefId": o_xrefs,
                                      "local_id": local_id,
                                      "prefix": xref_prefix,
                                      "complete_url": complete_url})

        o_is_as = line.get('is_a')
        if o_is_as is None:
            o_is_as = []
            isasWithoutNames = []
        else:
            if isinstance(o_is_as, (list, tuple)):
                for isa in o_is_as:
                    isaWithoutName = isa.split("!")[0].strip()
                    isasWithoutNames.append(isaWithoutName)
            else:
                isaWithoutName = o_is_as.split("!")[0].strip()
                isasWithoutNames.append(isaWithoutName)

        # The def line looks like: "definition text" [link, link, ...]
        definition = line.get('def')
        defLinks = ""
        defLinksProcessed = []
        if definition is None:
            definition = ""
        else:
            if "\"" in definition:
                defText = definition.split("\"")[1].strip()
                if "[" in definition.split("\"")[2].strip():
                    defLinks = definition.split("\"")[2].strip()
                    defLinks = defLinks.rstrip("]").replace("[", "")
                    defLinks = defLinks.replace("url:www", "http://www")
                    defLinks = defLinks.replace("url:", "")
                    defLinks = defLinks.replace("URL:", "")
                    defLinks = defLinks.replace("\\:", ":")
                    if "," in defLinks:
                        defLinks = defLinks.split(",")
                        for link in defLinks:
                            if link.strip().startswith("http"):
                                defLinksProcessed.append(link)
                    else:
                        if defLinks.strip().startswith("http"):
                            defLinksProcessed.append(defLinks)
            else:
                definition = defText
        if definition is None:
            definition = ""

        newSubset = line.get('subset')
        if isinstance(newSubset, (list, tuple)):
            subset = newSubset
        else:
            if newSubset is not None:
                subset.append(newSubset)

        is_obsolete = line.get('is_obsolete')
        if is_obsolete is None:
            is_obsolete = "false"

        dict_to_append = {
            'o_genes': [],
            'o_species': [],
            'name': line['name'],
            'o_synonyms': syns,
            'name_key': line['name'],
            'id': line['id'],
            'definition': definition,
            'isas': isasWithoutNames,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'xrefs': xrefs,
            # TODO: fix links to not be passed for each ontology load.
            'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?species=All&x=1&acc_id=' + line['id'] + '#annot',
            'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?species=All&x=1&acc_id=' + line['id'] + '#annot',
            'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id=' + line['id'] + '#annot',
            'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?species=Human&x=1&acc_id=' + line['id'] + '#annot',
            'mgi_link': 'http://www.informatics.jax.org/disease/' + line['id'],
            'wormbase_link': 'http://www.wormbase.org/resources/disease/' + line['id'],
            'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + line['id'],
            'zfin_link': 'https://zfin.org/' + line['id'],
            'oUrl': "http://www.disease-ontology.org/?id=" + line['id'],
            'oPrefix': term_prefix,
            'xref_urls': xref_urls,
            'defText': defText,
            'defLinksProcessed': defLinksProcessed,
            'oboFile': term_prefix,
            'href': 'http://amigo.geneontology.org/amigo/term/' + line['id'],
            'category': 'go',
            'o_type': line.get('namespace'),
        }
        list_to_return.append(dict_to_append)

    # if testObject.using_test_data() is True:
    #     filtered_dict = []
    #     for entry in list_to_return:
    #         if testObject.check_for_test_ontology_entry(entry['id']) is True:
    #             filtered_dict.append(entry)
    #     return filtered_dict
    # else:
    return list_to_return
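# --- Illustrative sketch (not part of the loader): how the "def:" value is split above ---
# An OBO def value looks like: "definition text" [link, link, ...]. The parser keeps the quoted
# text as defText and harvests any http links into defLinksProcessed. This standalone rerun of
# that splitting uses a made-up def string.
definition = '"A made-up ontology term definition." [url:http\\://example.org/terms/1, PMID:12345]'
defText = definition.split("\"")[1].strip()
defLinks = definition.split("\"")[2].strip().rstrip("]").replace("[", "")
defLinks = defLinks.replace("url:", "").replace("URL:", "").replace("\\:", ":")
links = [link.strip() for link in defLinks.split(",") if link.strip().startswith("http")]
print(defText)  # A made-up ontology term definition.
print(links)    # ['http://example.org/terms/1']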