Example #1
 def load_genes_mod(self, batch_size, testObject, bgiName, loadFile):
     # Download the MOD data dump from S3 and extract it into a temp directory.
     path = "tmp"
     S3File("mod-datadumps", loadFile, path).download()
     TARFile(path, loadFile).extract_all()
     # Parse the BGI (basic gene information) JSON and hand it back in batches.
     gene_data = JSONFile().get_data(path + bgiName, 'BGI')
     gene_lists = BGIExt().get_data(gene_data, batch_size, testObject)
     return self.yield_gene_lists(gene_lists)
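Note: yield_gene_lists is not shown in this example. A minimal sketch of what such a generator could look like, purely as a hypothetical reconstruction:

    def yield_gene_lists(self, gene_lists):
        # Hypothetical helper: hand the batched gene dictionaries to the
        # caller one batch at a time instead of materializing them all.
        for gene_list in gene_lists:
            yield gene_list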
Example #2
    def get_data(self):
        path = "tmp"
        S3File("mod-datadumps/SO", "so_1.0.obo", path).download()
        so_data = TXTFile(path + "/so_1.0.obo").get_data()

        so_list = []

        # Walk the OBO file, pairing each line with the one that follows so an
        # "id" line can be matched against the "name" line expected after it.
        for current_line, next_line in self.get_current_next(so_data):
            current_line = current_line.strip()
            key = current_line.split(":")[0].strip()
            if key == "id":
                value = ":".join(current_line.split(":")[1:]).strip()
                if not value.startswith('SO'):
                    continue
                next_key = next_line.split(":")[0].strip()
                if next_key == "name":
                    next_value = ":".join(next_line.split(":")[1:]).strip()
                else:
                    sys.exit("FATAL ERROR: Expected SO name not found for %s" %
                             (value))
                so_list.append({'id': value, 'name': next_value})
        return so_list
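Note: the get_current_next helper used above is not included in the example; based only on how it is called, it pairs each line with the line that follows. A minimal look-ahead iterator under that assumption:

    from itertools import tee

    def get_current_next(self, lines):
        # Pair each line with its successor so an "id" line can be matched
        # with the "name" line expected to follow it.
        current, following = tee(lines)
        next(following, None)
        return zip(current, following)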
Example #3
    def load_allele_objects_mod(self, batch_size, testObject, alleleName,
                                loadFile):
        path = "tmp"
        S3File("mod-datadumps", loadFile, path).download()
        TARFile(path, loadFile).extract_all()
        alleleData = JSONFile().get_data(path + alleleName, 'allele')
        alleleDict = AlleleExt().get_alleles(alleleData, batch_size,
                                             testObject)

        return alleleDict
Example #4
    def load_disease_allele_objects_mod(self, batch_size, testObject,
                                        diseaseName, loadFile, graph):
        path = "tmp"
        S3File("mod-datadumps", loadFile, path).download()
        TARFile(path, loadFile).extract_all()
        disease_data = JSONFile().get_data(path + diseaseName, 'disease')
        disease_dict = DiseaseAlleleExt().get_allele_disease_data(
            disease_data, batch_size, graph)

        return disease_dict
Example #5
    def get_data(self):
        # Grab the data (TODO validate).
        # Some of this algorithm is temporary.
        # e.g. Files from the submission system will arrive without the need for unzipping, etc.
        path = 'tmp'
        context_info = ContextInfo()
        if "SAVE_PATH" in context_info.env:
            if context_info.env["SAVE_PATH"]:
                path = context_info.env["SAVE_PATH"]
                if not os.path.exists(path):
                    logger.info("Making temp file storage: %s" % (path))
                    os.makedirs(path)

        if self.filepath is not None:
            if not os.path.isfile(self.filepath):
                logger.debug("File to download: " + self.file_to_download)
                if self.file_to_download.startswith('http'):
                    download_filename = os.path.basename(self.filepath)
                    logger.debug("Download Name: " + download_filename)
                    download_object = Download(path, self.file_to_download,
                                               download_filename)
                    self.already_downloaded = download_object.get_downloaded_data_new()
                else:
                    logger.debug("Downloading JSON File: " +
                                 self.file_to_download)
                    self.already_downloaded = S3File(self.file_to_download,
                                                     path).download_new()
                    logger.debug("File already downloaded: %s" %
                                 (self.already_downloaded))
                    if self.file_to_download.endswith('tar.gz'):
                        logger.debug("Extracting all files: %s" %
                                     (self.file_to_download))
                        tar_object = TARFile(path, self.file_to_download)
                        tar_object.extract_all()
                # Check whether the downloaded file now exists locally.
                if not os.path.isfile(self.filepath):
                    logger.critical(
                        'No local copy of the specified file found!')
                    logger.critical(
                        'Missing copy of %s for sub type: %s from data type: %s'
                        % (self.filepath, self.sub_data_type,
                           self.data_type))
                    logger.critical(
                        'Please check download functions or data source.')
                    sys.exit(-1)
            else:
                logger.debug("File already downloaded: %s" % (self.filepath))
        else:
            logger.debug("File path is None; not downloading.")
Example #6
 def extract_go_annots_mod(self, geneAssociationFile, species,
                           identifierPrefix, testObject):
     path = "tmp"
     S3File("mod-datadumps/GO/ANNOT", geneAssociationFile, path).download()
     go_annot_dict = {}
     go_annot_list = []
     with gzip.open(path + "/" + geneAssociationFile, 'rt') as file:
         reader = csv.reader(file, delimiter='\t')
         for line in reader:
             if line[0].startswith('!'):
                 continue
             try:
                 gene = identifierPrefix + line[1]
                 go_id = line[4]
                 dateProduced = line[14]
                 dataProvider = line[15]
                 if gene in go_annot_dict:
                     go_annot_dict[gene]['go_id'].append(go_id)
                 else:
                     go_annot_dict[gene] = {
                         'gene_id': gene,
                         'go_id': [go_id],
                         'species': species,
                         'loadKey':
                         dataProvider + "_" + dateProduced + "_" + "GAF",
                         'dataProvider': dataProvider,
                         'dateProduced': dateProduced
                     }
             except IndexError:
                 # Skip lines that do not have the expected number of columns.
                 continue
     # Convert the dictionary into a list of dictionaries for Neo4j.
     # Check for the use of testObject and only return test data if necessary.
     if testObject.using_test_data() is True:
         for entry in go_annot_dict:
             if testObject.check_for_test_id_entry(
                     go_annot_dict[entry]['gene_id']) is True:
                 go_annot_list.append(go_annot_dict[entry])
                 testObject.add_ontology_ids(go_annot_dict[entry]['go_id'])
             else:
                 continue
         return go_annot_list
     else:
         for entry in go_annot_dict:
             go_annot_list.append(go_annot_dict[entry])
         return go_annot_list
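For reference, each entry in the returned list has the shape built above; an illustrative record (the keys come from the code, the values are placeholders):

    {
        'gene_id': 'RGD:12345',               # identifierPrefix + column 2 of the GAF line
        'go_id': ['GO:0005634', 'GO:0003677'],
        'species': 'Rattus norvegicus',
        'loadKey': 'RGD_20240101_GAF',
        'dataProvider': 'RGD',
        'dateProduced': '20240101'
    }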
Example #7
    def get_data(self):
        """get data"""

        # Grab the data (TODO validate).
        # Some of this algorithm is temporary.
        # e.g. Files from the submission system will arrive without the need for unzipping, etc.
        download_dir = 'tmp'

        if self.filepath is not None:
            if not os.path.isfile(self.filepath):
                self.logger.debug("File to download: %s", self.file_to_download)
                if self.file_to_download.startswith('http'):
                    download_filename = os.path.basename(self.filepath)
                    self.logger.debug("Download Name: %s", download_filename)
                    download_object = Download(download_dir,
                                               self.file_to_download,
                                               download_filename)
                    self.already_downloaded = download_object.is_data_downloaded()
                else:
                    self.logger.debug("Downloading JSON File: %s", self.file_to_download)
                    self.already_downloaded = S3File(self.file_to_download,
                                                     download_dir).download_new()
                    self.logger.debug("File already downloaded: %s", self.already_downloaded)
                    if self.file_to_download.endswith('tar.gz'):
                        self.logger.debug("Extracting all files: %s", self.file_to_download)
                        tar_object = TARFile(download_dir, self.file_to_download)
                        tar_object.extract_all()
                # Check whether the downloaded file now exists locally.
                if not os.path.isfile(self.filepath):
                    self.logger.critical('No local copy of the specified file found!')
                    self.logger.critical('Missing copy of %s for sub type: %s from data type: %s',
                                         self.filepath,
                                         self.sub_data_type,
                                         self.data_type)
                    self.logger.critical('Please check download functions or data source.')
                    sys.exit(-1)
            else:
                self.logger.debug("File already downloaded: %s", self.filepath)
        else:
            self.logger.debug("File path is None; not downloading.")
Example #8
    def get_data(self, testObject, filename, prefix):
        path = "tmp"
        S3File("mod-datadumps" + prefix, filename, path).download()
        o_data = TXTFile(path + "/" + filename).get_data()
        parsed_line = parseOBO(o_data)
        list_to_return = []
        for line in parsed_line:  # Convert parsed obo term into a schema-friendly AGR dictionary.
            isasWithoutNames = []
            o_syns = line.get('synonym')
            syns = []
            xrefs = []
            complete_url = None
            xref = None
            xref_urls = []
            local_id = None
            defLinksProcessed = []
            defText = None
            defLinks = []
            subset = []
            newSubset = None
            definition = ""
            is_obsolete = "false"
            ident = line['id']
            prefix = ident.split(":")[0]
            if syns is None:
                syns = []  # Neo4j needs an empty list of synonyms rather than None.
            if o_syns is not None:
                if isinstance(o_syns, (list, tuple)):
                    for syn in o_syns:
                        syn = syn.split("\"")[1].strip()
                        syns.append(syn)
                else:
                    syn = o_syns.split("\"")[1].strip()
                    syns.append(syn)
            o_xrefs = line.get('xref')
            if o_xrefs is not None:
                if isinstance(o_xrefs, (list, tuple)):
                    for xrefId in o_xrefs:
                        if ":" in xrefId:
                            local_id = xrefId.split(":")[1].strip()
                            prefix = xrefId.split(":")[0].strip()
                            complete_url = self.get_complete_url(
                                local_id, xrefId)
                            xrefs.append(xrefId)
                            xref_urls.append({
                                "oid": line['id'],
                                "xrefId": xrefId,
                                "local_id": local_id,
                                "prefix": prefix,
                                "complete_url": complete_url
                            })
                else:
                    if ":" in o_xrefs:
                        local_id = o_xrefs.split(":")[1].strip()
                        prefix = o_xrefs.split(":")[0].strip()
                        xrefs.append(o_xrefs)
                        complete_url = self.get_complete_url(local_id, o_xrefs)
                        xref_urls.append({
                            "oid": line['id'],
                            "xrefId": o_xrefs,
                            "local_id": local_id,
                            "prefix": prefix,
                            "complete_url": complete_url
                        })
            if xrefs is None:
                xrefs = []  # Neo4j needs an empty list of xrefs rather than None.
            o_is_as = line.get('is_a')
            if o_is_as is None:
                o_is_as = []
                isasWithoutNames = []
            else:
                if isinstance(o_is_as, (list, tuple)):
                    for isa in o_is_as:
                        isaWithoutName = isa.split("!")[0].strip()
                        isasWithoutNames.append(isaWithoutName)
                else:
                    isaWithoutName = o_is_as.split("!")[0].strip()
                    isasWithoutNames.append(isaWithoutName)
            definition = line.get('def')
            defLinks = ""
            defLinksProcessed = []
            if definition is None:
                definition = ""
            elif "\"" in definition:
                # A def line looks like: "definition text" [link, link].
                defText = definition.split("\"")[1].strip()
                if "[" in definition.split("\"")[2].strip():
                    defLinks = definition.split("\"")[2].strip()
                    defLinks = defLinks.rstrip("]").replace("[", "")
                    defLinks = defLinks.replace("url:www", "http://www")
                    defLinks = defLinks.replace("url:", "")
                    defLinks = defLinks.replace("URL:", "")
                    defLinks = defLinks.replace("\\:", ":")

                    if "," in defLinks:
                        defLinks = defLinks.split(",")
                        for link in defLinks:
                            if link.strip().startswith("http"):
                                defLinksProcessed.append(link)
                    elif defLinks.strip().startswith("http"):
                        defLinksProcessed.append(defLinks)
            else:
                # No quoted definition text; defText is still None at this point.
                definition = defText
            if definition is None:
                definition = ""

            newSubset = line.get('subset')
            if isinstance(newSubset, (list, tuple)):
                subset = newSubset
            else:
                if newSubset is not None:
                    subset.append(newSubset)
            is_obsolete = line.get('is_obsolete')
            if is_obsolete is None:
                is_obsolete = "false"

            dict_to_append = {
                'o_genes': [],
                'o_species': [],
                'name': line['name'],
                'o_synonyms': syns,
                'name_key': line['name'],
                'id': line['id'],
                'definition': definition,
                'isas': isasWithoutNames,
                'is_obsolete': is_obsolete,
                'subset': subset,
                'xrefs': xrefs,
                # TODO: fix links to not be passed for each ontology load.
                'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
                            '?species=All&x=1&acc_id=' + line['id'] + '#annot',
                'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
                                '?species=All&x=1&acc_id=' + line['id'] + '#annot',
                'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
                                     '?species=Rat&x=1&acc_id=' + line['id'] + '#annot',
                'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
                                       '?species=Human&x=1&acc_id=' + line['id'] + '#annot',
                'mgi_link': 'http://www.informatics.jax.org/disease/' + line['id'],
                'wormbase_link': 'http://www.wormbase.org/resources/disease/' + line['id'],
                'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + line['id'],
                'zfin_link': 'https://zfin.org/' + line['id'],
                'oUrl': 'http://www.disease-ontology.org/?id=' + line['id'],
                'oPrefix': prefix,
                'xref_urls': xref_urls,
                'defText': defText,
                'defLinksProcessed': defLinksProcessed,
                'oboFile': prefix,
                'href': 'http://amigo.geneontology.org/amigo/term/' + line['id'],
                'category': 'go',
                'o_type': line.get('namespace'),
            }
            list_to_return.append(dict_to_append)

        # if testObject.using_test_data() is True:
        #     filtered_dict = []
        #     for entry in list_to_return:
        #         if testObject.check_for_test_ontology_entry(entry['id']) is True:
        #             filtered_dict.append(entry)
        #         else:
        #             continue
        #     return filtered_dict
        # else:
        return list_to_return
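Note: parseOBO is not shown; the loop above only assumes each parsed term is a dictionary with the keys it looks up. An illustrative input term (keys taken from the lookups above, values invented):

    {
        'id': 'DOID:0001816',
        'name': 'angiosarcoma',
        'def': '"A malignant vascular tumor ..." [url:http\\://en.wikipedia.org/wiki/Angiosarcoma]',
        'synonym': ['"hemangiosarcoma" EXACT []'],
        'xref': 'MESH:D006394',
        'is_a': 'DOID:175 ! vascular cancer',
        'subset': 'DO_cancer_slim',
        'namespace': 'disease_ontology'
    }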