示例#1
0
    def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
        '''Load and parse the UniProt XML record for a single accession number.

        :param UniProtAC: The UniProt accession number of the entry.
        :param XML: Optional XML text for the entry. When given, it is parsed directly and neither the
                    cache directory nor the network is consulted.
        :param cache_dir: Optional existing directory. Downloaded XML is stored there as <UniProtAC>.xml
                          and read back from that file on later calls.
        :param silent: If False, a progress message is written before the record is downloaded.
        :raises EmptyUniProtACXMLException: if the downloaded document is empty or only whitespace.
        :raises Exception: if cache_dir does not exist or the XML cannot be parsed.
        '''
        if cache_dir and not(os.path.exists(cache_dir)):
            raise Exception("The cache directory %s does not exist." % cache_dir)

        self.UniProtAC = UniProtAC
        self.silent = silent

        # Bound up front so the parse-error handler below can always refer to it, including when the
        # caller supplied the XML and no cache lookup took place.
        cached_filepath = None

        # Get XML
        if XML is None:
            if cache_dir:
                cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
            if cached_filepath and os.path.exists(cached_filepath):
                protein_xml = read_file(cached_filepath)
            else:
                if not silent:
                    colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
                url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
                protein_xml = http_get(url)
                # Reject an empty download before it is written to the cache.
                if not(protein_xml.strip()):
                    raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
                if cached_filepath:
                    write_file(cached_filepath, protein_xml)
            self.XML = protein_xml
        else:
            self.XML = XML

        self.recommended_name = None
        self.submitted_names = []
        self.alternative_names = []

        # Get DOM. Parse self.XML rather than a branch-local variable so that caller-supplied XML is
        # handled the same way as cached or downloaded XML.
        try:
            self._dom = parseString(self.XML)
        except Exception:
            if cached_filepath:
                raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
            else:
                raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)

        # The document is expected to hold exactly one <uniprot> element containing exactly one <entry>.
        main_tags = self._dom.getElementsByTagName("uniprot")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]

        self._parse_evidence_tag()
        self._parse_sequence_tag()
        self._parse_protein_tag()
        self._parse_organism_tag()
        self._parse_subsections()
        self._parse_PDB_mapping()
示例#2
0
def get_obsolete_acc_to_uniparc(acc):
    ''' Looks up the UniParc ID of an obsolete ACC which is not returned using uniprot_map.

        :param acc: The UniProt accession number.
        :return: The single UniParc ID found in the query results, or None when none is found.
        :raises Exception: if the results contain more than one distinct UniParc ID.

        Warning: This scrapes the UniParc query page, so it breaks if the page markup or the URL changes.
    '''
    page = http_get('www.uniprot.org/uniparc/?query={0}'.format(acc))

    # Every double-quoted token of the form "UPI..." in the page is treated as a candidate identifier.
    quoted_ids = re.findall(r'"UPI[A-Z0-9]+?"', page, re.DOTALL)

    candidates = set()
    for quoted in quoted_ids:
        # Drop the surrounding double quotes captured by the pattern.
        candidates.add(quoted[1:-1])

    if not candidates:
        return None
    if len(candidates) > 1:
        raise Exception('Multiple UPI identifiers found.')
    return candidates.pop()
示例#3
0
    def _get_XML(self):
        '''Fetch the UniParc XML for self.UniParcID and locate its single entry element.

        The XML is read from <cache_dir>/<UniParcID>.xml when self.cache_dir is set and that file exists.
        Otherwise it is downloaded and, if a cache directory is configured, saved to that path.

        Sets self.XML (the raw text), self._dom (the parsed document) and self.entry_tag (the <entry> element).
        '''
        use_cache = bool(self.cache_dir)
        cache_path = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID) if use_cache else None

        if use_cache and os.path.exists(cache_path):
            xml_text = read_file(cache_path)
        else:
            if not self.silent:
                colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
            xml_text = http_get('http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID)
            if use_cache:
                write_file(cache_path, xml_text)
        self.XML = xml_text

        # Parse the document; it must hold exactly one <uniparc> element with exactly one <entry> inside.
        self._dom = parseString(xml_text)
        uniparc_tags = self._dom.getElementsByTagName("uniparc")
        assert(len(uniparc_tags) == 1)
        entries = uniparc_tags[0].getElementsByTagName("entry")
        assert(len(entries) == 1)
        self.entry_tag = entries[0]
示例#4
0
    def __init__(self, UniParcID, UniProtACs = None, UniProtIDs = None, cache_dir = None, silent = False):
        '''Build a UniParc entry by merging the UniProt AC records associated with it.

        :param UniParcID: The UniParc identifier.
        :param UniProtACs: Optional UniProt accession numbers for this entry. Looked up with uniprot_map when omitted or empty.
        :param UniProtIDs: Optional UniProt IDs for this entry. Looked up with uniprot_map when omitted or empty.
        :param cache_dir: Optional existing directory used to cache the downloaded FASTA file. It is also passed on to
                          uniprot_map and UniProtACEntry.
        :param silent: If False, progress messages are printed.
        :raises Exception: if cache_dir does not exist.
        :raises UniParcEntryStandardizationException: if no names of any kind were found, or if more than one distinct
                                                      recommended name was found.
        '''
        if cache_dir and not(os.path.exists(os.path.abspath(cache_dir))):
            raise Exception("The cache directory %s does not exist." % os.path.abspath(cache_dir))
        self.UniParcID = UniParcID
        self.cache_dir = cache_dir
        self.recommended_name = None
        self.silent = silent

        # Get AC mapping
        if not UniProtACs or UniParcID=='UPI0000047CA3': # todo: is this UPI0000047CA3 special handling necessary?
            self.UniProtACs = uniprot_map('UPARC', 'ACC', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
        else:
            self.UniProtACs = UniProtACs

        # Get ID mapping
        if not UniProtIDs:
            self.UniProtIDs = uniprot_map('UPARC', 'ID', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
        else:
            self.UniProtIDs = UniProtIDs

        # Get FASTA
        cached_filepath = None
        if cache_dir:
            cached_filepath = os.path.join(cache_dir, '%s.fasta' % UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            fasta = read_file(cached_filepath)
        else:
            if not silent:
                print("Getting FASTA file")
            url = 'http://www.uniprot.org/uniparc/%s.fasta' % UniParcID
            fasta = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, fasta)

        # Get sequence. The first line is the header (">UniParcID status=..."); the remaining lines are the residues.
        fasta_lines = fasta.split("\n")
        header = fasta_lines[0].split()
        assert(len(header) == 2)
        assert(header[0] == ">%s" % UniParcID)
        assert(header[1].startswith("status="))
        # str.strip is used per line because string.strip does not exist in Python 3.
        sequence = "".join(line.strip() for line in fasta_lines[1:])
        self.sequence = sequence

        def tally(counted_names, name):
            # Record one more occurrence of name in a list of [name, count] pairs.
            # Names are indexed by string keys further down, so they are presumably dicts and unhashable;
            # that is why equality search over a list is used instead of a dict keyed by name.
            for pair in counted_names:
                if pair[0] == name:
                    pair[1] += 1
                    return
            counted_names.append([name, 1])

        # Get atomic mass (and sequence again)
        self.atomic_mass = None
        self.CRC64Digest = None
        recommended_names = []
        alternative_names = []
        submitted_names = []

        self.AC_entries = {}
        subsections = ProteinSubsectionHolder(len(sequence))

        for UniProtAC in self.UniProtACs:
            try:
                AC_entry = UniProtACEntry(UniProtAC, cache_dir = self.cache_dir, silent = silent)
            except EmptyUniProtACXMLException:
                # Accession numbers whose XML record is empty are skipped.
                continue
            self.AC_entries[UniProtAC] = AC_entry

            # Mass sanity check: every AC entry must report the same atomic mass.
            if self.atomic_mass is not None:
                assert(self.atomic_mass == AC_entry.atomic_mass)
            self.atomic_mass = AC_entry.atomic_mass

            # Sequence sanity check: the AC entry's sequence must match the FASTA sequence.
            assert(self.sequence == AC_entry.sequence)

            # CRC 64 sanity check: digests must agree across entries and with the sequence itself.
            if self.CRC64Digest is not None:
                assert(self.CRC64Digest == AC_entry.CRC64Digest)
            self.CRC64Digest = AC_entry.CRC64Digest
            assert(CRC64.CRC64digest(self.sequence) == self.CRC64Digest)

            if AC_entry.recommended_name:
                tally(recommended_names, AC_entry.recommended_name)
            for alternative_name in AC_entry.alternative_names:
                tally(alternative_names, alternative_name)
            for submitted_name in AC_entry.submitted_names:
                tally(submitted_names, submitted_name)

            subsections += AC_entry.subsections
        self.subsections = subsections

        # A UniParc ID may be remapped through at most one of the two tables; a remap replaces whatever
        # recommended names were collected above.
        assert(len(set(UniParcMergedRecommendedNamesRemap.keys()).intersection(set(UniParcMergedSubmittedNamesRemap.keys()))) == 0)
        if UniParcID in UniParcMergedRecommendedNamesRemap:
            recommended_names = [[UniParcMergedRecommendedNamesRemap[UniParcID], 1]]
        elif UniParcID in UniParcMergedSubmittedNamesRemap:
            recommended_names = [[UniParcMergedSubmittedNamesRemap[UniParcID], 1]]

        if not silent:
            colortext.write('Subsections\n', 'orange')

        if len(recommended_names) == 0 and len(alternative_names) == 0 and len(submitted_names) == 0:
            raise UniParcEntryStandardizationException("UniParcID %s has no recommended names." % UniParcID)
        elif len(recommended_names) > 1:
            # List the competing names, most frequent first.
            s = ["UniParcID %s has multiple recommended names: " % UniParcID]
            for tpl in sorted(recommended_names, key=lambda x:-x[1]):
                s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                if tpl[0]['Short names']:
                    s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                if tpl[0]['EC numbers']:
                    s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            raise UniParcEntryStandardizationException("".join(s))
        # An entry that has alternative or submitted names but no recommended name is tolerated:
        # recommended_name is simply left as None.

        self.recommended_name = None
        if len(recommended_names) == 1:
            self.recommended_name = recommended_names[0][0]
        self.get_organisms()