Пример #1
0
 def test_lipoprotein(self):
     """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
     filename = os.path.join('Enzymes', 'lipoprotein.txt')
     # The context manager closes the handle even when Enzyme.read() raises.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "3.1.1.34")
     self.assertEqual(record["DE"], "Lipoprotein lipase.")
     self.assertEqual(len(record["AN"]), 3)
     self.assertEqual(record["AN"][0], "Clearing factor lipase.")
     self.assertEqual(record["AN"][1], "Diacylglycerol lipase.")
     self.assertEqual(record["AN"][2], "Diglyceride lipase.")
     self.assertEqual(record["CA"], "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate.")
     self.assertEqual(record["CC"][0], 'Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL).')
     self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
     self.assertEqual(record['PR'], ["PDOC00110"])
     # DR entries are checked one by one, in file order; the total number
     # of entries is deliberately not asserted (as in the original test).
     expected_dr = [
         ["P11151", "LIPL_BOVIN"],
         ["P11153", "LIPL_CAVPO"],
         ["P11602", "LIPL_CHICK"],
         ["P55031", "LIPL_FELCA"],
         ["P06858", "LIPL_HUMAN"],
         ["P11152", "LIPL_MOUSE"],
         ["O46647", "LIPL_MUSVI"],
         ["P49060", "LIPL_PAPAN"],
         ["P49923", "LIPL_PIG"],
         ["Q06000", "LIPL_RAT"],
         ["Q29524", "LIPL_SHEEP"],
     ]
     for index, expected in enumerate(expected_dr):
         self.assertEqual(record["DR"][index], expected)
Пример #2
0
 def test_lipoprotein(self):
     """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
     with open(os.path.join("Enzymes", "lipoprotein.txt")) as stream:
         record = Enzyme.read(stream)

     # Identifier and description.
     self.assertEqual(record["ID"], "3.1.1.34")
     self.assertEqual(record["DE"], "Lipoprotein lipase.")

     # Alternative names, in file order.
     alt_names = (
         "Clearing factor lipase.",
         "Diacylglycerol lipase.",
         "Diglyceride lipase.",
     )
     self.assertEqual(len(record["AN"]), 3)
     for position, alt_name in enumerate(alt_names):
         self.assertEqual(record["AN"][position], alt_name)

     # Catalytic activity, comments and PROSITE cross-reference.
     catalytic_activity = "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate."
     self.assertEqual(record["CA"], catalytic_activity)
     first_comment = "Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL)."
     self.assertEqual(record["CC"][0], first_comment)
     self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
     self.assertEqual(record["PR"], ["PDOC00110"])

     # Database references: only the first eleven entries are compared.
     cross_refs = (
         ["P11151", "LIPL_BOVIN"],
         ["P11153", "LIPL_CAVPO"],
         ["P11602", "LIPL_CHICK"],
         ["P55031", "LIPL_FELCA"],
         ["P06858", "LIPL_HUMAN"],
         ["P11152", "LIPL_MOUSE"],
         ["O46647", "LIPL_MUSVI"],
         ["P49060", "LIPL_PAPAN"],
         ["P49923", "LIPL_PIG"],
         ["Q06000", "LIPL_RAT"],
         ["Q29524", "LIPL_SHEEP"],
     )
     for position, cross_ref in enumerate(cross_refs):
         self.assertEqual(record["DR"][position], cross_ref)

     # The string form must begin with the ID and DE lines.
     expected_prefix = "ID: 3.1.1.34\nDE: Lipoprotein lipase.\n"
     self.assertTrue(
         str(record).startswith(expected_prefix),
         "Did not expect:\n%s" % record,
     )
Пример #3
0
 def test_lipoprotein(self):
     """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
     filename = os.path.join('Enzymes', 'lipoprotein.txt')
     # Use a context manager so the handle is released even if parsing fails.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "3.1.1.34")
     self.assertEqual(record["DE"], "Lipoprotein lipase.")
     self.assertEqual(len(record["AN"]), 3)
     self.assertEqual(record["AN"][0], "Clearing factor lipase.")
     self.assertEqual(record["AN"][1], "Diacylglycerol lipase.")
     self.assertEqual(record["AN"][2], "Diglyceride lipase.")
     self.assertEqual(record["CA"], "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate.")
     self.assertEqual(record["CC"][0], 'Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL).')
     self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
     self.assertEqual(record['PR'], ["PDOC00110"])
     # Compare the DR entries one at a time, in file order; the list
     # length itself is not asserted (unchanged from the original test).
     expected_dr = [
         ["P11151", "LIPL_BOVIN"],
         ["P11153", "LIPL_CAVPO"],
         ["P11602", "LIPL_CHICK"],
         ["P55031", "LIPL_FELCA"],
         ["P06858", "LIPL_HUMAN"],
         ["P11152", "LIPL_MOUSE"],
         ["O46647", "LIPL_MUSVI"],
         ["P49060", "LIPL_PAPAN"],
         ["P49923", "LIPL_PIG"],
         ["Q06000", "LIPL_RAT"],
         ["Q29524", "LIPL_SHEEP"],
     ]
     for index, expected in enumerate(expected_dr):
         self.assertEqual(record["DR"][index], expected)
Пример #4
0
 def test_valine(self):
     """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
     filename = os.path.join('Enzymes', 'valine.txt')
     # The original never closed this handle; the context manager does.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "4.1.1.14")
     self.assertEqual(record["DE"], "Valine decarboxylase.")
     self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
     self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
     self.assertEqual(record["CC"], ["Also acts on L-leucine."])
     self.assertEqual(len(record["DR"]), 0)
Пример #5
0
def get_EC_RXNs():
    """Build the EC ontology dictionary and dump it to a JSON file.

    Reads the module-level ``args`` (``args.test``, ``args.summary``) and
    ``timestamp``.  When ``args.test`` is False, ``enzclass.txt`` is fetched
    with ``curl`` and ``enzyme.dat`` is downloaded into the working
    directory; otherwise local copies of both files are used.

    The result is written to ``EBI_EC_ontologyDictionary.json`` with one
    ``term_hash`` entry per ENZYME record (``id``, ``name``, ``synonyms``).
    When ``args.summary`` is True a tab-separated summary line is printed.
    Returns None.
    """
    ec_count = 0
    ec_ontologyDictionary_filename = 'EBI_EC_ontologyDictionary.json'

    # get EC version number
    if args.test == False:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzclass.txt'
        ebi_ec_release = os.popen(ebi_ec_call).read()
    else:
        with open('enzclass.txt', 'r') as myfile:
            ebi_ec_release = myfile.read()

    # The release identifier is taken as the second whitespace-separated
    # token on the eighth line of enzclass.txt.
    # NOTE(review): this raises IndexError if the download failed or the
    # header layout changes -- confirm this is the desired failure mode.
    ebi_ec_release = ebi_ec_release.split("\n")[7].split()[1]

    # create dictionary
    ec_dict = {
        'data_version': ebi_ec_release,
        'date': timestamp,
        'format_version': 'N/A',
        'ontology': 'ec_orthology',
        'term_hash': {}
    }

    # parse data
    ebi_ec_enzyme = 'enzyme.dat'

    if args.test == False:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzyme.dat > enzyme.dat'
        os.system(ebi_ec_call)

    # Keep the file open only while the records are being consumed; the
    # original left the handle to be closed by the garbage collector.
    with open(ebi_ec_enzyme) as enzyme_handle:
        for record in Enzyme.parse(enzyme_handle):
            ec_dict['term_hash'][record['ID']] = {
                'id': record['ID'],
                'name': record['DE'],
                'synonyms': record['AN']
            }
            ec_count += 1

    # save json file
    with open(ec_ontologyDictionary_filename, 'w') as outfile:
        json.dump(ec_dict, outfile, indent=2)

    # print summary
    if args.summary == True:
        print("ec_orthology",
              ec_count,
              ebi_ec_release,
              ec_ontologyDictionary_filename,
              sep="\t")
Пример #6
0
 def test_valine(self):
     """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
     filename = os.path.join('Enzymes', 'valine.txt')
     # Context manager instead of open()/close(): the handle is released
     # even when Enzyme.read() raises.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "4.1.1.14")
     self.assertEqual(record["DE"], "Valine decarboxylase.")
     self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
     self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
     self.assertEqual(record["CC"], ["Also acts on L-leucine."])
     self.assertEqual(len(record["DR"]), 0)
Пример #7
0
def get_expasy_enzyme():
    """Download the ExPASy ENZYME flat file and return one dict per record.

    Each returned dict has these keys:

    * ``'Reaction(s)'`` -- list of reaction strings cut from the ``CA`` field
    * ``'PreferedName'`` -- the ``DE`` field
    * ``'ECNumber'`` -- the ``ID`` field
    * ``'Substrates'`` / ``'Products'`` -- compound name (after
      ``replace_strings``) mapped to the result of ``link_compound2chebi``
    * ``'UniProt'`` -- first element of each ``DR`` entry mapped to the second

    Returns:
        list: the record dictionaries, in file order, one per record (the
        original appended every record twice).
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    local_path = urllib.request.urlretrieve(url)[0]
    # split each side of reaction on '+' not '(+)'; compiled once here
    # instead of once per reaction
    side_splitter = re.compile(r'(?:[^\+(]|\([^)]*\))+')
    enz_records = []
    with open(local_path, 'r') as handle:
        for record in bee.parse(handle):
            # create record for each enzyme with EC number as primary key
            enz_rec = {
                'Reaction(s)': [],
                'PreferedName': record['DE'],
                'ECNumber': record['ID'],
                'Substrates': {},
                'Products': {},
                'UniProt': {},
            }

            # split on '.' to separate multiple reactions
            # NOTE(review): the split also yields the empty text after the
            # final '.', which is stored as an empty reaction; kept as in
            # the original.
            reaction1 = record['CA'].split('.')
            for rxn in reaction1:
                if len(reaction1) > 2:
                    # drop the first three characters of each piece
                    # (presumably a "(n)" reaction label -- TODO confirm)
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                # split reactions into [substrates, products]
                constituents = rxn.split('=')
                for sub in side_splitter.findall(constituents[0]):
                    sub = replace_strings(sub.strip())
                    enz_rec['Substrates'][sub] = link_compound2chebi(sub)
                for prod in side_splitter.findall(constituents[-1]):
                    prod = replace_strings(prod.strip())
                    enz_rec['Products'][prod] = link_compound2chebi(prod)

            # populate enz_rec['UniProt'] with uniprotid:name pairs
            for unpid in record['DR']:
                enz_rec['UniProt'][unpid[0]] = unpid[1]

            enz_records.append(enz_rec)

    return enz_records
Пример #8
0
 def test_lactate(self):
     """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
     filename = os.path.join('Enzymes', 'lactate.txt')
     # The original never closed this handle; the context manager does.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "5.1.2.1")
     self.assertEqual(record["DE"], "Lactate racemase.")
     self.assertEqual(len(record["AN"]), 3)
     self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
     self.assertEqual(record["AN"][1], "Lactic acid racemase.")
     self.assertEqual(record["AN"][2], "Lacticoracemase.")
     self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
     self.assertEqual(len(record["DR"]), 0)
Пример #9
0
 def test_lactate(self):
     """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
     filename = os.path.join('Enzymes', 'lactate.txt')
     # Close the file deterministically; the original leaked the handle.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "5.1.2.1")
     self.assertEqual(record["DE"], "Lactate racemase.")
     self.assertEqual(len(record["AN"]), 3)
     self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
     self.assertEqual(record["AN"][1], "Lactic acid racemase.")
     self.assertEqual(record["AN"][2], "Lacticoracemase.")
     self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
     self.assertEqual(len(record["DR"]), 0)
Пример #10
0
 def test_valine(self):
     """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
     filename = os.path.join('Enzymes', 'valine.txt')
     # Context manager instead of open()/close(): the handle is released
     # even when Enzyme.read() raises.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "4.1.1.14")
     self.assertEqual(record["DE"], "Valine decarboxylase.")
     self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
     self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
     self.assertEqual(record["CC"], ["Also acts on L-leucine."])
     self.assertEqual(len(record["DR"]), 0)
     # The string form must begin with the ID and DE lines.
     self.assertTrue(str(record).startswith("ID: 4.1.1.14\nDE: Valine decarboxylase.\n"),
                     "Did not expect:\n%s" % record)
Пример #11
0
 def test_parse_many(self):
     """Check parse function with multiple records."""
     # Concatenate the three single-record files into one in-memory stream.
     pieces = []
     for path in ("Enzymes/lipoprotein.txt",
                  "Enzymes/proline.txt",
                  "Enzymes/valine.txt"):
         with open(path) as source:
             pieces.append(source.read())
     stream = StringIO("".join(pieces))
     parsed = list(Enzyme.parse(stream))
     self.assertEqual(len(parsed), 3)
     # Records must come back in concatenation order.
     for position, ec_number in enumerate(("3.1.1.34", "5.1.1.4", "4.1.1.14")):
         self.assertEqual(parsed[position]["ID"], ec_number)
Пример #12
0
 def test_parse_many(self):
     """Check parse function with multiple records."""
     sources = (
         "Enzymes/lipoprotein.txt",
         "Enzymes/proline.txt",
         "Enzymes/valine.txt",
     )
     # Build one combined text from the single-record files.
     combined = []
     for path in sources:
         with open(path) as source:
             combined.append(source.read())
     parsed = list(Enzyme.parse(StringIO("".join(combined))))
     self.assertEqual(len(parsed), 3)
     # One record per input file, in the order the files were joined.
     expected_ids = ("3.1.1.34", "5.1.1.4", "4.1.1.14")
     for position, ec_number in enumerate(expected_ids):
         self.assertEqual(parsed[position]["ID"], ec_number)
Пример #13
0
 def test_valine(self):
     """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
     filename = os.path.join("Enzymes", "valine.txt")
     # Context manager instead of open()/close(): the handle is released
     # even when Enzyme.read() raises.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "4.1.1.14")
     self.assertEqual(record["DE"], "Valine decarboxylase.")
     self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
     self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
     self.assertEqual(record["CC"], ["Also acts on L-leucine."])
     self.assertEqual(len(record["DR"]), 0)
     # The string form must begin with the ID and DE lines.
     self.assertTrue(str(record).startswith("ID: 4.1.1.14\nDE: Valine decarboxylase.\n"),
                     "Did not expect:\n%s" % record)
def get_expasy_enzyme():
    """Download the ExPASy ENZYME flat file and return one dict per record.

    Each returned dict has these keys:

    * ``'Reaction(s)'`` -- list of reaction strings cut from the ``CA`` field
    * ``'PreferedName'`` -- the ``DE`` field
    * ``'ECNumber'`` -- the ``ID`` field
    * ``'Substrates'`` / ``'Products'`` -- compound name (after
      ``replace_strings``) mapped to the result of ``link_compound2chebi``
    * ``'UniProt'`` -- first element of each ``DR`` entry mapped to the second

    Returns:
        list: the record dictionaries, in file order, one per record (the
        original appended every record twice).
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    downloaded = urllib.request.urlretrieve(url)
    # split each side of reaction on '+' not '(+)'; the pattern is
    # loop-invariant, so compile it once up front
    side_pattern = re.compile(r'(?:[^\+(]|\([^)]*\))+')
    enz_records = []
    with open(downloaded[0], 'r') as handle:
        for record in bee.parse(handle):
            # create record for each enzyme with EC number as primary key
            enz_rec = {
                'Reaction(s)': [],
                'PreferedName': record['DE'],
                'ECNumber': record['ID'],
                'Substrates': {},
                'Products': {},
                'UniProt': {},
            }

            # split on '.' to separate multiple reactions
            # NOTE(review): the text after the final '.' is an empty string
            # and is stored as an empty reaction; kept as in the original.
            reaction1 = record['CA'].split('.')
            for rxn in reaction1:
                if len(reaction1) > 2:
                    # drop the first three characters of each piece
                    # (presumably a "(n)" reaction label -- TODO confirm)
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                # split reactions into [substrates, products]
                constituents = rxn.split('=')
                for sub in side_pattern.findall(constituents[0]):
                    sub = replace_strings(sub.strip())
                    enz_rec['Substrates'][sub] = link_compound2chebi(sub)

                for prod in side_pattern.findall(constituents[-1]):
                    prod = replace_strings(prod.strip())
                    enz_rec['Products'][prod] = link_compound2chebi(prod)

            # populate enz_rec['UniProt'] with uniprotid:name pairs
            for unpid in record['DR']:
                enz_rec['UniProt'][unpid[0]] = unpid[1]

            enz_records.append(enz_rec)

    return enz_records
Пример #15
0
 def test_lactate(self):
     """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
     path = os.path.join("Enzymes", "lactate.txt")
     with open(path) as stream:
         record = Enzyme.read(stream)

     self.assertEqual(record["ID"], "5.1.2.1")
     self.assertEqual(record["DE"], "Lactate racemase.")

     # Alternative names, in file order.
     alt_names = (
         "Hydroxyacid racemase.",
         "Lactic acid racemase.",
         "Lacticoracemase.",
     )
     self.assertEqual(len(record["AN"]), 3)
     for position, alt_name in enumerate(alt_names):
         self.assertEqual(record["AN"][position], alt_name)

     self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
     # This record carries no database cross-references.
     self.assertEqual(len(record["DR"]), 0)

     expected_prefix = "ID: 5.1.2.1\nDE: Lactate racemase.\n"
     self.assertTrue(
         str(record).startswith(expected_prefix),
         "Did not expect:\n%s" % record)
Пример #16
0
 def test_lactate(self):
     """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
     filename = os.path.join('Enzymes', 'lactate.txt')
     # Context manager instead of open()/close(): the handle is released
     # even when Enzyme.read() raises.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "5.1.2.1")
     self.assertEqual(record["DE"], "Lactate racemase.")
     self.assertEqual(len(record["AN"]), 3)
     self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
     self.assertEqual(record["AN"][1], "Lactic acid racemase.")
     self.assertEqual(record["AN"][2], "Lacticoracemase.")
     self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
     self.assertEqual(len(record["DR"]), 0)
     # The string form must begin with the ID and DE lines.
     self.assertTrue(str(record).startswith("ID: 5.1.2.1\nDE: Lactate racemase.\n"),
                     "Did not expect:\n%s" % record)
Пример #17
0
 def test_proline(self):
     """Parsing ENZYME record for proline racemase (5.1.1.4)."""
     filename = os.path.join('Enzymes', 'proline.txt')
     # The original never closed this handle; the context manager does.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "5.1.1.4")
     self.assertEqual(record["DE"], "Proline racemase.")
     self.assertEqual(record["CA"], "L-proline = D-proline.")
     self.assertEqual(len(record["DR"]), 9)
     # DR entries in file order.
     expected_dr = [
         ["Q17ZY4", "PRAC_CLOD6"],
         ["A8DEZ8", "PRAC_CLODI"],
         ["Q4DA80", "PRCMA_TRYCR"],
         ["Q868H8", "PRCMB_TRYCR"],
         ["Q3SX04", "PRCM_BOVIN"],
         ["Q96EM0", "PRCM_HUMAN"],
         ["Q9CXA2", "PRCM_MOUSE"],
         ["Q5RC28", "PRCM_PONAB"],
         ["Q66II5", "PRCM_XENTR"],
     ]
     for index, expected in enumerate(expected_dr):
         self.assertEqual(record["DR"][index], expected)
Пример #18
0
 def test_proline(self):
     """Parsing ENZYME record for proline racemase (5.1.1.4)."""
     filename = os.path.join('Enzymes', 'proline.txt')
     # Close the file deterministically; the original leaked the handle.
     with open(filename) as handle:
         record = Enzyme.read(handle)
     self.assertEqual(record["ID"], "5.1.1.4")
     self.assertEqual(record["DE"], "Proline racemase.")
     self.assertEqual(record["CA"], "L-proline = D-proline.")
     self.assertEqual(len(record["DR"]), 9)
     # DR entries in file order.
     expected_dr = [
         ["Q17ZY4", "PRAC_CLOD6"],
         ["A8DEZ8", "PRAC_CLODI"],
         ["Q4DA80", "PRCMA_TRYCR"],
         ["Q868H8", "PRCMB_TRYCR"],
         ["Q3SX04", "PRCM_BOVIN"],
         ["Q96EM0", "PRCM_HUMAN"],
         ["Q9CXA2", "PRCM_MOUSE"],
         ["Q5RC28", "PRCM_PONAB"],
         ["Q66II5", "PRCM_XENTR"],
     ]
     for index, expected in enumerate(expected_dr):
         self.assertEqual(record["DR"][index], expected)
Пример #19
0
    def _create_ec_num_enzyme_name_association_file(enzdat_file, ec_id_file):
        """
        Write a JSON mapping of EC number to enzyme name.

        The enzyme.dat file is parsed with ``Enz.parse``; for every record
        the ``ID`` field becomes a key and the ``DE`` field its value. The
        resulting dictionary is dumped to ``ec_id_file`` as JSON.

        :param enzdat_file: path to enzyme.dat file
        :param ec_id_file: path to JSON dump
        :return: None
        """
        # Consume the parser while the input file is still open.
        with open(enzdat_file) as source:
            names_by_ec = {
                entry['ID']: entry['DE'] for entry in Enz.parse(source)
            }
        with open(ec_id_file, 'w') as target:
            json.dump(names_by_ec, target)
def keggMet(tag):
    """Fetch the KEGG entry ``lpn:<tag>`` and copy selected sections of it.

    The raw entry returned by ``REST.kegg_get`` is saved to a file named
    ``lpn:<tag>`` in the working directory.  That file is then scanned line
    by line and the following is written to ``kegg/lpn:<tag>``:

    * lines from one containing "ORTHOLOGY" up to (not including) one
      containing "ORGANISM"
    * lines from "PATHWAY" up to "BRITE" or "MODULE"
    * lines from "MOTIF" up to "DBLINKS"
    * lines from "REACTIONS" up to "COMPOUND"
    * for every line containing "NAME", ``"GENE NAME: "`` followed by the
      text after the last space on that line

    Fixes over the original: ``flagReactions`` is now initialised (it was
    read before assignment), the ORTHOLOGY section is tracked with its own
    flag instead of the PATHWAY flag, and all files are closed.

    :param tag: entry identifier appended to the ``lpn:`` prefix
    :return: None
    """
    entry_name = "lpn:" + tag
    request = REST.kegg_get(entry_name)
    with open(entry_name, "w") as raw_file:
        raw_file.write(request.read())

    # NOTE(review): the parsed record is not used below; the call is kept
    # because it raises when the saved entry cannot be parsed or is empty.
    with open(entry_name) as parse_handle:
        record = list(Enzyme.parse(parse_handle))[0]

    # A flag value of 1 means "outside the section", 0 means "inside".
    flagPath = flagMotifs = flagOrtho = flagReactions = 1
    with open(entry_name, "r") as ofile, open("kegg/" + entry_name, "w") as owrite:
        for line in ofile:
            if "ORTHOLOGY" in line or flagOrtho == 0:
                if flagOrtho == 1:
                    flagOrtho = 0
                    owrite.write(line)
                elif "ORGANISM" in line:
                    flagOrtho = 1
                else:
                    owrite.write(line)
            if "PATHWAY" in line or flagPath == 0:
                if flagPath == 1:
                    flagPath = 0
                    owrite.write(line)
                elif "BRITE" in line or "MODULE" in line:
                    flagPath = 1
                else:
                    owrite.write(line)
            if "MOTIF" in line or flagMotifs == 0:
                if flagMotifs == 1:
                    flagMotifs = 0
                    owrite.write(line)
                elif "DBLINKS" in line:
                    flagMotifs = 1
                else:
                    owrite.write(line)
            if "NAME" in line:
                # Last space-separated token, including the trailing newline.
                name = line.split(' ')
                owrite.write("GENE NAME: " + name[-1])
            if "REACTIONS" in line or flagReactions == 0:
                if flagReactions == 1:
                    flagReactions = 0
                    owrite.write(line)
                elif "COMPOUND" in line:
                    flagReactions = 1
                else:
                    owrite.write(line)
Пример #21
0
 def test_proline(self):
     """Parsing ENZYME record for proline racemase (5.1.1.4)."""
     path = os.path.join("Enzymes", "proline.txt")
     with open(path) as stream:
         record = Enzyme.read(stream)

     self.assertEqual(record["ID"], "5.1.1.4")
     self.assertEqual(record["DE"], "Proline racemase.")
     self.assertEqual(record["CA"], "L-proline = D-proline.")

     # Database cross-references, in file order.
     cross_refs = (
         ["Q17ZY4", "PRAC_CLOD6"],
         ["A8DEZ8", "PRAC_CLODI"],
         ["Q4DA80", "PRCMA_TRYCR"],
         ["Q868H8", "PRCMB_TRYCR"],
         ["Q3SX04", "PRCM_BOVIN"],
         ["Q96EM0", "PRCM_HUMAN"],
         ["Q9CXA2", "PRCM_MOUSE"],
         ["Q5RC28", "PRCM_PONAB"],
         ["Q66II5", "PRCM_XENTR"],
     )
     self.assertEqual(len(record["DR"]), 9)
     for position, cross_ref in enumerate(cross_refs):
         self.assertEqual(record["DR"][position], cross_ref)

     expected_prefix = "ID: 5.1.1.4\nDE: Proline racemase.\n"
     self.assertTrue(
         str(record).startswith(expected_prefix),
         "Did not expect:\n%s" % record)
Пример #22
0
def get_EC_RXNs():
    """Build the EC dictionary and dump it to EBI_EC_ontologyDictionary.json.

    Reads the module-level ``args`` (``args.test``) and ``timestamp``.  When
    ``args.test`` is False, ``enzclass.txt`` is fetched with ``curl`` and
    ``enzyme.dat`` is downloaded into the working directory; otherwise local
    copies of both files are used.

    The dumped dictionary holds the metadata keys (``data_version``,
    ``date``, ``format_version``, ``ontology``) and, at the same top level,
    one entry per ENZYME record keyed by its ``ID`` (``id``, ``name``,
    ``synonyms``).  Returns None.
    """
    # version
    if args.test == False:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzclass.txt'
        ebi_ec_release = os.popen(ebi_ec_call).read()
    else:
        with open('enzclass.txt', 'r') as myfile:
            ebi_ec_release = myfile.read()

    # The release identifier is taken as the second whitespace-separated
    # token on the eighth line of enzclass.txt.
    # NOTE(review): this raises IndexError if the download failed or the
    # header layout changes -- confirm this is the desired failure mode.
    ebi_ec_release = ebi_ec_release.split("\n")[7].split()[1]

    ec_dict = {
        'data_version': ebi_ec_release,
        'date': timestamp,
        'format_version': 'N/A',
        'ontology': 'ec_orthology'
    }

    # parser
    ebi_ec_enzyme = 'enzyme.dat'

    if args.test == False:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzyme.dat > enzyme.dat'
        os.system(ebi_ec_call)

    # Keep the file open only while the records are being consumed; the
    # original left the handle to be closed by the garbage collector.
    with open(ebi_ec_enzyme) as enzyme_handle:
        for record in Enzyme.parse(enzyme_handle):
            ec_dict[record['ID']] = {
                'id': record['ID'],
                'name': record['DE'],
                'synonyms': record['AN']
            }

    with open('EBI_EC_ontologyDictionary.json', 'w') as outfile:
        json.dump(ec_dict, outfile, indent=2)
def getEC():
    """Print the list of ``ID`` fields of all records in ``enzyme.dat``.

    The file is read from the current working directory and is closed as
    soon as every record has been consumed.
    """
    with open("enzyme.dat") as handle:
        ecnumbers = [record["ID"] for record in Enzyme.parse(handle)]
    print(ecnumbers)
Пример #24
0
def get_knndataset(cdhit, output_dir, database):
    """Build cd-hit-filtered FASTA sets for EC main classes and subclasses.

    Steps, in order:

    1. Read ``database/enzyme/enzyme.dat`` (fetched with ``wget`` when
       absent) and write ``database/enzyme/enzyme.tsv`` with index ``EC`` and
       columns ``sequenceID`` ('|'-joined first elements of the DR entries),
       ``description`` and ``transferred``.
    2. Read ``database/uniprot/sp.tab`` (fetched with ``wget`` when absent)
       to obtain, per sequence id, the values of columns 3 (name) and 6
       (length).
    3. For the main classes "1".."6" and then for every two-level subclass:
       drop sequences whose name contains "Fragment" or whose length is
       below 50, drop sequences that occur in two or more groups, and start
       one process per group that extracts the sequences with
       ``blastdbcmd`` and clusters them with cd-hit.  Subclasses with fewer
       than 10 remaining sequences are skipped.

    Fixes over the original: ``sp.tab`` is opened before being handed to
    ``csv.reader`` (the path string itself was being iterated), sequences
    shared between groups are removed from *every* group as the step-2
    comments require (previously the result depended on iteration order),
    the "missing database" messages are printed as plain text, and files
    are closed.

    :param cdhit: cd-hit command, concatenated into a shell command line
    :param output_dir: directory receiving the ``.faa``, ``.cdhit.faa`` and
        ``.out`` files
    :param database: value passed to ``blastdbcmd -db``
    :return: None; the started processes are not joined
    """
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    if not os.path.exists(input_name):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases",
                                   "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme,
                                shell=True)
    if not os.path.exists(input_name):
        print("Missing enzyme database!")
        # NOTE(review): exit status 0 is kept from the original although
        # this is an error path -- confirm callers do not rely on it.
        exit(0)

    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # EC number -> list of EC numbers it points to
    with open(input_name) as enzyme_handle:
        for record in Enzyme.parse(enzyme_handle):
            description = record['DE']
            if 'Transferred entry:' in description:
                # Reduce "Transferred entry: a, b and c." to ["a", "b", "c"].
                targets = description.rstrip('.')
                for noise in ('Transferred entry:', ',', 'and'):
                    targets = targets.replace(noise, ' ')
                transferred[record['ID']] = targets.split()
            else:
                out[record['ID']] = dict()
                out[record['ID']]['sequenceID'] = '|'.join(
                    [x[0] for x in record['DR']])
                out[record['ID']]['description'] = description
                out[record['ID']]['transferred'] = False
    for ec_id in transferred:
        out[ec_id] = dict()
        # NOTE(review): raises KeyError when a target EC number is absent
        # from the file or is a transferred entry not processed yet.
        out[ec_id]['sequenceID'] = '|'.join(
            [out[x]['sequenceID'] for x in transferred[ec_id]])
        out[ec_id]['description'] = 'Transferred entry: ' + ' '.join(
            transferred[ec_id])
        out[ec_id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data in a enzyme.tsv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no sequenceID ids associated
    df.dropna(subset=['sequenceID'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]

    # The data frame is converted to a python dictionary
    mydic = df.to_dict()

    sp_tab = os.path.join("database", "uniprot", "sp.tab")
    if not os.path.exists(sp_tab):
        url = os.path.join("http://www.uniprot.org", "uniprot",
                           "?query=reviewed:yes&format=tab")
        subprocess.check_output("wget -cq -P database/uniprot '" + url + "'",
                                shell=True)
        subprocess.check_output(
            "mv database/uniprot/*=tab database/uniprot/sp.tab", shell=True)
    if not os.path.exists(sp_tab):
        print("Missing uniprot database!")
        exit(0)

    # sequence id -> {'name': column 3, 'length': column 6}
    dic_sp = dict()
    with open(sp_tab) as tab_handle:
        for row in csv.reader(tab_handle, delimiter='\t'):
            # skip blank lines and the header row
            if not row or row[0] == "Entry":
                continue
            dic_sp[row[0]] = {'name': row[3], 'length': row[6]}

    def valid_prot_ids(ec):
        # @nested function
        # Sequence ids of one EC number, without those annotated as
        # "Fragment" and without those shorter than 50 residues.
        candidates = [
            elt for elt in mydic["sequenceID"][ec].strip(" \n\t\r").split("|")
            if elt != ""
        ]
        return [
            seqID for seqID in candidates
            if "Fragment" not in dic_sp[seqID]['name']
            and int(dic_sp[seqID]['length']) >= 50
        ]

    def drop_shared(groups):
        # @nested function
        # Remove from every set in `groups` the ids found in two or more sets.
        seen = set()
        shared = set()
        for members in groups.values():
            shared.update(seen.intersection(members))
            seen.update(members)
        for key in groups:
            groups[key] = groups[key].difference(shared)

    # these following two functions are internal and allow to create processes to parallel the fasta
    # files downloading and their passage to the cd-hit program
    def run_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function
        # Download and construct the fasta file of one group
        fasta = os.path.join(output_dir, filename + ".faa")
        batch = os.path.join(output_dir, filename + ".ids.list")
        with open(batch, 'w') as batch_file:
            for seqID in list_seqs:
                batch_file.write("%s\n" % seqID)
        print(commands.getoutput("blastdbcmd -db " + database +
                                 " -entry_batch " + batch + " > " + fasta))
        os.remove(batch)
        # run cdhit program
        cdhitout = os.path.join(output_dir, filename + ".cdhit.faa")
        cdhitverbose = os.path.join(output_dir, filename + ".out")
        print(commands.getoutput(
            cdhit + " -i " + fasta + " -d 0 -o " + cdhitout +
            " -c 0.4 -n 2  -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
            + cdhitverbose))

    def create_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function: start run_process in a separate process
        p = Process(target=run_process,
                    args=(
                        list_seqs,
                        filename,
                        output_dir,
                        database,
                        cdhit,
                    ))
        p.start()
        return p

    #============================================================================================
    # Selection rules for the Main functional classes
    #============================================================================================
    # step 1
    # those enzymes whose sequences were annotated with "fragment" were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_ecs = dict()
    for main_class in ("1", "2", "3", "4", "5", "6"):
        dic_ecs[main_class] = set()
    for ec in mydic["description"].keys():
        # EC numbers whose first character is not "1".."6" are ignored here.
        if ec[:1] in dic_ecs:
            dic_ecs[ec[:1]].update(valid_prot_ids(ec))

    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    drop_shared(dic_ecs)

    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    # making fasta files for the six main classes
    for ec in dic_ecs:
        create_process(dic_ecs[ec], str(ec), output_dir, database, cdhit)

    #============================================================================================
    # Selection rules for the subclasses: same screening procedures than the Main functional classes
    #============================================================================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_subclasses = dict()
    for ec in mydic["description"].keys():
        list_ec_digits = [x for x in ec.split(".") if x != "-"]
        if len(list_ec_digits) >= 2:
            ec_on_l2 = '.'.join(list_ec_digits[:2])
            dic_subclasses.setdefault(ec_on_l2, set()).update(
                valid_prot_ids(ec))

    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    drop_shared(dic_subclasses)
    # subclasses left with fewer than 10 sequences are not processed
    dic_subclasses = {
        k: v
        for k, v in dic_subclasses.items() if len(v) >= 10
    }

    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    for ec in dic_subclasses:
        # making fasta files for the subclasses: after retrieving associated fasta file and
        # reducing redundancy with cd-hit program
        create_process(dic_subclasses[ec], str(ec), output_dir, database,
                       cdhit)
Пример #25
0
    'The name of the strain in the input file. This will be used to name the output file. The default behaviour is to take the input filename minus the ".top" part.'
)

args = parser.parse_args()
inputFile = args.inputFile
enzymeDB = args.enzymeDB
outputDir = args.outputDir
sepGenes = args.sepGenes
sepDist = args.sepDist
minClusterSize = args.minClusterSize
strainName = args.strainName

# Map every ENZYME record's ID field to its DE field.
enzymeDB_dict = {}
with open(enzymeDB) as db:
    for record in Enzyme.parse(db):
        id_ec = record["ID"]
        de = record["DE"]
        enzymeDB_dict[id_ec] = de

# Only the last "/"-separated component of the input path is inspected.
fileName = inputFile.split("/")[-1]
if not (fileName.split(".")[-1] == "top"):
    sys.exit('ERROR! Wrong filetype! Input should be a ".top" file!')
if not strainName:
    # Default strain name: the filename minus its final extension, so
    # "my.strain.top" gives "my.strain" (the original cut at the first dot).
    strainName = fileName.rsplit(".", 1)[0]

# initialise dictionary to hold enzyme data for each contig
group_enzymes = defaultdict(list)
Пример #26
0
 def test_parse_one(self):
     """Parse a file holding one ENZYME entry and check it is the only record."""
     with open("Enzymes/lipoprotein.txt") as stream:
         parsed = [entry for entry in Enzyme.parse(stream)]
     self.assertEqual(len(parsed), 1)
     first = parsed[0]
     self.assertEqual(first["ID"], "3.1.1.34")
Пример #27
0
from Bio.ExPASy import Enzyme
with open("/home/koreanraichu/RuBisCO.txt") as handle:
    record = Enzyme.read(handle)
    # Print the fields of the record, one per line, in this order:
    #   ID - EC number
    #   DE - description
    #   AN - alternative names (synonyms) of the enzyme
    #   CA - the catalysed reaction, shown as an equation
    #   PR - PROSITE cross-references
    #   CC - comments describing the enzyme
    #   DR - Swiss-Prot cross-references (the output can be very long)
    for field in ('ID', 'DE', 'AN', "CA", "PR", "CC", 'DR'):
        print(record[field])
Пример #28
0
def do_oxyphen(proteome, output_filename, ec_classes_file):
    '''
    Map the proteins of one proteome to oxygen-utilizing EC classes.

    Steps visible in this function:
      1. Read and parse DATA/enzyme.dat file and write DATA/ec_uniprot.tsv
         (one row per EC number that is not a "Transferred entry").
      2. Keep the rows whose EC number starts with one of the prefixes
         listed in DATA/oxygen_ecclasses (DATA/ec_uniprot_oxidases.tsv).
      3. Write the unique UniProt ids of those rows to DATA/uniprot_ids.txt.
      4. Build a BLAST database from DATA/sprot_subset.fasta and run blastp
         with `proteome` as the query.
      5. Filter the hits by e-value, identity and coverage, attach the EC
         numbers of every hit and write the table to `output_filename`.
      6. Write the EC numbers found to `ec_classes_file` and append a
         summary line to GLOBAL_RESULTS.
      7. Start collecting attribute names from DATA/model_svm.

    proteome: path of the query file handed to blastp (-query).
    output_filename: path of the tab-separated hit/EC mapping to write.
    ec_classes_file: path of the file receiving the tab-joined EC numbers.
    '''
    input_name = "DATA/enzyme.dat"
    output_name = "DATA/ec_uniprot.tsv"

    ### program ###
    # NOTE(review): this handle is never closed in the visible code.
    handle = open(input_name)
    records = Enzyme.parse(handle)

    out = dict()  #dict of dicts, first key: EC number, second key: field
    transferred = dict()  #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Reduce "Transferred entry: a, b and c." to the list [a, b, c].
            record['DE'] = record['DE'].rstrip('.')  #remove period
            record['DE'] = record['DE'].replace('Transferred entry:',
                                                ' ')  #remove title
            record['DE'] = record['DE'].replace(',', ' ')  #remove commas
            record['DE'] = record['DE'].replace('and', ' ')  #remove and
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            # first element of every DR entry, joined with single spaces
            out[record['ID']]['uniprot'] = ' '.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False

    # Disabled: transferred entries are collected above but not added to `out`.
    # for id in transferred:
    #     out[id] = dict()
    #     out[id]['uniprot'] = ' '.join([out[x]['uniprot'] for x in transferred[id]])
    #     out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
    #     out[id]['transferred'] = True

    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    df.to_csv(output_name, sep='\t')
    '''
    Take a subset of ecs of interest
    '''

    # tuple, so that it can be passed directly to str.startswith below
    oxidases = tuple(open("DATA/oxygen_ecclasses", "r").read().splitlines())

    infile = open("DATA/ec_uniprot.tsv", "r").readlines()
    outfile = open("DATA/ec_uniprot_oxidases.tsv", "w")

    for line in infile:
        # keep the header row (it starts with the index name "EC") ...
        if line.startswith("EC"):
            outfile.write(line)
        # ... and every row whose EC number starts with a listed prefix
        elif line.startswith(oxidases):
            outfile.write(line)

    outfile.close()
    '''
    write a file with one uniprot ID per line, containing all of the
    uniprot IDs mentioned in uniprot column of the input file

    Ignore EC numbers that have been transferred
    '''

    # NOTE(review): `input` (and `id`, `eval` further down) shadow builtins.
    input = "DATA/ec_uniprot_oxidases.tsv"
    output = "DATA/uniprot_ids.txt"

    df = pd.read_table(input)
    df.dropna(subset=['uniprot'],
              inplace=True)  #ignore EC numbers with no uniprot ids associated

    #df = df[df.transferred == False] #ignore EC numbers that are obsolete due to transfer

    unique_uniprot = set(" ".join(df.uniprot.values).split(" "))

    with open(output, "w") as outfile:
        for id in unique_uniprot:
            outfile.write(id + "\n")
    # redundant: the with-block above has already closed the file
    outfile.close()
    '''
    Make blastdb out of the swissprot subset
    '''

    blast_path, num_threads, multinome_folder = read_config()

    # DATA/sprot_subset.fasta is not written anywhere in this function; it has
    # to exist before this call.
    os.system(
        "%s -in DATA/sprot_subset.fasta -dbtype prot -out DATA/sprot_subset -hash_index"
        % (os.path.join(blast_path, "makeblastdb")))
    '''
    Blast our pre-selected proteomes against the uniprot subset
    '''
    print "Performing Blast searches against oxygen-utilizing database..."
    # NOTE(review): `proteome` is interpolated into a shell command unquoted,
    # and the %d format expects `num_threads` to be a number.
    os.system(
        "%s -max_target_seqs 1 -outfmt '6 qseqid sseqid pident evalue qcovs' -query %s -db DATA/sprot_subset -out DATA/new_sequences_sprot_enzyme.tab -num_threads %d"
        % (os.path.join(blast_path, "blastp"), proteome, num_threads))
    '''
    Filter Blast output.
    '''
    evalue = 10e-3  # i.e. 0.01
    identity = 40.0
    coverage = 40.0

    print "Filtering Blast output: evalue", evalue, " identity", identity, " coverage", coverage
    hits_table_file_name = "DATA/new_sequences_sprot_enzyme.tab"
    hits_table_file_name_filtered_out = open(
        "DATA/new_sequences_sprot_enzyme_filtered.tab", "w")

    # NOTE(review): the header names six columns, but every kept line holds the
    # five fields unpacked in the loop below - confirm the "len" column.
    hits_table_file_name_filtered_out.write(
        "\t".join(["hit", "subject", "id", "len", "eval", "cov"]) + "\n")

    for line in open(hits_table_file_name, "r").read().splitlines():
        if line.startswith("#"): continue

        query, target, ident, eval, cover = line.split("\t")
        eval = float(eval)
        ident = float(ident)
        cover = float(cover)

        # keep the hit only when all three thresholds are satisfied
        if eval <= evalue and ident >= identity and cover >= coverage:
            hits_table_file_name_filtered_out.write(line + "\n")

    hits_table_file_name_filtered_out.close()

    hits_table_file_name_filtered = "DATA/new_sequences_sprot_enzyme_filtered.tab"
    enzyme_table_file_name = 'DATA/ec_uniprot_oxidases.tsv'

    hits = pd.read_csv(hits_table_file_name_filtered, sep="\t", header=0)
    enzyme = pd.read_csv(enzyme_table_file_name, sep="\t", header=0)

    hits.fillna('', inplace=True)  #replace empty values with blank spaces
    enzyme.fillna('', inplace=True)

    enzyme = enzyme[enzyme.transferred == False]  #drop transferred EC numbers

    # characters 3..8 of the subject name
    # (presumably names look like "sp|P12345|NAME" - TODO confirm)
    hits.subject = hits.subject.str[3:
                                    9]  #take just the uniprot ID from the name

    def get_ecs(uniprot):
        # Space-joined EC numbers of all rows whose uniprot column contains
        # `uniprot` (str.contains, i.e. not an exact id comparison).
        if uniprot == '':  #ignore invalid uniprot ids
            return ''
        else:
            return ' '.join(
                enzyme.EC[enzyme.uniprot.str.contains(uniprot)].values)

    hits['EC'] = hits.subject.apply(get_ecs)

    output_file_name = output_filename
    hits.to_csv(output_file_name, sep="\t", index=False)

    ### read final mapping output

    mapping_out = open(output_file_name, "r").read().splitlines()
    ecs_dict = {}  # EC number -> list of first-column values (the hit names)

    # skip the header row; the last column is the EC column added above
    for line in mapping_out[1:]:
        splitted = line.split("\t")
        ecs = splitted[-1]

        for ec in ecs.split():
            if ec not in ecs_dict:
                ecs_dict[ec] = []
            ecs_dict[ec].append(splitted[0])

    print "\n\n"
    print len(
        ecs_dict
    ), "oxygen-utilizing enzymes were found from classes", ecs_dict.keys()

    ec_out = open(ec_classes_file, "w")
    ec_out.write("\t".join(ecs_dict.keys()))

    ec_out.close()

    # GLOBAL_RESULTS is a writable object defined outside this function; one
    # line per proteome: basename, number of EC numbers, comma-joined list.
    GLOBAL_RESULTS.write(
        os.path.basename(proteome) + "\t" + str(len(ecs_dict)) + "\t" +
        ",".join(ecs_dict.keys()) + "\n")
    #print "Detailed mapping can be found in OUTPUT/oxygen_utilizing_annot.tsv file"
    #print "Executing SVM classifier..."

    infile = open("DATA/model_svm", "r").read().splitlines()

    classifier_input = []
    classes = []
    ec_classes = []

    for line in infile:

        # second token of every "@attribute" line except those mentioning
        # "class", with single quotes removed
        if line.startswith("@attribute") and "class" not in line:
            ec_classes.append(line.split()[1].replace("'", ""))
Пример #29
0
def get_knndataset(cdhit, output_dir, database): 
	'''
	Build per-subclass sequence sets from the ExPASy ENZYME database.

	Reads the ENZYME .dat file (downloading it with wget when it is missing) and
	writes a tab separated file database/enzyme/enzyme.tsv, indexed by EC number,
	with the associated uniprot ids separated by '|', the reaction description,
	and a flag telling whether the EC number has been transferred to other EC
	numbers.

	The uniprot ids of every EC number are then screened (no "Fragment" in the
	name, length of at least 50) using the table 'uniprot-reviewed%3Ayes.tab',
	grouped by the first two EC digits, and for every group with at least 10 ids
	an id list "<ec>.ids.list" and a fasta file "<ec>.faa" (via blastdbcmd) are
	written to output_dir.

	cdhit: only referenced by the commented-out cd-hit step at the end.
	output_dir: directory receiving the .ids.list and .faa files.
	database: value passed to "blastdbcmd -db".
	'''
	if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
		curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases","enzyme", "enzyme.dat")
		subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell = True)
	if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")): 
		# NOTE(review): the two values are passed as a pair, "%s\n" is never
		# applied as a format; and exit(0) reports success although the
		# database is missing.
		print ("%s\n", "Missing enzyme database!")
		exit(0)
	input_name = os.path.join("database", "enzyme", "enzyme.dat")
	output_name = os.path.join("database", "enzyme", "enzyme.tsv")
	# NOTE(review): the file opened here is never closed explicitly.
	records = Enzyme.parse(open(input_name))
	out = dict() # dict of dicts, first key: EC number, second key: field
	transferred = dict() #dict of lists
	for record in records:
		if 'Transferred entry:' in record['DE']:
			# reduce "Transferred entry: a, b and c." to the list [a, b, c]
			record['DE'] = record['DE'].rstrip('.')
			record['DE'] = record['DE'].replace('Transferred entry:',' ')
			record['DE'] = record['DE'].replace(',',' ')
			record['DE'] = record['DE'].replace('and',' ')
			point_to = record['DE'].split()
			transferred[record['ID']] = point_to
		else:
			out[record['ID']] = dict()
			# first element of every DR entry, joined with '|'
			out[record['ID']]['uniprot'] = '|'.join([x[0] for x in record['DR']])
			out[record['ID']]['description'] = record['DE']
			out[record['ID']]['transferred'] = False
	# A transferred EC number inherits the uniprot ids of the entries it points to.
	# NOTE(review): out[x] raises KeyError when a target x is not (yet) in `out`.
	for id in transferred:
		out[id] = dict()
		out[id]['uniprot'] = '|'.join([out[x]['uniprot'] for x in transferred[id]])
		out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
		out[id]['transferred'] = True
	df = pd.DataFrame.from_dict(out, orient = 'index')
	df.index.name = 'EC'
	
	# write all data in a enzyme.csv file
	df.to_csv(output_name, sep = '\t')
	
	# ignore EC numbers with no uniprot ids associated
	# NOTE(review): 'uniprot' always holds a string here (possibly empty), so
	# dropna presumably removes nothing - confirm.
	df.dropna(subset = ['uniprot'], inplace = True)
	
	# ignore EC numbers that are obsolete due to transfer 
	df = df[df.transferred == False]
	
	# construct a dictionary from dataframe: column name -> {EC number -> value}
	mydic = df.to_dict()
	
	# flat, de-duplicated list of all non-empty uniprot ids
	# (not used again in the visible code)
	enzyme_protIDS = [mydic["uniprot"][ec].split("|") for ec in mydic["uniprot"].keys()]
	enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
	enzyme_protIDS = [elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""]
	# one set of uniprot ids per main EC class, keyed by the class prefix
	dic_ecs = dict()
	dic_ecs["1."] = set()
	dic_ecs["2."] = set()
	dic_ecs["3."] = set()
	dic_ecs["4."] = set()
	dic_ecs["5."] = set()
	dic_ecs["6."] = set()

	# Load name (column 3) and length (column 6) of every entry of the UniProt
	# table, keyed by the entry id in column 0; the header row is skipped.
	# NOTE(review): csvfile is never closed.
	csvfile = open('uniprot-reviewed%3Ayes.tab', 'r')
	readCSV = csv.reader(csvfile, delimiter = '\t')
	non_valids_enzyme = set()
	dic_sp = dict()
	for row in readCSV: 
		if row[0] != "Entry":
			seqID = row[0]
			seqName = row[3]
			seqLength = row[6]
			dic_sp[seqID] = dict()
			dic_sp[seqID]['name'] = seqName
			dic_sp[seqID]['length'] = seqLength

	#===================================================o========================================
	# Selection rules for the Main functional classes							 
	#===================================================o========================================
	# step 1
	# those enzymes whose sequences were annotated with 'fragment' were excluded
	# those enzymes whose sequences had less than 50 amino acids were excluded
	# NOTE(review): dic_sp[seqID] raises KeyError for ids missing from the table.
	for ec in mydic["description"].keys():
		uniprot_iDs = mydic["uniprot"][ec]
		protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
		protIDs = [elt for elt in protIDs if elt != ""]
		frag_seqs = list()
		short_seqs = list()
		for seqID in protIDs:
			if "Fragment" in dic_sp[seqID]['name']:
				 frag_seqs.append(seqID)
			if int(dic_sp[seqID]['length']) < 50:
				short_seqs.append(seqID)
		protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
		if ec.startswith("1."):
			dic_ecs["1."].update(protIDs)
		elif ec.startswith("2."):
			dic_ecs["2."].update(protIDs)
		elif ec.startswith("3."):
			dic_ecs["3."].update(protIDs)
		elif ec.startswith("4."):
			dic_ecs["4."].update(protIDs)
		elif ec.startswith("5."):
			dic_ecs["5."].update(protIDs)
		elif ec.startswith("6."):
			dic_ecs["6."].update(protIDs)
		non_valids_enzyme.update(frag_seqs)
		non_valids_enzyme.update(short_seqs)
	
	# step 2
	# for the uniqueness, those enzymes that occur in two or more classes were excluded
	# NOTE(review): each intersection below is taken after the difference, so it
	# is always empty and adds nothing to non_valids_enzyme. Also, an id shared
	# by two classes is removed from the class processed first and therefore
	# stays in the class processed later.
	for ec in ["2.", "3.", "4.", "5.", "6."]:
		dic_ecs["1."] = dic_ecs["1."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["1."].intersection(dic_ecs[ec]))
	for ec in ["1.", "3.", "4.", "5.", "6."]:
		dic_ecs["2."] = dic_ecs["2."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["2."].intersection(dic_ecs[ec]))
	for ec in ["2.", "1.", "4.", "5.", "6."]:
		dic_ecs["3."] = dic_ecs["3."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["3."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "1.", "5.", "6."]:
		dic_ecs["4."] = dic_ecs["4."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["4."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "4.", "1.", "6."]:
		dic_ecs["5."] = dic_ecs["5."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["5."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "4.", "5.", "1."]:
		dic_ecs["6."] = dic_ecs["6."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["6."].intersection(dic_ecs[ec]))
	
	# step 3: 
	# to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
	# those sequences which have >=40% sequence identity to any other in a same functional class

	#
	# Download and construct the fasta file of the Main functional classes.
	# The three nested helpers below are only referenced by commented-out calls.
	def split_sequence(seq, l):
		# Return seq wrapped into lines of at most l characters, newline-terminated.
		new_seq = ""
		if len(seq) > l:
			new_seq = seq[:l]
			k = l
			while k + l < len(seq):
				new_seq+= "\n"+str(seq[k:k+l])
				k+= l
			new_seq+= "\n" + str(seq[k:])
			return new_seq + "\n"
		else: return seq + "\n"
	def run_process(list_seqs, filename):
		# @nested function
		# Fetch every id of list_seqs from the EBI proteins API and append the
		# returned sequences to `filename` in fasta layout (60 characters per line).
		session = requests.Session()
		outfile = open(filename, "a")
		for seqID in list_seqs:
			#handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
			#record = SeqIO.read(handle, "swiss")
			#SeqIO.write(record, outfile, "fasta")
			req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID)
			res = session.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
			"content-type":"text"})
			# parse the returned XML
			uniprot = ET.fromstring(res.text)
			for isoform in uniprot.getchildren():
				# get the sequence
				iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
				# get the accession number
				iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
				outfile.write(">"+str(iso_accession.text)+"\n")
				outfile.write(split_sequence(str(iso_sequence.text), 60))		   		
		outfile.close()
	def create_process(list_seqs, filename):
		# @nested function:
		# Start run_process in a separate Process and return the started process.
		p = Process(target = run_process, args = (list_seqs, filename,))
		p.start()
		return p
	
	#ec1 = create_process(dic_ecs["1."], "knnDataset/ec_1.*.faa")
	#ec2 = create_process(dic_ecs["2."], "knnDataset/ec_2.*.faa")
	#ec3 = create_process(dic_ecs["3."], "knnDataset/ec_3.*.faa")
	#ec4 = create_process(dic_ecs["4."], "knnDataset/ec_4.*.faa")
	#ec5 = create_process(dic_ecs["5."], "knnDataset/ec_5.*.faa")
	#ec6 = create_process(dic_ecs["6."], "knnDataset/ec_6.*.faa")
	
	
	#===================================================o===========================================
	# Selection rules for the subclasses: same screening procedures	than the Main functional classes 
	#===================================================o===========================================
	# step 1
	# those enzymes whose sequences were annotated with 'fragment' were excluded
	# those enzymes whose sequences had less than 50 amino acids were excluded
	dic_subclasses = dict()
	for ec in mydic["description"].keys():
		uniprot_iDs = mydic["uniprot"][ec]
		protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
		protIDs = [elt for elt in protIDs if elt != ""]
		frag_seqs = list()
		short_seqs = list()
		for seqID in protIDs:
			if "Fragment" in dic_sp[seqID]['name']:
				 frag_seqs.append(seqID)
			if int(dic_sp[seqID]['length']) < 50:
				short_seqs.append(seqID)
		protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
		# group by the first two EC digits; "-" placeholders do not count as digits
		list_ec_digits = [x for x in ec.split(".") if x != "-"]
		if len(list_ec_digits) >= 2:
			ec_on_l2 = '.'.join(list_ec_digits[:2])
			if ec_on_l2 in dic_subclasses: dic_subclasses[ec_on_l2].update(protIDs)
			else: dic_subclasses[ec_on_l2] = set(protIDs)
	
	# step 2
	# for the uniqueness, those enzymes that occur in two or more classes were excluded
	# NOTE(review): as for the main classes, an id shared by two subclasses is
	# removed from the one processed first and so remains in the later one.
	for ec1 in dic_subclasses.keys():
		for ec2 in dic_subclasses.keys():
			if ec1 != ec2: dic_subclasses[ec1] = dic_subclasses[ec1].difference(dic_subclasses[ec2])
	#print(len(dic_subclasses))
	# drop the subclasses left with fewer than 10 ids
	excluded_ecs = list()
	for ec in dic_subclasses:
		if len(dic_subclasses[ec]) < 10: excluded_ecs.append(ec)
	dic_subclasses = {k: v for k, v in dic_subclasses.items() if k not in excluded_ecs}
	
	# making fasta files
	# (an earlier variant, now disabled, started one create_process per subclass
	# and polled is_alive() every 60 seconds)
	# write one id per line to "<ec>.ids.list" for every remaining subclass
	for ec in dic_subclasses:
		file = open(os.path.join(output_dir, str(ec)+".ids.list"), 'w')
		for seqID in dic_subclasses[ec]: file.write("%s\n" % seqID)
	  	file.close()
	# extract the listed entries from the BLAST database into "<ec>.faa"
	for ec in dic_subclasses:
		batch = os.path.join(output_dir, str(ec)+".ids.list")
		fasta  = os.path.join(output_dir, str(ec)+".faa")
		print commands.getoutput("blastdbcmd -db "+ database +" -entry_batch "+ batch +" > "+ fasta)
		# (disabled alternatives that were tried here: ExPASy.get_sprot_raw with
		# SeqIO, fetching the EBI proteins API XML with wget and parsing it, and
		# create_process with a 60 second polling loop)
	
	# step 3: 
	# to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
	# those sequences which have >=40% sequence identity to any other in a same functional class
	# (currently disabled)
#	for ec in dic_subclasses:
#		print commands.getoutput(cdhit +" -i "+os.path.join(output_dir, str(ec)+".faa")
#			+" -d 0 -o "+ os.path.join(output_dir, str(ec) +".cdhit.faa")
#			+" -c 0.4 -n 2  -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
#			+ os.path.join(output_dir, str(ec) +".out"))

	print "\tFINISHED"
def get_expasy_enzyme():
    """Download the ExPASy ENZYME database and annotate reaction compounds.

    The flat file is fetched from the ExPASy FTP server and parsed with
    ``bee.parse``.  For every record the catalytic activity (``CA``) is split
    on '.' into reactions, each reaction on '=' into a substrate side and a
    product side, and each side on '+' (but not '(+)') into compound names.
    Every compound name is stripped, passed through ``replace_strings`` and
    looked up with ``link_compound2chebi``.

    Side effects: prints a running record count (and any error raised while
    processing a reaction), writes the list of truthy lookup results to
    ``chebi_list.txt`` and the list of enzyme records to
    ``annotations_out.txt`` in the current directory.

    Returns:
        list of dict, one per ENZYME record, with the keys ``ECNumber``,
        ``Reaction(s)`` (list of str), ``Substrates`` and ``Products``
        (dicts mapping a compound name to the ``link_compound2chebi`` result,
        presumably a ChEBI identifier).
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    print("Retrieving enzyme records from Expasy Enzyme")
    enzyme = urllib.request.urlretrieve(url)

    # split each side of reaction on '+' not '(+)'; compiled once for all records
    side_splitter = re.compile(r'(?:[^\+(]|\([^)]*\))+')

    enz_records = []
    chebi_list = []

    def _link_compounds(side, linked):
        """Store the lookup result of every compound on one reaction side."""
        for compound in side_splitter.findall(side):
            compound = replace_strings(compound.strip())
            chebi = link_compound2chebi(compound)
            linked[compound] = chebi
            if chebi:
                chebi_list.append(chebi)

    # The with-block guarantees the downloaded file is closed after parsing.
    with open(enzyme[0], 'r') as handle:
        for count, record in enumerate(bee.parse(handle), start=1):
            print(count)
            enz_rec = {
                'ECNumber': record['ID'],
                'Reaction(s)': [],
                'Substrates': {},
                'Products': {},
            }

            # split to separate multiple reactions
            reactions = record['CA'].split('.')

            for rxn in reactions:
                # Deliberately best-effort: a failure on one reaction is
                # printed and the remaining reactions are still processed.
                try:
                    if len(reactions) > 2:
                        # NOTE(review): drops the first three characters,
                        # presumably a "(n)" reaction label - confirm this
                        # also holds for pieces that start with a space.
                        rxn = rxn[3:]
                    enz_rec['Reaction(s)'].append(rxn)
                    # split reactions into [substrates, products]
                    constituents = rxn.split('=')
                    _link_compounds(constituents[0], enz_rec['Substrates'])
                    _link_compounds(constituents[-1], enz_rec['Products'])
                except Exception as e:
                    print(e)
                    continue

            enz_records.append(enz_rec)

    # Write both result files and close them deterministically.
    with open('chebi_list.txt', 'w') as chebiout:
        print(chebi_list, file=chebiout)
    with open('annotations_out.txt', 'w') as annotations:
        print(enz_records, file=annotations)
    return enz_records
Пример #31
0
# Reads an Expasy Enzyme .dat file and builds a data frame where the first column is
# the EC number, the second column is the reaction description, the third column is the
# associated sequenceID ids separated by '|', and the fourth column indicates whether the
# reactions described by this EC have been transferred to other EC numbers.
input_name = os.path.join("database", "enzyme", "enzyme.dat")
output_name = os.path.join("database", "enzyme", "enzyme.tsv")
if not os.path.exists(input_name):
    # The URL is a plain literal because os.path.join would build it with the
    # platform path separator. An argument list avoids going through a shell.
    curl_enzyme = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    subprocess.check_output(
        ["wget", "-cq", "-P", "database/enzyme", curl_enzyme])
if not os.path.exists(input_name):
    # Report the problem on stderr and leave with a non-zero exit status.
    raise SystemExit("Missing enzyme database!")
out = dict()  # dict of dicts, first key: EC number, second key: field
transferred = dict()  # dict of lists: EC number -> EC numbers it points to
# The with-block closes the database file once every record has been read.
with open(input_name) as handle:
    records = Enzyme.parse(handle)
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Reduce "Transferred entry: a, b and c." to the list [a, b, c].
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            # first element of every DR entry, joined with '|'
            out[record['ID']]['sequenceID'] = '|'.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
Пример #32
0
 def test_parse_one(self):
     """Parse a file holding one ENZYME entry and check it is the only record."""
     with open("Enzymes/lipoprotein.txt") as stream:
         parsed = [entry for entry in Enzyme.parse(stream)]
     self.assertEqual(len(parsed), 1)
     first = parsed[0]
     self.assertEqual(first["ID"], "3.1.1.34")
Пример #33
0
 def test_parse_zero(self):
     """Check parse function yields no records for empty input."""
     empty_stream = StringIO("")
     parsed = [entry for entry in Enzyme.parse(empty_stream)]
     self.assertEqual(len(parsed), 0)
Пример #34
0
        if line.startswith("NUM_THREADS"):
            num_threads = float(line.split("=")[1])

    return input_file, blast_path, num_threads


print(read_config())

# Read and parse enzyme.dat file
input_name = "DATA/enzyme.dat"
output_name = 'DATA/ec_uniprot.tsv'

### program ###
handle = open(input_name)
records = Enzyme.parse(handle)

# out: dict of dicts, first key: EC number, second key: field
# transferred: dict of lists, EC number -> EC numbers named in its DE line
out, transferred = dict(), dict()
for record in records:
    if 'Transferred entry:' in record['DE']:
        cleaned = record['DE'].rstrip('.')  # remove period
        # blank out the title, the commas and the word "and", in that order
        for noise in ('Transferred entry:', ',', 'and'):
            cleaned = cleaned.replace(noise, ' ')
        record['DE'] = cleaned
        point_to = cleaned.split()
        transferred[record['ID']] = point_to
    else:
        entry = out[record['ID']] = dict()
        # first element of every DR entry, joined with single spaces
        entry['uniprot'] = ' '.join(x[0] for x in record['DR'])
Пример #35
0
 def test_parse_zero(self):
     """Check parse function yields no records for empty input."""
     empty_stream = StringIO("")
     parsed = [entry for entry in Enzyme.parse(empty_stream)]
     self.assertEqual(len(parsed), 0)
Пример #36
0
def get_enzyme_ecs(level):
    '''
    Return the distinct EC numbers of the ENZYME database cut to `level` digits.

    Reads the Expasy Enzyme .dat file (downloading it with wget when it is
    missing) and writes a tab separated file database/enzyme/enzyme.tsv,
    indexed by EC number, holding the associated uniprot ids separated by '|',
    the reaction description, and a flag telling whether the reactions
    described by this EC have been transferred to other EC numbers.

    level: number of leading EC digits to keep (anything accepted by int()).
        EC numbers whose kept digits contain a "-" placeholder are left out.

    Returns a list of unique EC strings, in no particular order; transferred
    EC numbers are not included.

    Raises SystemExit when the database is missing and could not be downloaded.
    '''
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    if not os.path.exists(input_name):
        # The URL is a plain literal because os.path.join would build it with
        # the platform path separator. An argument list avoids a shell.
        curl_enzyme = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
        subprocess.check_output(
            ["wget", "-cq", "-P", "database/enzyme", curl_enzyme])
    if not os.path.exists(input_name):
        # Report the problem on stderr and leave with a non-zero exit status.
        raise SystemExit("Missing enzyme database!")

    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists: EC number -> EC numbers it points to
    # The with-block closes the database file once every record has been read.
    with open(input_name) as handle:
        for record in Enzyme.parse(handle):
            if 'Transferred entry:' in record['DE']:
                # Reduce "Transferred entry: a, b and c." to [a, b, c].
                targets = record['DE'].rstrip('.')
                targets = targets.replace('Transferred entry:', ' ')
                targets = targets.replace(',', ' ')
                targets = targets.replace('and', ' ')
                transferred[record['ID']] = targets.split()
            else:
                out[record['ID']] = {
                    # first element of every DR entry, joined with '|'
                    'uniprot': '|'.join([x[0] for x in record['DR']]),
                    'description': record['DE'],
                    'transferred': False,
                }
    # A transferred EC number inherits the uniprot ids of its targets.
    # NOTE(review): out[x] raises KeyError when a target is not in `out`.
    for old_ec in transferred:
        out[old_ec] = {
            'uniprot': '|'.join(
                [out[x]['uniprot'] for x in transferred[old_ec]]),
            'description': 'Transferred entry: ' + ' '.join(
                transferred[old_ec]),
            'transferred': True,
        }
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'

    # write all data in a enzyme.csv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no uniprot ids associated
    # NOTE(review): 'uniprot' always holds a string here (possibly empty), so
    # dropna presumably removes nothing - confirm.
    df.dropna(subset=['uniprot'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]

    level = int(level)
    selected = set()
    for ec in set(df.index.values):
        digits = ec.split(".")
        if level != 4:
            # drop the trailing (4 - level) components
            digits = digits[:level - 4]
        # keep the EC only when exactly `level` real digits remain
        if sum(1 for digit in digits if digit != "-") == level:
            selected.add('.'.join(digits))
    return list(selected)
Пример #37
0
def load_enzyme_nomenclature_table():
    '''

    download all SIB enzyme nomenclature from FTP (ftp://ftp.expasy.org/databases/enzyme/)
    create the enzyme.enzymes table with the list of all EC with associated description
    create the enzyme.enzymes_dat with detailed information about each EC
    todo: remove existing tables for uptade if rerun

    :return: nothing
    '''

    from Bio.ExPASy import Enzyme
    import MySQLdb
    import urllib.request
    import os
    from io import StringIO
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="enzyme")  # name of the data base
    cursor = conn.cursor()
    conn.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    '''
    ID  Identification                         (Begins each entry; 1 per entry)
    DE  Description (official name)            (>=1 per entry)
    AN  Alternate name(s)                      (>=0 per entry)
    CA  Catalytic activity                     (>=1 per entry)
    CF  Cofactor(s)                            (>=0 per entry)
    CC  Comments                               (>=0 per entry)
    PR  Cross-references to PROSITE            (>=0 per entry)
    DR  Cross-references to Swiss-Prot         (>=0 per entry)
    '''

    enzyme_file = 'ftp://ftp.expasy.org/databases/enzyme/enzyme.dat'

    data = urllib.request.urlopen(enzyme_file).read().decode('utf-8')

    sql1 = 'CREATE TABLE IF NOT EXISTS enzymes (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \
          ' ec VARCHAR(200));'

    sql2 = 'CREATE TABLE IF NOT EXISTS enzymes_dat (enzyme_dat_id INT,' \
          ' line VARCHAR(20),' \
          ' value LONG,' \
           ' CONSTRAINT fk_enzyme_id' \
           ' FOREIGN KEY(enzyme_dat_id) REFERENCES enzymes(enzyme_id)' \
           ' ON DELETE CASCADE);'

    print('create enzyme table')
    print(sql1)
    cursor.execute(sql1, )
    print('create dat table')
    print(sql2)
    cursor.execute(sql2)

    for n, data in enumerate(Enzyme.parse(StringIO(data))):
        enzyme = data['ID']
        # insert enzyme id into primary TABLE
        sql = 'INSERT into enzymes (ec) values ("%s");' % enzyme

        print(n, sql)

        cursor.execute(sql, )
        conn.commit()

        sql = 'SELECT LAST_INSERT_ID();'

        cursor.execute(sql, )
        id = cursor.fetchall()[0][0]

        # description
        sql = 'INSERT into enzymes_dat (enzyme_dat_id, line, value) values (%s, "description", "%s");' % (
            id, data['DE'])
        cursor.execute(sql, )
        # alternative names
        for i in data['AN']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "alternative name", "%s");' % (
                id, i)
            cursor.execute(sql, )

        # Catalytic activity
        sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s,"catalytic activity", "%s");' % (
            id, data['CA'])
        cursor.execute(sql, )

        # Cofactors
        sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "cofactors", "%s");' % (
            id, data['CF'])
        cursor.execute(sql, )

        # prosite crossref
        for i in data['PR']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "prosite", "%s");' % (
                id, i)
            cursor.execute(sql, )
        # comments
        for i in data['CC']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "comment", "%s");' % (
                id, i)
            cursor.execute(sql, )

        conn.commit()