Пример #1
0
class Peptides(object):
    """Locate peptides inside protein sequences retrieved from UniProt.

    Sequences are fetched through a ``UniProt`` client and kept in
    :attr:`sequences` (a dict keyed by the name passed in), so repeated
    queries against the same protein reuse the first download.

    ::

        >>> p = Peptides()
        >>> p.get_fasta_sequence("Q8IYB3")
        >>> p.get_phosphosite_position("Q8IYB3", "VPKPEPIPEPKEPSPE")

    Sometimes, peptides are provided with a pattern indicating the phospho
    site. The peptide is handed to :func:`re.finditer` unchanged, so it may
    be a plain sequence or a regular expression.
    """
    def __init__(self, verbose=False):
        """Create the UniProt client and an empty sequence cache.

        :param verbose: forwarded to the ``UniProt`` constructor.
        """
        self.u = UniProt(verbose=verbose)
        # Cache: name passed to get_phosphosite_position -> sequence.
        self.sequences = {}

    def get_fasta_sequence(self, uniprot_name):
        """Return whatever ``UniProt.get_fasta_sequence`` yields for the name.

        :param uniprot_name: identifier forwarded to the UniProt client.
        """
        return self.u.get_fasta_sequence(uniprot_name)

    def get_phosphosite_position(self, uniprot_name, peptide):
        """Return the start offsets of every match of *peptide* in a protein.

        :param uniprot_name: identifier of the protein; its sequence is
            downloaded on first use and cached in :attr:`sequences`.
        :param peptide: peptide sequence or regular-expression pattern to
            search for.
        :return: list of 0-based start offsets of the non-overlapping
            matches (empty when the peptide does not occur).
        """
        seq = self.sequences.get(uniprot_name)
        if seq is None:
            seq = self.get_fasta_sequence(uniprot_name)
            self.sequences[uniprot_name] = seq
        # Search for the requested peptide; the previous implementation
        # ignored the argument and always looked for the literal "PQS".
        return [match.start() for match in re.finditer(peptide, seq)]
Пример #2
0
def add_sequence_to_nodes(n: str, d: Dict[str, Any]):
    """
    Looks up the UniProt IDs for the node's ``protein_id`` through HGNC (queried as a
    gene symbol), then retrieves each sequence from UniProt and stores it on the node.

    The attribute dictionary is modified in place: ``uniprot_ids`` receives the IDs
    listed in the first HGNC hit, and one ``sequence_<uniprot id>`` entry is added per ID.

    :param n: Graph node.
    :type n: str
    :param d: Graph attribute dictionary. Must contain ``protein_id``.
    :type d: Dict[str, Any]
    """
    hgnc_client = HGNC(verbose=False)
    uniprot_client = UniProt(verbose=False)

    # Only the first document of the HGNC response is considered.
    hgnc_reply = hgnc_client.fetch("symbol", d["protein_id"])
    first_hit = hgnc_reply["response"]["docs"][0]
    d["uniprot_ids"] = first_hit["uniprot_ids"]

    # Todo these API calls should probably be batched
    # Todo mapping with bioservices to support other protein IDs?

    for accession in d["uniprot_ids"]:
        sequence = uniprot_client.get_fasta_sequence(accession)
        d[f"sequence_{accession}"] = sequence
Пример #3
0
def make_files(out_dir=os.path.dirname(os.path.realpath(__file__))):
    """Build the JASPAR / UniProt / Cis-BP data files inside *out_dir*.

    Every output is skipped when it already exists, so the function can be
    re-run to resume after a failure. For each taxon in the module-level
    ``taxons`` it creates:

    * ``<taxon>.profiles.json`` -- CORE-collection JASPAR profiles
      (matrix id -> name);
    * ``<taxon>.uniprot.json`` -- UniProt accession -> ``[profiles, sequence]``;
    * ``<taxon>.fa`` plus a BLAST+ protein database built with ``makeblastdb``.

    It then creates ``cisbp.json`` (from the Cis-BP 1.02 SQL dumps) and
    finally ``domains.json`` and ``jaspar.json``, which join the UniProt
    sequences with the Cis-BP entries sharing the same sequence key.

    Relies on the module-level names ``taxons`` and ``functions.write`` and
    on the external programs ``curl``, ``unzip`` and ``makeblastdb``.

    :param out_dir: directory receiving the files; created when missing.
        Defaults to the directory containing this module.
    :raises ValueError: when JASPAR profiles or UniProt sequences cannot be
        fetched, or when the BLAST+ database cannot be created. The
        underlying error is chained as ``__cause__``.
    """
    codec = coreapi.codecs.CoreJSONCodec()
    uniprot = UniProt(verbose=False, cache=False)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for taxon in taxons:
        # JASPAR profiles of this taxon.
        profiles_json_file = os.path.join(
            out_dir, "%s.profiles.json" % taxon)
        if not os.path.exists(profiles_json_file):
            # "except Exception" (not a bare except) so that Ctrl-C and
            # SystemExit are not rewritten into a ValueError.
            try:
                profiles = _fetch_core_profiles(codec, taxon)
                functions.write(profiles_json_file, _dump_json(profiles))
            except Exception as err:
                raise ValueError(
                    "Could not fetch %s profiles from JASPAR" % taxon) from err

        # UniProt accessions and sequences behind those profiles.
        uniprot_json_file = os.path.join(out_dir, "%s.uniprot.json" % taxon)
        if not os.path.exists(uniprot_json_file):
            try:
                uniaccs = _fetch_uniprot_sequences(
                    codec, uniprot, profiles_json_file)
                functions.write(uniprot_json_file, _dump_json(uniaccs))
            except Exception as err:
                raise ValueError(
                    "Could not fetch %s sequences from UniProt" % taxon) from err

        # FASTA file and BLAST+ database.
        fasta_file = os.path.join(out_dir, "%s.fa" % taxon)
        if not os.path.exists(fasta_file):
            _write_fasta_and_blast_db(fasta_file, uniprot_json_file)

    # Cis-BP families and domains, keyed by sequence.
    cisbp_json_file = os.path.join(out_dir, "cisbp.json")
    if not os.path.exists(cisbp_json_file):
        _build_cisbp_json(out_dir, cisbp_json_file)

    # Both files are always regenerated together.
    domains_json_file = os.path.join(out_dir, "domains.json")
    jaspar_json_file = os.path.join(out_dir, "jaspar.json")
    if not os.path.exists(domains_json_file) or not os.path.exists(jaspar_json_file):
        domains = {}
        jaspar = {}
        with open(cisbp_json_file) as f:
            cisbp = json.load(f)
        # Really delete the stale file when only one of the two exists.
        # The original statements here were no-ops, so the survivor was
        # never removed before being written again.
        # NOTE(review): functions.write presumably appends (the FASTA step
        # calls it once per record) -- confirm.
        for stale_file in (domains_json_file, jaspar_json_file):
            if os.path.exists(stale_file):
                os.remove(stale_file)
        for taxon in taxons:
            profiles_json_file = os.path.join(
                out_dir, "%s.profiles.json" % taxon)
            with open(profiles_json_file) as f:
                profiles = json.load(f)
            uniprot_json_file = os.path.join(
                out_dir, "%s.uniprot.json" % taxon)
            with open(uniprot_json_file) as f:
                uniaccs = json.load(f)
            for uniacc in sorted(uniaccs):
                matrix_ids, sequence = uniaccs[uniacc][0], uniaccs[uniacc][1]
                # Skip accessions for which no sequence was stored.
                if sequence is None:
                    continue
                key = _sequence_key(sequence)
                if key not in cisbp:
                    continue
                # Stored as [domains, family]; first occurrence wins.
                domains.setdefault(uniacc, [cisbp[key][1], cisbp[key][0]])
                for profile in matrix_ids:
                    jaspar.setdefault(uniacc, [])
                    jaspar[uniacc].append([profile, profiles[profile]])
        functions.write(domains_json_file, _dump_json(domains))
        functions.write(jaspar_json_file, _dump_json(jaspar))


# Matrix ids whose "uniprot_ids" from the JASPAR API are replaced by a
# hand-curated accession ("bugged cases" in the original code).
_JASPAR_UNIPROT_FIXES = {
    "MA0328.1": ("P0CY08",),
    "MA0110.1": ("P46667",),
    "MA0058.1": ("P61244",),
    "MA0046.1": ("P20823",),
    "MA0098.1": ("P14921",),
    "MA0052.1": ("Q02078",),
    "MA0024.1": ("Q01094",),
    "MA0138.1": ("Q13127",),
}


def _dump_json(obj):
    """Serialize *obj* with the layout shared by every JSON file written here."""
    return json.dumps(obj, sort_keys=True, indent=4, separators=(",", ": "))


def _sequence_key(sequence):
    """Return the MD5 hex digest of *sequence* plus its first and last 4 characters."""
    digest = hashlib.md5(sequence.encode("utf-8")).hexdigest()
    return digest + sequence[:4] + sequence[-4:]


def _fetch_core_profiles(codec, taxon):
    """Return ``{matrix_id: name}`` for the CORE-collection profiles of *taxon*, following every result page."""
    profiles = {}
    client = coreapi.Client()
    url = "http://jaspar.genereg.net/api/v1/taxon/%s/" % taxon
    # Each page carries the URL of the next one; None marks the last page.
    while url is not None:
        json_obj = json.loads(codec.encode(client.get(url)))
        for profile in json_obj["results"]:
            if profile["collection"] == "CORE":
                profiles.setdefault(profile["matrix_id"], profile["name"])
        url = json_obj["next"]
    return profiles


def _fetch_uniprot_sequences(codec, uniprot, profiles_json_file):
    """Return ``{accession: [matrix_ids, sequence]}`` for the profiles stored in *profiles_json_file*."""
    uniaccs = {}
    client = coreapi.Client()
    with open(profiles_json_file) as f:
        profiles = json.load(f)
    for profile in sorted(profiles):
        response = client.get(
            "http://jaspar.genereg.net/api/v1/matrix/%s/" % profile)
        json_obj = json.loads(codec.encode(response))
        matrix_id = json_obj["matrix_id"]
        if matrix_id in _JASPAR_UNIPROT_FIXES:
            accessions = _JASPAR_UNIPROT_FIXES[matrix_id]
        else:
            accessions = json_obj["uniprot_ids"]
        for uniacc in accessions:
            uniacc = uniacc.strip(" ")
            entry = uniaccs.setdefault(uniacc, [[], None])
            if profile not in entry[0]:
                entry[0].append(profile)
    # Download each sequence once, after all accessions are known.
    for uniacc in uniaccs:
        uniaccs[uniacc][1] = uniprot.get_fasta_sequence(uniacc)
    return uniaccs


def _write_fasta_and_blast_db(fasta_file, uniprot_json_file):
    """Write the sequences of *uniprot_json_file* to *fasta_file* and index it with ``makeblastdb``."""
    with open(uniprot_json_file) as f:
        uniaccs = json.load(f)
    for uniacc in sorted(uniaccs):
        # NOTE(review): a stored sequence of None is written as the text
        # "None", as in the original code -- confirm whether such records
        # should be skipped instead.
        functions.write(fasta_file,
            ">%s\n%s" % (uniacc, uniaccs[uniacc][1]))
    try:
        subprocess.check_output(
            ["makeblastdb", "-in", fasta_file, "-dbtype", "prot"],
            stderr=subprocess.STDOUT)
    except Exception as err:
        raise ValueError(
            "Could not create BLAST+ database: %s" % fasta_file) from err


def _download_cisbp_sql(cisbp_dir):
    """Download and unpack the Cis-BP 1.02 SQL dumps into *cisbp_dir* unless the TFs dump is already there."""
    if os.path.exists(os.path.join(cisbp_dir, "cisbp_1.02.tfs.sql")):
        return
    archive = "SQLArchive_cisbp_1.02.zip"
    # Run the tools with cwd=cisbp_dir instead of os.chdir(): the working
    # directory of this process is never changed, so nothing needs to be
    # restored if a step fails. Argument lists avoid shell interpretation
    # of file names. Exit statuses are ignored, as with os.system before.
    subprocess.call(
        ["curl", "--silent", "-O",
         "http://cisbp.ccbr.utoronto.ca/data/1.02/DataFiles/SQLDumps/SQLArchive_cisbp_1.02.zip"],
        cwd=cisbp_dir)
    subprocess.call(["unzip", "-qq", archive], cwd=cisbp_dir)
    os.remove(os.path.join(cisbp_dir, archive))
    # The archive contains further ZIP files; snapshot the listing because
    # the directory changes while they are unpacked.
    for zip_file in frozenset(os.listdir(cisbp_dir)):
        if not zip_file.endswith(".zip"):
            continue
        subprocess.call(["unzip", "-qq", zip_file], cwd=cisbp_dir)
        os.remove(os.path.join(cisbp_dir, zip_file))


def _parse_cisbp_sql(cisbp_dir):
    """Return ``{sequence key: [family value, domains]}`` parsed from the SQL dumps in *cisbp_dir*."""
    # Raw strings: the patterns are unchanged, but "\(", "\d" and "\W" are
    # invalid escape sequences in ordinary string literals.
    prot_feature_re = re.compile(r"\('.+', '(.+)', '.+', \d+, \d+, '(.+)'\)")
    tf_re = re.compile(r"\('(.+)', '(.+)', '.+', '.+', '.+', '.+', '.+'\)")
    tf_family_re = re.compile(r"\('(.+)', '.+', '.+', \d+, (.+)\)")
    protein_re = re.compile(r"\('(.+)', '(.+)', '.+', '.+', '([A-Z]+)\W*'\)")

    # Second column -> set of last-column values (presumably protein id ->
    # domain names; verify against the Cis-BP schema).
    prot_features = {}
    with open(os.path.join(cisbp_dir, "cisbp_1.02.prot_features.sql")) as f:
        for line in f:
            m = prot_feature_re.search(line)
            if m:
                prot_features.setdefault(m.group(1), set()).add(m.group(2))

    # First column -> second column (presumably TF id -> family id).
    tfs = {}
    with open(os.path.join(cisbp_dir, "cisbp_1.02.tfs.sql")) as f:
        for line in f:
            m = tf_re.search(line)
            if m:
                tfs.setdefault(m.group(1), m.group(2))

    # First column -> last column (presumably family id -> a per-family value).
    tf_families = {}
    with open(os.path.join(cisbp_dir, "cisbp_1.02.tf_families.sql")) as f:
        for line in f:
            m = tf_family_re.search(line)
            if m:
                tf_families.setdefault(m.group(1), m.group(2))

    proteins = {}
    with open(os.path.join(cisbp_dir, "cisbp_1.02.proteins.sql")) as f:
        for line in f:
            m = protein_re.search(line)
            if not m:
                continue
            # Ignore proteins without any recorded feature.
            if m.group(1) not in prot_features:
                continue
            key = _sequence_key(m.group(3))
            proteins.setdefault(key, [tf_families[tfs[m.group(2)]], []])
            # Sorted so the output does not depend on set iteration order.
            for domain in sorted(prot_features[m.group(1)]):
                if domain not in proteins[key][1]:
                    proteins[key][1].append(domain)
    return proteins


def _build_cisbp_json(out_dir, cisbp_json_file):
    """Create *cisbp_json_file* from the Cis-BP SQL dumps, using a scratch ``cisbp`` directory under *out_dir*."""
    cisbp_dir = os.path.join(out_dir, "cisbp")
    if not os.path.exists(cisbp_dir):
        os.makedirs(cisbp_dir)
    _download_cisbp_sql(cisbp_dir)
    proteins = _parse_cisbp_sql(cisbp_dir)
    functions.write(cisbp_json_file, _dump_json(proteins))
    # The dumps are only needed to build the JSON file.
    shutil.rmtree(cisbp_dir)