def cache_pathway(path_id):
    """Return parsed pathway data, fetching and caching it on a miss."""
    store = DataStore.get_entry("pathways")
    cached = store.exec_query("pathway-lookup", [path_id]).fetchone()
    if cached is not None:
        # Serve straight from the local cache
        return parse_kegg(cached[0])
    # Not cached yet: fetch the raw record from KEGG and remember it
    raw = retrieve_kegg(path_id)
    store.insert_rows("pathway", [(path_id, raw)])
    return parse_kegg(raw)
def cache_enzyme(enzyme_id):
    """Return parsed enzyme data, fetching and caching it on a miss."""
    store = DataStore.get_entry("pathways")
    cached = store.exec_query("enzyme-lookup", [enzyme_id]).fetchone()
    if cached is not None:
        # Serve straight from the local cache
        return parse_kegg(cached[0])
    # Not cached yet: fetch the raw record from KEGG and remember it
    raw = retrieve_kegg(enzyme_id)
    store.insert_rows("enzyme", [(enzyme_id, raw)])
    return parse_kegg(raw)
def build_tree(entries):
    """Build a nested taxonomy tree (name/children/enzymes) from taxon lineage data."""
    root = {"name": "all", "children": dict(), "enzymes": Counter()}
    taxonomy = DataStore.get_entry("taxonomy")
    for taxon in entries:
        row = taxonomy.exec_query("lineage-lookup", [taxon[0]]).fetchone()
        if not row:
            # No lineage on record for this mnemonic
            continue
        lineage = row[0]
        # If a UniProt species-level species ID (non 9CCCC), append species to lineage
        if not taxon[0].startswith("9"):
            lineage += ";{0}".format(taxon[1])
        enzyme_counts = Counter(entries[taxon])
        level = root["children"]
        for rank in lineage.split(";"):
            rank = "Unknown" if rank == "" else rank.strip()
            # Create the node for this rank if missing, then accumulate counts
            level.setdefault(rank, {"name": rank, "children": dict(),
                                    "enzymes": Counter()})
            level[rank]["enzymes"] += enzyme_counts
            level = level[rank]["children"]
        # Update root level
        root["enzymes"] += enzyme_counts
    return root
def treeify_lineage(taxa_entries, taxa_count):
    """Build a (children-dict, count) tuple tree from taxa/lineage data."""
    tree = (dict(), taxa_count)
    taxonomy = DataStore.get_entry("taxonomy")
    for taxon in taxa_entries:
        row = taxonomy.exec_query("lineage-lookup", [taxon[0]]).fetchone()
        if not row:
            continue
        node = tree[0]
        for rank in row[0].split(";"):
            rank = rank.strip()
            # First sighting of this rank: start an empty subtree with a zeroed tally
            if rank not in node:
                node[rank] = (dict(), 0)
            # Tuples are immutable, so rebuild the pair with the updated tally
            node[rank] = (node[rank][0], node[rank][1] + taxa_entries[taxon])
            node = node[rank][0]
    return tree
def taxonomy_init():
    """Register taxonomy files and database, then populate the lineage table."""
    # Setup FileStore
    FileStore("taxonomy-db", "taxonomy-db", "taxonomy.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    FileStore("taxonomy-lineage", "taxonomy-lineage", "taxonomy-lineage.dat",
              "http://www.uniprot.org/taxonomy/?query=&sort=score&format=tab",
              FileStore.FTYPE_TEMP, FileStore.FOPT_NORMAL)
    # Setup DataStore
    DataStore("taxonomy", FileStore.get_entry("taxonomy-db").path)
    store = DataStore.get_entry("taxonomy")
    store.create_table("lineage", [("mnemonic", "text", "PRIMARY KEY"),
                                   ("lineage", "text", "")])
    store.define_query("lineage-lookup",
                       "SELECT lineage FROM lineage WHERE mnemonic = ?")
    # Populate database
    populate_database()
def render_sequences(cluster_ids):
    """
    Retrieve all members of requested UniRef90 clusters and render fasta data.

    Each member sequence is written to stdout via core.main.send_output.
    """
    # Prepare all sequence files for reading
    for entry in FileStore.get_group("decluster-seqs"):
        entry.get_handle("rt")
    for cluster_id in cluster_ids:
        # BUG FIX: fetchmany() with no size only returns cursor.arraysize rows
        # (1 by default in sqlite3), silently dropping the remaining cluster
        # members; fetchall() retrieves every member accession.
        for result in DataStore.get_entry("crossref").exec_query(
                "uniprot_cross_acc", ("UniRef90", cluster_id)).fetchall():
            core.main.send_output(get_sequence(result[0]), "stdout", "")
def decluster_init():
    """Register decluster files and database, then build the sequence index."""
    # Setup FileStore
    FileStore("decluster-db", "decluster-db", "decluster.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    # Both sequence files share the "decluster-seqs" group ID
    FileStore(
        "decluster-seqs", "decluster-swissprot",
        "decluster_uniprot_sprot.fasta.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz",
        FileStore.FTYPE_CACHE, FileStore.FOPT_GZIP_DECOMPRESS)
    FileStore(
        "decluster-seqs", "decluster-trembl",
        "decluster_uniprot_trembl.fasta.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz",
        FileStore.FTYPE_CACHE, FileStore.FOPT_GZIP_DECOMPRESS)
    # Setup DataStore
    DataStore("decluster", FileStore.get_entry("decluster-db").path)
    store = DataStore.get_entry("decluster")
    store.create_table("indices", [("id", "text", "PRIMARY KEY"),
                                   ("file", "text", ""),
                                   ("pos", "integer", "")])
    store.define_query("index-lookup",
                       "SELECT file, pos FROM indices WHERE id = ?")
    populate_database()
def filter_taxa(taxa, pattern):
    """Keep only taxa whose lineage matches the regex; return (matches, total count)."""
    matched = dict()
    total = 0
    taxonomy = DataStore.get_entry("taxonomy")
    for taxon in taxa:
        row = taxonomy.exec_query("lineage-lookup", [taxon[0]]).fetchone()
        # Fall back to "Unknown" when no lineage is on record
        if row:
            lineage = row[0]
        else:
            lineage = "Unknown"
        if re.search(pattern, lineage):
            matched[taxon] = taxa[taxon]
            total += taxa[taxon]
    return matched, total
def group_sam(data, sam_file, species):
    """Group SAM reads per taxonomic rank."""
    grouped = dict()
    # Simplify species lookup keys
    species_by_id = {item[0]: item[1] for item in species}
    taxonomy = DataStore.get_entry("taxonomy")
    # Read SAM entries
    sam_entries = SamEntry.get_entries(sam_file, 0)
    for key in sam_entries:
        record = sam_entries[key]
        # Currently only handles UniProt-style references (ACC_MNEMONIC)
        if "_" not in record.reference:
            continue
        mnemonic = record.reference.split("_")[1]
        # Skip entries that are too ambiguous (e.g. 9ZZZZ) or recently moved by UniProt
        if mnemonic not in species_by_id:
            continue
        row = taxonomy.exec_query("lineage-lookup", [mnemonic]).fetchone()
        if not row:
            continue
        # Combine lineage and species for lookup into taxonomy results
        ranks = [part.strip() for part in row[0].split(";")]
        ranks.append(species_by_id[mnemonic])
        for rank in ranks:
            if rank in data:
                grouped.setdefault(record.query, list()).append(rank)
    return grouped
def get_sequence(acc):
    """
    Look up and return the fasta record (header plus sequence lines) for a
    UniProt accession, reading from the indexed sequence files.

    Exits the program when the accession has no index entry.
    """
    result = DataStore.get_entry("decluster").exec_query(
        "index-lookup", [acc]).fetchone()
    if not result:
        # CONSISTENCY FIX: route the error to stderr like every other
        # diagnostic send_output call in these modules
        core.main.send_output(
            "Sequence not found for UniProt accession '{0}'".format(acc),
            "stderr")
        sys.exit(1)
    # Seek the index position in the appropriate file
    handle = FileStore.get_entry(result[0], "decluster-seqs").get_handle()
    handle.seek(result[1])
    # Append sequence data until next header
    ret_seq = ""
    for line in handle:
        # The first line read is this record's own ">" header, so only stop
        # once we have accumulated data and hit the NEXT header
        if line.startswith(">") and ret_seq:
            break
        ret_seq += line
    return ret_seq
def pathways_init():
    """Register pathway cache files, tables, and queries; expire stale KEGG data."""
    # Setup FileStore
    FileStore("pathways-db", "pathways-db", "pathways.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    # Setup DataStore
    DataStore("pathways", FileStore.get_entry("pathways-db").path)
    store = DataStore.get_entry("pathways")
    store.create_table("enzyme", [("ec", "text", "PRIMARY KEY"),
                                  ("pathway", "text", "")])
    store.create_table("pathway", [("pathway", "text", "PRIMARY KEY"),
                                   ("info", "text", "")])
    store.define_query("enzyme-lookup",
                       "SELECT pathway FROM enzyme WHERE ec = ?")
    store.define_query("pathway-lookup",
                       "SELECT info FROM pathway WHERE pathway = ?")
    # Check for expired database: drop cached rows once older than 30 days
    if store.get_expired("enzyme", 30):
        store.delete_rows("enzyme")
        store.delete_rows("pathway")
        store.update_age("enzyme")
def combined_compare(taxonomy_entries, sam_entries, uniprot_entries, pass_total):
    """
    Combined compare breaks down taxonomy report on per read basis, then aggregates.

    Assumed shapes (TODO confirm against callers):
      - sam_entries: {(set0_reference, set1_reference): count}, "*" = unmapped
      - taxonomy_entries: {taxon name: net count difference between the two sets}
      - uniprot_entries: pair of {accession: record} lookups, one per set
    Returns {taxon: ({counterpart taxon: (gain, loss)}, net fraction)}, with
    gain/loss expressed as fractions of pass_total.
    """
    ret_entries = dict()
    # Calculate unmapped difference and add to taxonomy set
    unmapped_diff = 0
    for sam_entry in sam_entries:
        for set_idx in range(2):
            if sam_entry[set_idx] == "*":
                # Unmapped reads in set 0 were not counted into pass_total yet
                if set_idx == 0:
                    pass_total += 1
                # Set 0 misses add to the difference, set 1 misses subtract
                unmapped_diff += [1, -1][set_idx]
    # Add unmapped entry to taxonomy list
    taxonomy_entries["Unmapped"] = unmapped_diff
    # Iterate through each taxonomy grouping
    for taxon_entry in taxonomy_entries:
        # Skip unchanged entries
        if taxonomy_entries[taxon_entry] == 0:
            continue
        # Prepare entry if new
        if taxon_entry not in ret_entries:
            ret_entries[taxon_entry] = (dict(),
                                        taxonomy_entries[taxon_entry] /
                                        pass_total)
        # Scan SAM entries for matching lineage
        for sam_entry in sam_entries:
            # Lookup species and lineage
            sam_records = list()
            lineage = list()
            for set_idx in range(2):
                if sam_entry[set_idx] == "*":
                    # Fabricate a stand-in record for unmapped reads
                    miss_record = type("miss_record", (object, ), {})()
                    miss_record.species_id = "Unmapped"
                    miss_record.species_full = "Unmapped"
                    sam_records.append(miss_record)
                    # NOTE(review): a plain string is appended here (and for
                    # "Unknown" below) while mapped entries append a LIST of
                    # ranks; the rank loop below then iterates characters of
                    # the string — harmless for multi-character taxon names,
                    # but confirm this is intended
                    lineage.append("Unmapped")
                else:
                    # Reference format is db|acc|...; last field is the accession
                    sam_records.append(uniprot_entries[set_idx][
                        sam_entry[set_idx].split("|")[-1]])
                    species_id = sam_records[set_idx].species_id
                    result = DataStore.get_entry("taxonomy").exec_query(
                        "lineage-lookup", [species_id]).fetchone()
                    if result:
                        lineage.append(
                            [x.strip() for x in result[0].split(";")])
                    else:
                        lineage.append("Unknown")
            for set_idx in range(2):
                # Attempt to match on species
                if sam_records[set_idx].species_full == taxon_entry:
                    # Attribute the amount to the counterpart set's species
                    sam_species = sam_records[1 - set_idx].species_full
                    sam_amount = [1, -1][set_idx] * sam_entries[sam_entry]
                    if sam_species not in ret_entries[taxon_entry][0]:
                        ret_entries[taxon_entry][0][sam_species] = (0, 0)
                    sam_parts = ret_entries[taxon_entry][0][sam_species]
                    # Gains and losses accumulate in separate tuple slots
                    if sam_amount > 0:
                        ret_entries[taxon_entry][0][sam_species] = (
                            sam_parts[0] + sam_amount, sam_parts[1])
                    else:
                        ret_entries[taxon_entry][0][sam_species] = (
                            sam_parts[0], sam_parts[1] + sam_amount)
                    break
                # Attempt to match on lineage
                for rank_idx in range(len(lineage[set_idx])):
                    if lineage[set_idx][rank_idx] == taxon_entry:
                        # Clamp the rank index to the counterpart lineage depth
                        compare_idx = rank_idx
                        if compare_idx >= len(lineage[1 - set_idx]):
                            compare_idx = len(lineage[1 - set_idx]) - 1
                        sam_lineage = lineage[1 - set_idx][compare_idx]
                        sam_amount = [1, -1][set_idx] * sam_entries[sam_entry]
                        if sam_lineage not in ret_entries[taxon_entry][0]:
                            ret_entries[taxon_entry][0][sam_lineage] = (0, 0)
                        sam_parts = ret_entries[taxon_entry][0][sam_lineage]
                        if sam_amount > 0:
                            ret_entries[taxon_entry][0][sam_lineage] = (
                                sam_parts[0] + sam_amount, sam_parts[1])
                        else:
                            ret_entries[taxon_entry][0][sam_lineage] = (
                                sam_parts[0], sam_parts[1] + sam_amount)
                        break
    # Calculate percentages
    for taxonomy_entry in ret_entries:
        for sam_entry in ret_entries[taxonomy_entry][0]:
            sam_parts = ret_entries[taxonomy_entry][0][sam_entry]
            ret_entries[taxonomy_entry][0][sam_entry] = (
                sam_parts[0] / pass_total, sam_parts[1] / pass_total)
    return ret_entries
def populate_database():
    """
    Populate sequence header indices.

    Scans each fasta file in the "decluster-seqs" group and records, for every
    header line, the accession together with the file position where that
    record starts, so get_sequence() can seek straight to it later.
    """
    # Rebuild only when the cached index is older than 30 days
    if not DataStore.get_entry("decluster").get_expired("indices", 30):
        return
    core.main.send_output("Populating UniProt sequences...", "stderr")
    # Start transaction and empty any existing data
    DataStore.get_entry("decluster").process_trans()
    DataStore.get_entry("decluster").delete_rows("indices")
    # Download each sequence file
    for entry in FileStore.get_group("decluster-seqs"):
        entry.prepare()
        with entry.get_handle("rt") as handle:
            acc = ""
            # readline() loop rather than "for line in handle" so that
            # handle.tell() stays usable: text-mode file iteration disables
            # tell() while the read-ahead iterator is active
            while True:
                line = handle.readline()
                if not line:
                    break
                if line.startswith(">"):
                    # Headers look like >db|ACC|...; field 1 is the accession
                    fields = line.rstrip().split()
                    acc = fields[0].split("|")[1]
                    # NOTE(review): tell() - len(line) mixes a text-mode
                    # position with a character count; only valid while the
                    # data is single-byte encoded with 1-char newlines —
                    # confirm against the decompressed fasta encoding
                    DataStore.get_entry("decluster").insert_rows(
                        "indices",
                        [(acc, entry.fid, handle.tell() - len(line))])
    # Finalize transaction and current table age
    DataStore.get_entry("decluster").process_trans()
    DataStore.get_entry("decluster").update_age("indices")
def populate_database():
    """
    Generate (if necessary) and get lineage lookup.

    Downloads the UniProt taxonomy table and loads mnemonic/lineage rows into
    the "lineage" table, refreshing at most every 30 days.
    """
    store = DataStore.get_entry("taxonomy")
    # Skip the rebuild while the cached table is still fresh
    if not store.get_expired("lineage", 30):
        return
    core.main.send_output("Populating taxonomic lineage data...", "stderr")
    # Download tab delimited data
    source = FileStore.get_entry("taxonomy-lineage")
    source.prepare()
    # Start transaction and empty any existing data
    store.process_trans()
    store.delete_rows("lineage")
    # Iterate through downloaded table and add rows
    with source.get_handle("r") as handle:
        for line in handle:
            fields = line.rstrip().split("\t")
            # Column 1 holds the mnemonic, column 8 the lineage; skip rows
            # that are too short or lack a mnemonic
            if len(fields) < 9 or fields[1] == "":
                continue
            # Add to database
            store.insert_rows("lineage", [(fields[1], fields[8])])
    # Finalize transaction and current table age
    store.process_trans()
    store.update_age("lineage")
def populate_database():
    """
    Generate cross-reference database.

    Loads the UniProt idmapping table into the "uniprot" table, refreshing at
    most every 30 days; indices are dropped for the bulk load and rebuilt.
    """
    store = DataStore.get_entry("crossref")
    # Skip the rebuild while the cached table is still fresh
    if not store.get_expired("uniprot", 30):
        return
    core.main.send_output("Populating UniProt database cross-references...",
                          "stderr")
    # Download tab delimited data
    entry = FileStore.get_entry("crossref-uniprot")
    entry.prepare()
    # Drop indices before the bulk insert so index upkeep does not slow it
    store.drop_index("uniprot_acc")
    store.drop_index("uniprot_acc_db")
    store.drop_index("uniprot_db_cross")
    # Start transaction and empty any existing data
    store.process_trans()
    store.delete_rows("uniprot")
    # Iterate through downloaded table and add rows
    with entry.get_handle("rt") as handle:
        for line in handle:
            fields = line.rstrip().split("\t")
            if len(fields) < 3:
                continue
            # Add to database
            store.insert_rows("uniprot", [fields])
    # Finalize transaction, rebuild indices, and refresh the table age
    store.process_trans()
    store.create_index("uniprot_acc")
    store.create_index("uniprot_acc_db")
    store.create_index("uniprot_db_cross")
    store.update_age("uniprot")
def crossref_init():
    """Register cross-reference files, database, queries, and indices."""
    # Setup FileStore
    FileStore("crossref-db", "crossref-db", "crossref.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    FileStore(
        "crossref-uniprot", "crossref-uniprot", "idmapping.dat.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz",
        FileStore.FTYPE_TEMP, FileStore.FOPT_GZIP)
    # Setup DataStore
    # CONSISTENCY FIX: use the .path attribute as the other *_init functions
    # do (taxonomy_init, decluster_init) instead of get_path()
    DataStore("crossref", FileStore.get_entry("crossref-db").path)
    DataStore.get_entry("crossref").create_table(
        "uniprot",
        [("acc", "text", ""), ("db", "text", ""), ("cross", "text", "")])
    DataStore.get_entry("crossref").define_query(
        "uniprot_acc_cross",
        "SELECT cross FROM uniprot WHERE acc = ? AND db = ?")
    DataStore.get_entry("crossref").define_query(
        "uniprot_acc_all", "SELECT db, cross FROM uniprot WHERE acc = ?")
    DataStore.get_entry("crossref").define_query(
        "uniprot_cross_acc",
        "SELECT acc FROM uniprot WHERE db = ? AND cross = ?")
    # BUG FIX: the self-join previously used "ON acc", which SQLite rejects
    # as an ambiguous column reference (both t1 and t2 have "acc"); join
    # explicitly on matching accessions
    DataStore.get_entry("crossref").define_query(
        "uniprot_cross_cross",
        "SELECT t2.cross FROM uniprot AS t1 JOIN uniprot AS t2 "
        "ON t1.acc = t2.acc WHERE t1.db = ? AND t1.cross = ? AND t2.db = ?")
    DataStore.get_entry("crossref").define_index("uniprot_acc", "uniprot",
                                                 ["acc"], False)
    DataStore.get_entry("crossref").define_index("uniprot_acc_db", "uniprot",
                                                 ["acc", "db"], False)
    DataStore.get_entry("crossref").define_index("uniprot_db_cross", "uniprot",
                                                 ["db", "cross"], False)
    # Populate database
    populate_database()