def create_gecko_model_reaction_wise(model: cobra.Model, output_sbml_name: str,
                                     project_folder: str, project_name: str,
                                     excluded_reactions: List[str]) -> cobra.Model:
    """Creates a GECKO model as described in

    <i>
    Sánchez, B. J., Zhang, C., Nilsson, A., Lahtvee, P. J., Kerkhoven, E. J., & Nielsen, J. (2017).
    Improving the phenotype predictions of a yeast genome‐scale metabolic model by incorporating enzymatic
    constraints. Molecular systems biology, 13(8).
    </i>

    Arguments
    ----------
    * model: cobra.Model ~ A cobra Model representation of the metabolic network. This model will
      be changed using cobrapy functions in order to add the proteomic constraints.
    * output_sbml_name: str ~ The file name of the created SBML. It is appended to the
      standardized project folder.
    * project_folder: str ~ The folder in which the spreadsheets and JSONs with the model's
      supplemental data can be found. The created SBML is stored there, too.
    * project_name: str ~ The project's name, i.e. the prefix of the supplemental data files
      which are read here (e.g. project_name+'_protein_id_mass_mapping.json' and
      project_name+'_reactions_kcat_mapping_combined.json').
    * excluded_reactions: List[str] ~ A string list of reaction IDs (the 'reverse' and 'forward'
      name additions must not be added, i.e. for 'ACALD_forward' just 'ACALD' has to be given) to
      which no kcat shall be added. Typically used for gas exchange reactions such as 'CO2tex'.

    Output
    ----------
    The changed cobra.Model, which is also written as SBML to project_folder+output_sbml_name.

    Note: The default kcat is the median of all non-NaN kcats of the reaction<->kcat mapping,
    so statistics.median raises a StatisticsError if this mapping contains no such kcat.
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)
    # This base path is the common prefix of all supplemental data files of the project
    basepath: str = project_folder + project_name

    # Load the protein ID<->mass mapping
    protein_id_mass_mapping: Dict[str, float] = json_load(basepath + "_protein_id_mass_mapping.json")
    # Load protein data XLSX
    protein_id_concentration_mapping, p_total, unmeasured_protein_fraction, mean_saturation = \
        read_protein_data_xlsx(basepath)
    # Read enzyme stoichiometries XLSX
    reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        read_enzyme_stoichiometries_xlsx(basepath)

    # Read reaction<->kcat mapping and use the median of all known kcats as default kcat
    reactions_kcat_mapping_database = json_load(basepath + "_reactions_kcat_mapping_combined.json")
    all_kcats = [x["forward"] for x in reactions_kcat_mapping_database.values()] + \
                [x["reverse"] for x in reactions_kcat_mapping_database.values()]
    all_kcats = [x for x in all_kcats if not math.isnan(x)]
    default_kcat = statistics.median(all_kcats)
    print(f"Default kcat is: {default_kcat}")

    # GECKO :D
    # This ID addition will be added to all reactions which are modified by this method
    id_addition = "_TG_"
    # Calculate p_measured
    p_measured = get_p_measured(protein_id_concentration_mapping, protein_id_mass_mapping)
    # Make model irreversible
    model = get_irreversible_model(model, id_addition)
    # Add prot_pool reaction
    model, prot_pool_metabolite = add_prot_pool_reaction(model, id_addition, p_total, p_measured,
                                                         unmeasured_protein_fraction, mean_saturation)

    # Add an enzyme source reaction for every protein, measured as well as unmeasured
    for protein_id in protein_id_mass_mapping:
        enzyme = cobra.Metabolite(id=protein_id+"_met",
                                  name=f"Protein {protein_id}",
                                  compartment="AutoPACMEN")
        if protein_id in protein_id_concentration_mapping:
            # Measured: the enzyme supply is limited by the measured concentration
            eu = cobra.Reaction(id=id_addition+"EU_"+protein_id,
                                name=f"Enzyme usage reaction of measured protein {protein_id}",
                                subsystem="AutoPACMEN")
            eu.add_metabolites({enzyme: 1.0})
            eu.lower_bound = 0
            eu.upper_bound = protein_id_concentration_mapping[protein_id]
            model.add_reactions([eu])
        else:
            # Unmeasured: the enzyme is drawn from the protein pool according to its mass
            er = cobra.Reaction(id=id_addition+"ER_"+protein_id,
                                name=f"Enzyme usage reaction of unmeasured protein {protein_id}",
                                subsystem="AutoPACMEN")
            molecular_weight = protein_id_mass_mapping[protein_id] / 1000  # Mapping is in Da, GECKO uses kDa (g/mmol)
            er.add_metabolites({prot_pool_metabolite: -molecular_weight, enzyme: 1})
            er.lower_bound = 0
            er.upper_bound = 1000.0
            model.add_reactions([er])

    # Add enzymes to reactions
    current_arm_reaction = 1
    # Work on a snapshot of the IDs as reactions are added and removed while looping
    model_reaction_ids = [x.id for x in model.reactions]
    for model_reaction_id in model_reaction_ids:
        reaction = model.reactions.get_by_id(model_reaction_id)
        # The reaction ID without any addition is the part before the first ID addition
        reaction_id = reaction.id.split(id_addition)[0]
        # If the ID starts with the ID addition (as the IDs of the enzyme source
        # reactions added above do), there is no reaction name left, so ignore it
        if reaction_id == "":
            continue
        # If the reaction has no associated enzyme stoichiometries, ignore it
        if reaction_id not in reaction_id_gene_rules_mapping:
            continue
        # If the reaction has no gene rule, ignore it
        gene_rule = reaction_id_gene_rules_mapping[reaction_id]
        if gene_rule == [""]:
            continue
        # If the reaction is manually excluded, ignore it
        if reaction_id in excluded_reactions:
            continue

        # If any protein of the gene rule (a single protein or a member of a
        # complex) has no enzyme metabolite in the model, ignore the reaction
        required_protein_ids: List[str] = []
        for enzyme in gene_rule:
            if isinstance(enzyme, str):
                required_protein_ids.append(enzyme)
            else:
                required_protein_ids.extend(enzyme)
        if not all(_gecko_enzyme_metabolite_exists(model, x) for x in required_protein_ids):
            continue

        # Retrieve the reaction's forward and reverse kcats from the given reaction<->kcat database
        if reaction_id in reactions_kcat_mapping_database:
            forward_kcat = reactions_kcat_mapping_database[reaction_id]["forward"]
            reverse_kcat = reactions_kcat_mapping_database[reaction_id]["reverse"]
        # If the reaction is not in the database, set the default kcat
        else:
            forward_kcat = default_kcat
            reverse_kcat = default_kcat
        # math.nan in the database means that no kcat could be found, so that the
        # default kcat is used in this case, too
        if math.isnan(forward_kcat):
            forward_kcat = default_kcat
        if math.isnan(reverse_kcat):
            reverse_kcat = default_kcat

        # A reaction which was splitted due to its reversibility gets the kcat of
        # its direction. A non-splitted (i.e., non-reversible) reaction gets the
        # forward kcat as this is its only possible direction.
        if model_reaction_id.endswith(id_addition + "reverse"):
            reaction_kcat = reverse_kcat
        else:
            reaction_kcat = forward_kcat

        # Add arm reaction if isozymes occur
        if len(gene_rule) > 1:  # Isozymes occur :O
            arm_reaction_id = id_addition+f"arm_reaction_{current_arm_reaction}"
            arm_reaction_name = f"Arm reaction no. {current_arm_reaction} for gene rule {str(gene_rule)}"
            arm_reaction = cobra.Reaction(id=arm_reaction_id,
                                          name=arm_reaction_name,
                                          subsystem="AutoPACMEN")

            # Move all educts from the original reaction to the arm reaction
            arm_reaction_metabolites = {}
            for metabolite, stoichiometry in list(reaction.metabolites.items()):
                if stoichiometry < 0:  # Educt
                    arm_reaction_metabolites[metabolite] = stoichiometry
                    reaction.add_metabolites({metabolite: -stoichiometry})

            # The arm reaction produces an intermediate metabolite which is
            # consumed by the original reaction (and thereby by all of its copies)
            im_id = f"im_{current_arm_reaction}"
            im_name = f"Intermediate metabolite of arm reaction {current_arm_reaction}"
            intermediate_metabolite = cobra.Metabolite(id=im_id,
                                                       name=im_name,
                                                       compartment="AutoPACMEN")
            arm_reaction_metabolites[intermediate_metabolite] = 1
            arm_reaction.add_metabolites(arm_reaction_metabolites)
            reaction.add_metabolites({intermediate_metabolite: -1})

            arm_reaction.lower_bound = 0
            arm_reaction.upper_bound = reaction.upper_bound
            model.add_reactions([arm_reaction])
            current_arm_reaction += 1

        # Add one copy of the reaction for each isozyme (single protein or complex)
        new_reactions = []
        for isozyme_number, isozyme_id in enumerate(gene_rule, start=1):
            new_reaction = copy.deepcopy(reaction)
            new_reaction.id = new_reaction.id + id_addition + str(isozyme_number)
            if isinstance(isozyme_id, str):  # No complex :O
                stoichiometry_key = isozyme_id
                protein_ids = [isozyme_id]
            else:  # Complex :O
                stoichiometry_key = tuple(isozyme_id)
                protein_ids = list(stoichiometry_key)

            # Each protein is consumed with its stoichiometry divided by the kcat per hour
            metabolites = {}
            for single_id in protein_ids:
                protein = model.metabolites.get_by_id(single_id+"_met")
                stoichiometry = \
                    reaction_id_gene_rules_protein_stoichiometry_mapping[reaction_id][stoichiometry_key][single_id]
                stoichiometry /= (reaction_kcat * 3600)
                metabolites[protein] = -stoichiometry
            new_reaction.add_metabolites(metabolites)

            new_reaction.gene_reaction_rule = " and ".join(protein_ids)
            new_reactions.append(new_reaction)
        # Replace the original reaction with its enzyme-constrained copies
        model.add_reactions(new_reactions)
        model.remove_reactions([reaction])

    cobra.io.write_sbml_model(model, project_folder+output_sbml_name)

    return model


def _gecko_enzyme_metabolite_exists(model: cobra.Model, protein_id: str) -> bool:
    """Returns True if the model has the enzyme metabolite with the ID protein_id+'_met'."""
    try:
        model.metabolites.get_by_id(protein_id+"_met")
    except KeyError:
        return False
    return True
def parse_brenda_json_for_model(sbml_path: str, brenda_json_path: str, output_json_path: str) -> None:
    """Reads out a BRENDA JSON file created with parse_brenda_textfile and creates a model-specific JSON.

    For every EC number annotated in the model ('ec-code' annotation), the entry of
    the BRENDA JSON is taken. If there is no usable entry for an EC number, the entries
    of all database EC numbers which fit at the lowest successful wildcard level (1 to 4,
    as decided by is_fitting_ec_numbers) are merged instead.

    Arguments
    ----------
    * sbml_path: str ~ The path of the SBML model of which a specific BRENDA JSON kcat database
      shall be created
    * brenda_json_path: str ~ The full path to the BRENDA JSON created with parse_brenda_textfile.
    * output_json_path: str ~ The full path to the newly created JSON.

    Output
    ----------
    A JSON at output_json_path with the following structure:
    <pre>
     {
         '$EC_NUMBER': {
             '$BIGG_ID_METABOLITE': {
                 '$ORGANISM': [
                     kcat_list: float
                 ],
                 (...)
             },
             (...),
             'WILDCARD': $TRUE_IF_FOUND_BY_WILDCARD_SEARCH
         },
         (...)
     }
    </pre>
    """
    model: cobra.Model = cobra.io.read_sbml_model(sbml_path)

    # Get EC numbers of the model's reactions
    ec_numbers_in_annotations = set()
    for reaction in model.reactions:
        if "ec-code" not in reaction.annotation:
            continue
        ec_numbers_of_reaction = reaction.annotation["ec-code"]
        # A single EC number is given as string, multiple ones as list
        if isinstance(ec_numbers_of_reaction, str):
            ec_numbers_of_reaction = [ec_numbers_of_reaction]
        ec_numbers_in_annotations.update(ec_numbers_of_reaction)
    # Sorted in order to get the same processing and output order in every run
    ec_numbers_of_model: List[str] = sorted(ec_numbers_in_annotations)

    # Get EC number entries for each EC number of the model
    brenda_kcat_database_original = json_load(brenda_json_path)
    database_ec_numbers = list(brenda_kcat_database_original.keys())
    brenda_kcat_database_for_model = {}
    for ec_number in ec_numbers_of_model:
        if ec_number in brenda_kcat_database_original:
            ec_number_entry = _get_transfer_ec_number_entry(
                ec_number, brenda_kcat_database_original)
            if "ERROR" not in ec_number_entry:
                # Tag a copy and not the returned entry itself: If the returned
                # entry is the database's own dictionary (the helper is not
                # visible here), a "WILDCARD" flag written into it would show up
                # as a non-dictionary "metabolite" in the wildcard merge below
                # and could break the merge of two such entries.
                ec_number_entry = dict(ec_number_entry)
                ec_number_entry["WILDCARD"] = False
                brenda_kcat_database_for_model[ec_number] = ec_number_entry
                continue

        # The EC number is not in the database or its entry is erroneous, so
        # search for fitting EC numbers with an increasing wildcard level and
        # stop at the first level with at least one usable entry
        eligible_ec_number_entries: List[Dict[str, Any]] = []
        for wildcard_level in range(1, 5):
            for database_ec_number in database_ec_numbers:
                if not is_fitting_ec_numbers(ec_number, database_ec_number, wildcard_level):
                    continue
                database_ec_number_entry = _get_transfer_ec_number_entry(
                    database_ec_number, brenda_kcat_database_original)
                if "ERROR" not in database_ec_number_entry:
                    eligible_ec_number_entries.append(database_ec_number_entry)
            if eligible_ec_number_entries:
                break

        # Merge the entries of all fitting EC numbers. If a metabolite occurs in
        # multiple entries, the organism entries of later ones replace those of
        # earlier ones for the same organism.
        ec_number_entry = {}
        for eligible_ec_number_entry in eligible_ec_number_entries:
            for metabolite_key, metabolite_entry in eligible_ec_number_entry.items():
                if metabolite_key not in ec_number_entry:
                    ec_number_entry[metabolite_key] = metabolite_entry
                else:
                    ec_number_entry[metabolite_key] = {
                        **ec_number_entry[metabolite_key],
                        **metabolite_entry
                    }
        ec_number_entry["WILDCARD"] = True
        brenda_kcat_database_for_model[ec_number] = ec_number_entry

    json_write(output_json_path, brenda_kcat_database_for_model)
def get_reactions_kcat_mapping(sbml_path: str, project_folder: str, project_name: str,
                               organism: str, kcat_database_path: str,
                               protein_kcat_database_path: str,
                               type_of_kcat_selection: str) -> None:
    """Creates a reaction<->kcat mapping for the given model and stores it as JSON.

    One kcat is selected for each the forward and the reverse direction of a reaction.
    The selection depends on the metabolites of the respective reaction side (educts for
    forward, products for reverse) and on the given organism; both are passed on to the
    selection helper together with the type of kcat selection.

    Arguments
    ----------
    * sbml_path: str ~ The SBML path to the model
    * project_folder: str ~ The folder in which the model data files are stored
    * project_name: str ~ The name of the used project
    * organism: str ~ The organism's name
    * kcat_database_path: str ~ A path to an already created EC number<->kcats database
    * protein_kcat_database_path: str ~ A path to the custom protein<->kcat database, or
      "none" if no such database shall be used
    * type_of_kcat_selection: str ~ Can be "mean", "median" or "random". Refers to the
      selection of found kcats of a reaction.

    Output
    ----------
    A JSON in the given project folder with the name
    $project_name+'_reactions_kcat_mapping_combined.json' and the following structure:
    <pre>
    {
        "$REACTION_NAME": {
            "forward": $forward_kcat,
            "reverse": $reverse_kcat
        },
        (...)
    }
    </pre>
    Reactions with EC numbers for which no kcat entry could be collected do not
    occur in this JSON.
    """
    # The output JSON lies in the standardized project folder
    project_folder = standardize_folder(project_folder)
    output_json_path = project_folder + project_name + "_reactions_kcat_mapping_combined.json"

    # EC-number-dependent kcat database
    kcat_database = json_load(kcat_database_path)
    # Optional protein-dependent kcat database
    if protein_kcat_database_path == "none":
        protein_kcat_database = {}
    else:
        protein_kcat_database = json_load(protein_kcat_database_path)
    # The stoichiometric model
    model = cobra.io.read_sbml_model(sbml_path)

    # Keys of an EC number entry which carry meta information instead of kcats
    information_keys = ("WILDCARD", "SOURCE", "TRANSFER")
    # Content of the output JSON
    reactions_kcat_mapping: Dict[str, Dict[str, float]] = {}

    for reaction in model.reactions:
        annotation = reaction.annotation

        # Without EC number, only the protein-dependent database can provide
        # a kcat. 0 stands for 'no kcat found' and is exported as math.nan.
        if "ec-code" not in annotation:
            direction_kcats: Dict[str, Any] = {}
            for direction in ("forward", "reverse"):
                direction_kcat: Any = 0
                if protein_kcat_database != {}:
                    direction_kcat = _get_kcat_from_protein_kcat_database(
                        direction, reaction, protein_kcat_database)
                if direction_kcat == 0.0:
                    direction_kcat = math.nan
                direction_kcats[direction] = direction_kcat
            reactions_kcat_mapping[reaction.id] = direction_kcats
            _print_assigned_kcats(reaction.id,
                                  direction_kcats["forward"],
                                  direction_kcats["reverse"])
            continue

        # A single EC number is given as string, multiple ones as list
        ec_numbers = annotation["ec-code"]
        if type(ec_numbers) is str:
            ec_numbers = [ec_numbers]

        # EC numbers with a - wildcard (such as 2.1.1.-) are in general too
        # permissive and are only used if there is nothing else
        eligible_ec_numbers = [x for x in ec_numbers if "-" not in x]
        if not eligible_ec_numbers:
            eligible_ec_numbers = list(ec_numbers)

        # Collect the kcats of all eligible EC numbers per substrate and
        # organism. The pseudo-substrate "ALL" collects the kcats of all
        # substrates and is used later if no fitting substrate can be found.
        kcats_of_all_substrates: Dict[str, Any] = {}
        complete_entry: Dict[str, Any] = {"ALL": kcats_of_all_substrates}
        for ec_number in eligible_ec_numbers:
            if ec_number not in kcat_database:
                print(f"INFO: No entry for EC number {ec_number}")
                print("")
                continue
            ec_number_entry = kcat_database[ec_number]
            # Entries which come from a wildcard search with * are not used
            if ec_number_entry["WILDCARD"]:
                continue

            for metabolite_key, species_entries in ec_number_entry.items():
                if metabolite_key in information_keys:
                    continue
                kcats_of_substrate = complete_entry.setdefault(metabolite_key, {})
                for species_key in species_entries:
                    species_kcats = species_entries[species_key]
                    kcats_of_substrate.setdefault(species_key, [])
                    kcats_of_all_substrates.setdefault(species_key, [])
                    kcats_of_substrate[species_key] += species_kcats
                    kcats_of_all_substrates[species_key] += species_kcats

        # Nothing found for any eligible EC number -> next reaction
        if complete_entry["ALL"] == {}:
            continue

        # BIGG IDs of educts and products according to the SBML's annotation.
        # The pseudo-metabolite "X" marks a side without any BIGG ID, which
        # indicates that "ALL" should be used later.
        educt_bigg_ids: List[str] = [
            educt.annotation["bigg.metabolite"]
            for educt in reaction.reactants
            if "bigg.metabolite" in educt.annotation
        ]
        product_bigg_ids: List[str] = [
            product.annotation["bigg.metabolite"]
            for product in reaction.products
            if "bigg.metabolite" in product.annotation
        ]
        if not educt_bigg_ids:
            educt_bigg_ids = ["X"]
        if not product_bigg_ids:
            product_bigg_ids = ["X"]

        # Forward kcat: depends on the educts and the organism
        searched_educts = _get_searched_metabolites(complete_entry, educt_bigg_ids)
        forward_kcat = _get_kcat(searched_educts, complete_entry, organism, "forward",
                                 reaction, protein_kcat_database, type_of_kcat_selection)
        # Reverse kcat: depends on the products and the organism
        searched_products = _get_searched_metabolites(complete_entry, product_bigg_ids)
        reverse_kcat = _get_kcat(searched_products, complete_entry, organism, "reverse",
                                 reaction, protein_kcat_database, type_of_kcat_selection)

        reactions_kcat_mapping[reaction.id] = {
            "forward": forward_kcat,
            "reverse": reverse_kcat,
        }
        _print_assigned_kcats(reaction.id, forward_kcat, reverse_kcat)

    # Export the kcat mapping results as JSON
    json_write(output_json_path, reactions_kcat_mapping)
def get_id_associated_kcats(searched_ids: List[str], id_type: str,
                            bigg_id_name_mapping_path: str, batch_size: int = 5) -> Dict[str, Any]:
    """Returns a dictionary with SABIO-RK kcat data for the given EC numbers or KEGG IDs.

    This function calls the SABIO-RK API. Results are cached as JSON files in the
    folder './_cache/sabio_rk_total/', and IDs with a cache file are not queried again.

    Input
    ----------
    * searched_ids: List[str] ~ The list of searched IDs. EC numbers may contain *
      wildcards (e.g. 1.1.1.*).
    * id_type: str ~ Must be either 'EC' or 'KEGG', depending on whether you are looking for kcats for EC numbers
      or KEGG IDs.
    * bigg_id_name_mapping_path: str ~ The path to the JSON with the mapping of lowercase
      metabolite names to BIGG IDs, which is used to identify the substrates.
    * batch_size: int = 5 ~ The SABIO-RK API search batching number (i.e., with batch_size=5 five IDs are searched at once).
      Must be at least 1.

    Output
    ----------
    A dictionary with the following content:
    <pre>
    {
        "$EC_NUMBER_OR_KEGG_REACTION_ID": {
            "$SUBSTRATE_WITH_BIGG_ID_1": {
                "$ORGANISM_1": [
                    $kcat_1,
                    (...)
                    $kcat_n,
                ]
            },
            (...),
            "REST": {
                "$ORGANISM_1": [
                    $kcat_1,
                    (...)
                    $kcat_n,
                ]
            }
        }
        (...),
    }
    </pre>
    'REST' stands for a substrate without found BIGG ID.
    If a result exists for at least one wildcarded ID, only the results for the
    wildcarded IDs are returned.

    Raises
    ----------
    * ValueError ~ If id_type is neither 'EC' nor 'KEGG', or if batch_size is lower than 1.
    """
    # Set the given ID name to the name which SABIO-RK uses for them. An unknown
    # ID type is rejected here, as it would leave the SABIO-RK field name undefined.
    if id_type == "EC":
        id_name = "ECNumber"
    elif id_type == "KEGG":
        id_name = "KeggReactionID"
    else:
        raise ValueError(f"id_type must be either 'EC' or 'KEGG', but is {id_type!r}")
    # A batch size below 1 would never advance through the searched IDs
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, but is {batch_size!r}")

    # Set-up the cache if it does not exist yet \o/
    cache_basepath = "./_cache/sabio_rk_total/"
    ensure_folder_existence("./_cache/")
    ensure_folder_existence(cache_basepath)
    cache_files = get_files(cache_basepath)

    # Load the given BIGG ID<->metabolite common name mapping
    bigg_id_name_mapping = json_load(bigg_id_name_mapping_path)

    # In order to save search time, search every ID only once. The IDs are
    # sorted in order to get the same batches in every run.
    searched_ids = sorted(set(searched_ids))

    # These central dictionaries will contain the ID<->kcat mapping. Depending on
    # whether wildcarded IDs led to results, one of them is returned at the end.
    output: Dict[str, Any] = {}
    wildcard_output: Dict[str, Any] = {}

    # We use batched searches in order to save search time :D
    for batch_start in range(0, len(searched_ids), batch_size):
        batch = searched_ids[batch_start:batch_start + batch_size]

        # Take already searched IDs from the cache, and create a SABIO-RK
        # search entry for each of the other IDs
        query_dicts: List[Dict[str, str]] = []
        for searched_id in batch:
            cache_filename = searched_id.replace(".", "_").replace("*", "W") + ".json"
            if cache_filename in cache_files:
                output[searched_id] = json_load(cache_basepath + cache_filename)
                print(f"Loading {cache_filename}...")
            else:
                query_dicts.append({id_name: searched_id,
                                    "Parametertype": "kcat",
                                    "EnzymeType": "wildtype"})

        # If not all of the searched IDs are present in the cache, use SABIO-RK's API :D
        if query_dicts:
            print(f"Performing query {query_dicts}...")
            result = sabio_rk_query_get_csv_lines(query_dicts)
            # If there was an error with the SABIO-RK result (i.e., no result found
            # or an invalid given ID), continue with the next batch
            if result == "NO_RESULT":
                continue
        else:
            result = []

        # Loop through every SABIO-RK API query call result :D
        found_ec_numbers = set()
        for row in _extract_kcat_lines(result):
            # Results with a weird unusable unit, e.g. (s^-1)*(mg^-1), are not used
            unit = row["parameter.unit"]
            if unit not in UNIT_MULTIPLIER:
                continue

            # Get the searched ID
            ec_number = row[id_name]

            # Generate a lowercase and sorted list of the substrate names
            substrate_names = sorted(x.lower() for x in row["Substrate"].replace("+", "").split(";"))
            # Convert the substrate names into BIGG IDs. If a single substrate has
            # no name<->BIGG ID mapping, the pseudometabolite "REST" is used instead.
            if all(x in bigg_id_name_mapping for x in substrate_names):
                substrate_bigg_ids = [bigg_id_name_mapping[x] for x in substrate_names]
            else:
                substrate_bigg_ids = ["REST"]
            substrate_key = ";".join(substrate_bigg_ids)

            # Get the result's organism :D
            species = row["Organism"]
            # Get the kcat and set it to 1/s for consistent behaviour :D
            raw_kcat = float(row["parameter.startValue"])  # Without unit correction
            kcat = raw_kcat * UNIT_MULTIPLIER[unit]  # With unit correction

            # Add the result to the output for the given EC number, substrate and species
            output.setdefault(ec_number, {}).setdefault(substrate_key, {}).setdefault(species, []).append(kcat)
            # Since we found a result, remember the EC number :D
            found_ec_numbers.add(ec_number)

        # Create cache files for all newly found EC numbers which were not present
        # in the cache
        temp_ec_numbers_found_in_search = sorted(found_ec_numbers)
        for ec_number in temp_ec_numbers_found_in_search:
            cache_filename = ec_number.replace(".", "_") + ".json"
            if cache_filename not in cache_files:
                json_write(cache_basepath + cache_filename, output[ec_number])

        # Create a result for each wildcarded searched EC number out of the found
        # EC numbers which fit into the wildcard (i.e 1.1.1.123 in 1.1.1.*) :D
        for wildcarded_ec_number in (x for x in batch if "*" in x):
            cache_filename = wildcarded_ec_number.replace(".", "_").replace("*", "W") + ".json"
            # If the wildcarded EC number is in the cache, load the cache file :D
            if cache_filename in cache_files:
                wildcard_output[wildcarded_ec_number] = json_load(cache_basepath + cache_filename)
                print(f"Loading {cache_filename}...")
                continue

            # Otherwise, combine the entries of all fitting EC numbers into one
            # entry for the wildcarded EC number
            wildcard_level = wildcarded_ec_number.count("*")
            wildcarded_ec_number_dict: Dict[str, Any] = {}
            for found_ec_number in temp_ec_numbers_found_in_search:
                if not is_fitting_ec_numbers(wildcarded_ec_number, found_ec_number, wildcard_level):
                    continue
                for metabolite_key, organism_entries in output[found_ec_number].items():
                    if metabolite_key not in wildcarded_ec_number_dict:
                        # Copy the entry, as the following merges would otherwise
                        # change the entry of the found EC number in output (and
                        # the entries of other wildcarded EC numbers sharing it)
                        wildcarded_ec_number_dict[metabolite_key] = copy.deepcopy(organism_entries)
                        continue
                    merged_organism_entries = wildcarded_ec_number_dict[metabolite_key]
                    for organism_key, kcats in organism_entries.items():
                        combined_kcats = merged_organism_entries.get(organism_key, []) + kcats
                        # Each kcat value shall occur only once per organism
                        merged_organism_entries[organism_key] = list(set(combined_kcats))

            # Create cache files for the searched wildcarded EC numbers \o/
            if wildcarded_ec_number_dict != {}:
                json_write(cache_basepath + cache_filename, wildcarded_ec_number_dict)
                wildcard_output[wildcarded_ec_number] = wildcarded_ec_number_dict

    # If there are results for wildcarded IDs, they are the output
    if wildcard_output:
        return wildcard_output
    return output
def create_smoment_model_reaction_wise(
        model: cobra.Model,
        output_sbml_name: str,
        project_folder: str,
        project_name: str,
        excluded_reactions: List[str],
        type_of_default_kcat_selection: str = "median") -> None:
    """Adds proteomic constraints according to sMOMENT to the given stoichiometric model and stores it as SBML.

    Arguments
    ----------
    * model: cobra.Model ~ A cobra Model representation of the metabolic network. This model will
      be changed using cobrapy functions in order to add the proteomic constraints.
    * output_sbml_name: str ~ The base name of the created SBML.
    * project_folder: str ~ The folder in which the spreadsheets and JSONs with the model's
      supplemental data can be found.
    * project_name: str ~ The sMOMENTed model creation's name, which will be added at the beginning
      of the created SBML's name.
    * excluded_reactions: List[str] ~ A string list of reaction IDs (the 'reverse' and 'forward'
      name additions must not be added, i.e. for 'ACALD_forward' just 'ACALD' has to be given) to
      which no kcat shall be added. Typically used for gas exchange reactions such as 'CO2tex'.
    * type_of_default_kcat_selection: str ~ The type of selection of default kcat values. Can be
      "mean", "median" or "random". Is "median" by default. Any other value prints an error
      message and terminates the program with sys.exit(-1).

    Output
    ----------
    An SBML in the given folder with the given name, which describes the given stoichiometric model
    enhanced by the protein constraint introduction with this function.
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # Base path of the project's supplemental files, and the reaction ID addition which separates
    # additions such as 'forward'/'reverse' from the 'actual' reaction ID.
    basepath: str = project_folder + project_name
    id_addition: str = "_TG_"

    # Load the protein ID <-> mass mapping
    protein_id_mass_mapping: Dict[str, float] = json_load(
        basepath + "_protein_id_mass_mapping.json")
    # Load protein data XLSX
    protein_id_concentration_mapping, p_total, unmeasured_protein_fraction, mean_saturation = \
        read_protein_data_xlsx(basepath)
    # Read enzyme stoichiometries XLSX
    reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        read_enzyme_stoichiometries_xlsx(basepath)

    # Calculate p_measured
    p_measured = get_p_measured(protein_id_concentration_mapping, protein_id_mass_mapping)

    # Split reactions with measured enzymes
    model, reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        get_model_with_separated_measured_enzyme_reactions(
            model,
            protein_id_concentration_mapping,
            reaction_id_gene_rules_mapping,
            reaction_id_gene_rules_protein_stoichiometry_mapping,
            excluded_reactions,
            protein_id_mass_mapping)

    # Make model irreversible
    model = get_irreversible_model(model, id_addition)

    # Add prot_pool reaction according to the given protein pool values
    model, prot_pool_metabolite = add_prot_pool_reaction(
        model, id_addition, p_total, p_measured, unmeasured_protein_fraction, mean_saturation)

    # Read reaction <-> kcat mapping
    reactions_kcat_mapping_database = json_load(
        basepath + "_reactions_kcat_mapping_combined.json")

    # Collect all kcats which are not math.nan (forward values first, then reverse values) and
    # derive the default kcat from them.
    all_kcats = [x["forward"] for x in reactions_kcat_mapping_database.values()] + \
        [x["reverse"] for x in reactions_kcat_mapping_database.values()]
    all_kcats = [x for x in all_kcats if not math.isnan(x)]

    if type_of_default_kcat_selection == "median":
        default_kcat = statistics.median(all_kcats)
    elif type_of_default_kcat_selection == "mean":
        default_kcat = statistics.mean(all_kcats)
    elif type_of_default_kcat_selection == "random":
        default_kcat = random.choice(all_kcats)
    else:
        print(
            'ERROR: Argument type_of_default_kcat_selection must be either "median", "mean" or "random".'
        )
        sys.exit(-1)

    print(f"Default kcat is: {default_kcat}")

    # Snapshot the reaction IDs now, so that the delivery reactions added below are not
    # visited by the main loop.
    model_reaction_ids = [x.id for x in model.reactions]

    # Add measured enzyme pseudo-metabolites and pseudo-reactions; the measured concentration
    # is used as the upper bound of the delivery reaction.
    for protein_id in protein_id_concentration_mapping.keys():
        new_metabolite = cobra.Metabolite(
            id="ENZYME_" + protein_id,
            name="Pseudo-metabolite of protein " + protein_id,
            compartment="sMOMENT")
        max_protein_concentration = protein_id_concentration_mapping[protein_id]
        new_reaction = cobra.Reaction(
            id="ENZYME_DELIVERY_" + protein_id,
            name="Delivery reaction of pseudo-metabolite " + protein_id,
            lower_bound=0,
            upper_bound=max_protein_concentration)
        new_reaction.add_metabolites({new_metabolite: 1})
        model.add_reactions([new_reaction])

    # Main loop: add enzyme constraints to reactions
    for model_reaction_id in model_reaction_ids:
        reaction = model.reactions.get_by_id(model_reaction_id)

        # The 'actual' reaction ID is everything before the ID addition
        reaction_id = reaction.id.split(id_addition)[0]
        # If the reaction has no name, ignore it
        if reaction_id == "":
            continue
        # Remove GPRSPLIT name addition from reactions with measured protein concentrations
        if "_GPRSPLIT_" in reaction_id:
            reaction_id = reaction_id.split("_GPRSPLIT_")[0]

        # If the reaction has no associated enzyme stoichiometries, ignore it
        if reaction_id not in reaction_id_gene_rules_mapping:
            continue
        # If the reaction has no gene rule, ignore it
        gene_rule = reaction_id_gene_rules_mapping[reaction_id]
        if gene_rule == [""]:
            continue
        # If the reaction is manually excluded, ignore it
        if reaction_id in excluded_reactions:
            continue

        # Check if all proteins in the reaction's gene rule have a found mass.
        # This is not the case for e.g. spontaneous reactions which often get
        # the pseudo-enzyme 's0001'.
        all_available = True
        for enzyme in gene_rule:
            if isinstance(enzyme, str):
                if enzyme not in protein_id_mass_mapping:
                    print(enzyme)
                    all_available = False
                    break
            else:
                for enzyme_id in enzyme:
                    if enzyme_id not in protein_id_mass_mapping:
                        all_available = False
                        break
        # If not all of the mass-checked enzymes have a found mass, ignore this reaction
        if not all_available:
            continue

        # Retrieve the reaction's forward and reverse kcats from the reaction<->kcat database,
        # falling back to the default kcat for missing reactions...
        if reaction_id in reactions_kcat_mapping_database:
            kcat_entry = reactions_kcat_mapping_database[reaction_id]
            forward_kcat = kcat_entry["forward"]
            reverse_kcat = kcat_entry["reverse"]
        else:
            forward_kcat = default_kcat
            reverse_kcat = default_kcat
        # ...and for math.nan entries, which mean that no kcat could be found.
        if math.isnan(forward_kcat):
            forward_kcat = default_kcat
        if math.isnan(reverse_kcat):
            reverse_kcat = default_kcat

        # Only the reverse part of a split reversible reaction gets the reverse kcat; forward
        # parts and non-split (irreversible) reactions get the forward kcat.
        if model_reaction_id.endswith(id_addition + "reverse"):
            reaction_kcat = reverse_kcat
        else:
            reaction_kcat = forward_kcat
        # kcat converted from 1/s to 1/h
        kcat_per_hour = reaction_kcat * 3600

        unit_numbers = reaction_id_gene_rules_protein_stoichiometry_mapping[reaction_id]

        # Candidate protein pool stoichiometries (one per isozyme / isozyme complex)...
        stoichiometries: List[float] = []
        # ...and, at the same index, the "ID;units" names (space-separated for complexes) which
        # are used for the console report.
        stoichiometry_enzyme_name_list: List[str] = []
        for isozyme_id in gene_rule:
            if isinstance(isozyme_id, str):
                # Single enzyme: units * (mass / 1000) / kcat[1/h], negative as it is consumed.
                # NOTE(review): the / 1000 is a mass unit conversion matching fluxes in
                # mmol/(gDW*h); confirm the unit of the stored masses.
                number_units = unit_numbers[isozyme_id][isozyme_id]
                stoichiometry = number_units
                stoichiometry *= (protein_id_mass_mapping[isozyme_id] / 1000)
                stoichiometry /= kcat_per_hour
                stoichiometry *= -1
                stoichiometries.append(stoichiometry)
                stoichiometry_enzyme_name_list.append(isozyme_id + ";" + str(number_units))

                # Add proteomics constraints: a measured enzyme is additionally drawn from
                # its own pseudo-metabolite.
                if isozyme_id in protein_id_concentration_mapping:
                    enzyme_pseudo_metabolite = model.metabolites.get_by_id(
                        "ENZYME_" + isozyme_id)
                    stoichiometry = unit_numbers[isozyme_id][isozyme_id]
                    stoichiometry *= 1 / kcat_per_hour
                    stoichiometry *= -1
                    reaction.add_metabolites({enzyme_pseudo_metabolite: stoichiometry})
            else:
                # Enzyme complex: the complex IDs are converted to a hashable tuple, which is
                # the key used in the stoichiometry mapping.
                isozyme_id = tuple(isozyme_id)
                stoichiometry = 0
                name_parts = ""
                for single_id in isozyme_id:
                    number_units = unit_numbers[isozyme_id][single_id]
                    # Same calculation as for single enzymes, summed up over the complex
                    single_stoichiometry = number_units
                    single_stoichiometry *= (protein_id_mass_mapping[single_id] / 1000)
                    single_stoichiometry /= kcat_per_hour
                    single_stoichiometry *= -1
                    stoichiometry += single_stoichiometry
                    name_parts += single_id + ";" + str(number_units) + " "
                stoichiometry_enzyme_name_list.append(name_parts.rstrip())
                stoichiometries.append(stoichiometry)

                # Add proteomics constraints for every measured member of the complex
                for single_id in isozyme_id:
                    if single_id in protein_id_concentration_mapping:
                        enzyme_pseudo_metabolite = model.metabolites.get_by_id(
                            "ENZYME_" + single_id)
                        stoichiometry = unit_numbers[isozyme_id][single_id]
                        stoichiometry *= 1 / kcat_per_hour
                        stoichiometry *= -1
                        reaction.add_metabolites({enzyme_pseudo_metabolite: stoichiometry})

        # Take the maximal stoichiometry (i.e., the least negative one and therefore the one
        # with the least protein cost).
        max_stoichiometry = max(stoichiometries)
        reaction.add_metabolites({prot_pool_metabolite: max_stoichiometry})
        selected_enzyme = stoichiometry_enzyme_name_list[stoichiometries.index(max_stoichiometry)]

        # Print report of selected kcat and molecular weight for this reaction
        print("Reaction: ", model_reaction_id)
        print("Selected kcat: ", reaction_kcat)
        print("Selected molecular weight (kDa): ", end="")
        if " " in selected_enzyme:  # Multiple enzymes
            mass_sum = .0
            for single_enzyme in selected_enzyme.split(" "):
                enzyme_name = single_enzyme.split(";")[0]
                enzyme_unit_number = float(single_enzyme.split(";")[1])
                mass_sum += protein_id_mass_mapping[enzyme_name] * enzyme_unit_number
            print(mass_sum)
        else:  # Single enzyme
            enzyme_name = selected_enzyme.split(";")[0]
            enzyme_unit_number = float(selected_enzyme.split(";")[1])
            print(protein_id_mass_mapping[enzyme_name] * enzyme_unit_number)

    # Output as SBML (without constraints due to cobrapy limitations)
    cobra.io.write_sbml_model(model, project_folder + output_sbml_name)
def parse_brenda_textfile(brenda_textfile_path: str, bigg_metabolites_json_folder: str,
                          json_output_path: str) -> None:
    """Goes through a BRENDA database textfile and converts it into a machine-readable JSON.

    The JSON includes kcats for found organisms and substrates.
    As of 29/04/2019, the BRENDA database can be downloaded as textfile under
    https://www.brenda-enzymes.org/download_brenda_without_registration.php

    The BRENDA database is not in a completely standardized format, so that this function
    contains many convoluted checks and circumventions of non-standardized data.

    kcats from mutated enzymes are excluded.

    Arguments
    ----------
    * brenda_textfile_path: str ~ The BRENDA database text file path
    * bigg_metabolites_json_folder: str ~ The folder in which the BIGG metabolites
      database is stored (it has to have the name 'bigg_id_name_mapping.json').
    * json_output_path: str ~ The path of the JSON that shall be created

    Output
    ----------
    * A JSON containing the BRENDA textfile kcat data in a machine-readable format:
    <pre>
        {
            "$EC_NUMBER": {
                "$SUBSTRATE_WITH_BIGG_ID_1": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                },
                (...),
                "REST": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                }
            }
            (...),
        }
    </pre>
    'REST' stands for a substrate without found BIGG ID. EC numbers which were transferred
    to another EC number only get a "TRANSFER" entry naming the target.
    """
    # Standardize output folder
    bigg_metabolites_json_folder = standardize_folder(bigg_metabolites_json_folder)

    # Load BIGG ID <-> metabolite name mapping
    bigg_id_name_mapping: Dict[str, str] = json_load(
        bigg_metabolites_json_folder + "bigg_id_name_mapping.json")

    # Load BRENDA textfile as list of strings without newlines
    with open(brenda_textfile_path, "r", encoding="utf-8") as f:
        lines = [x.replace("\n", "") for x in f.readlines()]

    # Go through each line and collect the organism lines and kcat lines for each EC number
    in_turnover_numbers = False
    in_organism_reference = False
    ec_number_kcat_lines_mapping: Dict[str, List[str]] = {}
    ec_number_organsism_lines_mapping: Dict[str, List[str]] = {}
    current_ec_number = ""
    organism_lines: List[str] = []
    kcat_lines: List[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("ID\t"):
            # A new EC number entry starts: store the lines collected for the previous one
            if current_ec_number != "":
                ec_number_organsism_lines_mapping[current_ec_number] = organism_lines
                ec_number_kcat_lines_mapping[current_ec_number] = kcat_lines
            current_ec_number = line.replace("ID\t", "").replace(" ()", "")
            organism_lines = []
            kcat_lines = []

        if len(line) == 0:
            # An empty line ends the current section
            in_turnover_numbers = False
            in_organism_reference = False
        elif line.startswith("PROTEIN"):
            # Section header: continue with the line after it
            in_organism_reference = True
            i += 1
            if i >= len(lines):
                # The header is the file's last line, there is nothing left to read
                break
            line = lines[i]
        elif line.startswith("TURNOVER_NUMBER"):
            in_turnover_numbers = True
            i += 1
            if i >= len(lines):
                break
            line = lines[i]

        if in_organism_reference:
            # A line starting with "PR" opens a new record, every other line continues
            # the current record
            if line.startswith("PR"):
                organism_lines.append("")
            if len(organism_lines[-1]) > 0:
                organism_lines[-1] += " "
            organism_lines[-1] += " " + line
        elif in_turnover_numbers:
            if line.startswith("TN"):
                kcat_lines.append("")
            if len(kcat_lines[-1]) > 0:
                kcat_lines[-1] += " "
            kcat_lines[-1] += line

        # The line may have been advanced after a section header, so check again
        if len(line) == 0:
            in_turnover_numbers = False
            in_organism_reference = False

        i += 1

    # Store the last EC number of the file too; inside the loop, an entry is only stored
    # when the next "ID" line is reached, which never happens for the final entry.
    if current_ec_number != "":
        ec_number_organsism_lines_mapping[current_ec_number] = organism_lines
        ec_number_kcat_lines_mapping[current_ec_number] = kcat_lines

    # Create the BRENDA database dictionary using the collected kcat and organism lines
    # of each EC number
    ec_numbers = list(ec_number_kcat_lines_mapping.keys())
    brenda_kcat_database: Dict[str, Any] = {}
    for ec_number in ec_numbers:
        if "(transferred to " in ec_number:
            actual_ec_number = ec_number.split(" (transferred")[0]
            try:
                brenda_kcat_database[actual_ec_number] = {}
                # Double spaces are collapsed so that the "(transferred to ec" marker is
                # found even with irregular spacing.
                brenda_kcat_database[actual_ec_number]["TRANSFER"] = \
                    ec_number.lower().replace("  ", " ").split(
                        "(transferred to ec")[1].replace(")", "").lstrip()
            except IndexError:
                # Some transfers go to general subgroups instead of single EC numbers so that
                # no kcat database can be built from it
                print("WARNING: BRENDA text file line " + ec_number + " is not interpretable!")
            # Transferred EC numbers are not parsed for kcats
            continue

        brenda_kcat_database[ec_number] = {}

        # Map the reference number of each organism line (the text between the first two '#')
        # to the organism name
        reference_number_organism_mapping = {}
        for organism_line in ec_number_organsism_lines_mapping[ec_number]:
            reference_number = organism_line.split("#")[1]
            organism_line_split = [
                x for x in organism_line.split("# ")[1].split(" ") if len(x) > 0
            ]

            # Determine how many leading words belong to the organism name
            end = 1
            for part in organism_line_split:
                lowered_part = part.lower()
                # Some organism names contain their SwissProt or UniProt ID,
                # since we don't need them they are excluded
                if ("swissprot" in lowered_part) or \
                   (lowered_part == "and") or \
                   ("uniprot" in lowered_part) or \
                   ("genbank" in lowered_part) or \
                   ("trembl" in lowered_part):
                    end -= 2
                    break
                if ("<" in part) or ("(" in part):
                    end -= 1
                    break
                end += 1
            organism_name = " ".join(organism_line_split[:end])
            reference_number_organism_mapping[reference_number] = organism_name

        for kcat_line in ec_number_kcat_lines_mapping[ec_number]:
            # Exclude kcats of mutated/changed proteins since
            # they may not have a biological relevance
            lowered_kcat_line = kcat_line.lower()
            if ("mutant" in lowered_kcat_line) or ("mutated" in lowered_kcat_line):
                continue

            # Only the first reference number of the line is used to find the organism
            reference_number = kcat_line.split("#")[1].split(",")[0]
            organism = reference_number_organism_mapping[reference_number]

            # The kcat stands between the second '#' and the '{' of the substrate; for a
            # range written as "a-b", the maximum is taken
            kcat_str = kcat_line.split("#")[2].split("{")[0].strip()
            kcat = max([float(x) for x in kcat_str.split("-") if len(x) > 0])

            # The substrate stands in the first pair of curly brackets
            substrate = kcat_line.split("{")[1].split("}")[0].lower()
            if substrate in bigg_id_name_mapping:
                substrate = bigg_id_name_mapping[substrate]
            else:
                substrate = "REST"

            if substrate not in brenda_kcat_database[ec_number]:
                brenda_kcat_database[ec_number][substrate] = {}
            if organism not in brenda_kcat_database[ec_number][substrate]:
                brenda_kcat_database[ec_number][substrate][organism] = []
            brenda_kcat_database[ec_number][substrate][organism].append(kcat)

    # Write final BRENDA kcat database
    json_write(json_output_path, brenda_kcat_database)
def create_combined_kcat_database(sabio_rk_kcat_database_path: str, brenda_kcat_database_path: str,
                                  output_path: str) -> None:
    """Creates a combined JSON of the given SABIO-RK and BRENDA kcat databases with non-wildcard entries only.

    The EC numbers of the BRENDA database are iterated, and each of them is looked up in the
    SABIO-RK database by direct indexing, i.e. both databases are expected to contain every
    one of these EC numbers together with a "WILDCARD" entry.

    Arguments
    ----------
    * sabio_rk_kcat_database_path: str ~ The path to the SABIO-RK kcat database JSON
    * brenda_kcat_database_path: str ~ The path to the BRENDA kcat database JSON
    * output_path: str ~ The output path (with filename) of the generated combined kcat database JSON

    Output
    ----------
    A JSON with the following format:
    <pre>
    {
        '$EC_NUMBER': {
            '$BIGG_IDS_OF_SUBSTRATES': {
                '$ORGANISM': {
                    kcat: float
                },
                (...)
            },
            (...),
            'SOURCE': 'SABIO-RK' or 'BRENDA' or 'BRENDA and SABIO-RK',
            'WILDCARD': false
        },
        (...)
    }
    </pre>
    """
    # Load the two given databases as JSONs
    sabio_rk_database = json_load(sabio_rk_kcat_database_path)
    brenda_database = json_load(brenda_kcat_database_path)

    # Set-up combined kcat database dictionary
    combined_database: Dict[str, Dict[str, Any]] = {}

    # Go through each EC number (BRENDA contains all relevant EC numbers)...
    for ec_number_key in brenda_database:
        sabio_rk_ec_entry = sabio_rk_database[ec_number_key]
        brenda_ec_entry = brenda_database[ec_number_key]

        # Get the wildcard status (i.e., found with a * wildcard?)
        is_sabio_rk_from_wildcard: bool = sabio_rk_ec_entry["WILDCARD"]
        is_brenda_from_wildcard: bool = brenda_ec_entry["WILDCARD"]

        # If both are from wildcards, ignore them
        if is_sabio_rk_from_wildcard and is_brenda_from_wildcard:
            continue

        if (not is_sabio_rk_from_wildcard) and (not is_brenda_from_wildcard):
            # Both are not from wildcards, so combine them. The metabolite keys are visited in
            # first-seen order (SABIO-RK, then BRENDA) so that the written JSON is the same in
            # every run.
            combined_entry: Dict[str, Any] = {}
            metabolite_keys = dict.fromkeys(list(sabio_rk_ec_entry) + list(brenda_ec_entry))
            for metabolite_key in metabolite_keys:
                # The WILDCARD key is no metabolite
                if metabolite_key == "WILDCARD":
                    continue

                is_metabolite_in_brenda = metabolite_key in brenda_ec_entry
                is_metabolite_in_sabio_rk = metabolite_key in sabio_rk_ec_entry
                if is_metabolite_in_brenda and is_metabolite_in_sabio_rk:
                    # For an organism contained in both databases, the BRENDA values replace
                    # the SABIO-RK values
                    combined_entry[metabolite_key] = {
                        **sabio_rk_ec_entry[metabolite_key],
                        **brenda_ec_entry[metabolite_key],
                    }
                elif is_metabolite_in_brenda:
                    combined_entry[metabolite_key] = brenda_ec_entry[metabolite_key]
                else:
                    combined_entry[metabolite_key] = sabio_rk_ec_entry[metabolite_key]
            combined_entry["WILDCARD"] = is_sabio_rk_from_wildcard
            combined_entry["SOURCE"] = "BRENDA and SABIO-RK"
        elif not is_sabio_rk_from_wildcard:
            # Only the SABIO-RK entry does not come from a wildcard, so use it
            combined_entry = sabio_rk_ec_entry
            combined_entry["WILDCARD"] = False
            combined_entry["SOURCE"] = "SABIO-RK"
        else:
            # Only the BRENDA entry does not come from a wildcard, so use it
            combined_entry = brenda_ec_entry
            combined_entry["WILDCARD"] = False
            combined_entry["SOURCE"] = "BRENDA"

        combined_database[ec_number_key] = combined_entry

    json_write(output_path, combined_database)