def parse_species_from_pathway(data_dir, p_idx_l=None, init_spe=60,
                               atom_followed="C", end_t=1.0, species_path=False):
    """
    parse unique species list from pathway list
    """
    if p_idx_l is None:
        return
    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"
    f_n_path_name = os.path.join(
        data_dir, "output", prefix + "pathway_name_candidate" + suffix + ".csv")
    p_name = np.genfromtxt(f_n_path_name, dtype=str, delimiter=',')

    unique_spe = set()
    for p_idx in p_idx_l:
        # species appear in pathway names as "S<index>"
        matched_spe = re.findall(r"S(\d+)", p_name[p_idx])
        for s_idx in matched_spe:
            unique_spe.add(s_idx)

    return unique_spe
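
# A minimal usage sketch (hypothetical job directory and path indices; assumes
# output/pathway_name_candidate*.csv has already been generated under data_dir):
#
#     spe_set = parse_species_from_pathway(
#         "./demo_job", p_idx_l=[0, 1, 2], init_spe=62,
#         atom_followed="C", end_t=1.0)
#     # spe_set holds species indices as strings, e.g. {'62', '10', '25'}
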
def path_prob_terminating_with_spe(data_dir, init_spe=62, atom_followed="C",
                                   tau=10.0, end_t=1.0, species_path=True,
                                   end_s_idx=None, exclude_idx=None, time_axis=0):
    """
    get pathways and their pathway probabilities, path ending with spe
    """
    if exclude_idx is None:
        exclude_idx = []
    prefix = ""
    if species_path is True:
        prefix = "species_"
    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    f_n_pn = os.path.join(data_dir, "output",
                          prefix + "pathway_name_candidate" + suffix + ".csv")
    f_n_pp = os.path.join(data_dir, "output",
                          prefix + "pathway_prob" + suffix + ".csv")
    path_names = np.loadtxt(f_n_pn, dtype=str, delimiter=",")
    data_pp = np.loadtxt(f_n_pp, dtype=float, delimiter=",")

    dim_n = len(np.shape(data_pp))
    if dim_n == 1:
        data_y = data_pp
    elif dim_n == 2:
        # each pathway as a row, one column per time point
        data_y = data_pp[:, time_axis]
    else:
        raise ValueError("unexpected pathway probability dimension: " + str(dim_n))

    d_f_n = pd.DataFrame(path_names, columns=['name'], dtype=str)
    d_f_p = pd.DataFrame(data_y, columns=['frequency'], dtype=float)
    d_f = pd.concat([d_f_n, d_f_p], axis=1)
    # print(d_f.head())

    # filter by terminal species
    if end_s_idx is not None:
        if isinstance(end_s_idx, int):
            d_f = d_f.loc[
                lambda x: x['name'].str.endswith("S" + str(end_s_idx))]
        elif isinstance(end_s_idx, list):
            # str.endswith accepts a tuple of candidates, not a list
            mask_str = tuple("S" + str(e_s) for e_s in end_s_idx)
            d_f = d_f.loc[lambda x: x['name'].str.endswith(mask_str)]

    d_f.sort_values(by='frequency', inplace=True, ascending=False)
    d_f.reset_index(drop=True, inplace=True)
    print(d_f)

    return d_f
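
# A usage sketch (illustrative arguments; the CSV layout under data_dir/output
# is assumed to exist): rank pathways that terminate at species 25 or 31,
# then keep the ten most probable names:
#
#     d_f = path_prob_terminating_with_spe(
#         "./demo_job", init_spe=62, atom_followed="C", end_t=1.0,
#         species_path=False, end_s_idx=[25, 31])
#     top_names = d_f['name'][0:10]
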
def calculate_Merchant_alpha_value(data_dir, init_spe=10, atom_followed="C",
                                   end_t=1.0, species_path=False,
                                   s_idx=10, r_idx=736):
    """
    calculate Merchant alpha value at the time points listed in time.csv,
    not at time zero
    """
    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"

    spe_conc_mat = np.loadtxt(os.path.join(
        data_dir, "output", "concentration_dlsode_M.csv"),
        dtype=float, delimiter=',')
    spe_k_mat = np.loadtxt(os.path.join(
        data_dir, "output", "drc_dlsode_M.csv"),
        dtype=float, delimiter=',')
    reaction_rate_mat = np.loadtxt(os.path.join(
        data_dir, "output", "reaction_rate_dlsode_M.csv"),
        dtype=float, delimiter=',')

    n_points = np.shape(spe_conc_mat)[0]
    spe_total_sink_rate_vec = np.zeros(n_points)
    merchant_alpha_v = np.zeros(n_points)

    for i in range(1, n_points):
        # total sink rate of species s_idx = concentration * destruction rate constant
        spe_total_sink_rate_vec[i] = spe_conc_mat[i, s_idx] * spe_k_mat[i, s_idx]
        if spe_total_sink_rate_vec[i] > 0:
            merchant_alpha_v[i] = reaction_rate_mat[
                i, r_idx] / spe_total_sink_rate_vec[i]
    # set the value at time zero to be the same as the value at time one
    merchant_alpha_v[0] = merchant_alpha_v[1]

    merchant_alpha_fn = os.path.join(
        data_dir, "output",
        prefix + "Merchant_alpha_" + "S" + str(s_idx) +
        "_R" + str(r_idx) + suffix + ".csv")
    np.savetxt(merchant_alpha_fn, merchant_alpha_v, fmt='%.15e')

    return
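
# What this computes, as read from the code above: at every time point i,
# alpha = reaction_rate[i, r_idx] / (conc[i, s_idx] * k[i, s_idx]), i.e. the
# fraction of species s_idx's total sink rate that flows through reaction r_idx.
# Usage sketch (s_idx and r_idx are illustrative):
#
#     calculate_Merchant_alpha_value(
#         "./demo_job", init_spe=62, atom_followed="C", end_t=1.0,
#         s_idx=10, r_idx=736)
#     # writes output/Merchant_alpha_S10_R736<suffix>.csv
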
def parse_pathway_contains_species(data_dir, s_idx_ds=None, init_spe=60,
                                   atom_followed="C", end_t=1.0, species_path=False):
    """
    parse pathways that contain only species from a list or set
    ds --> data structure
    """
    if s_idx_ds is None:
        return
    # regex matches yield species indices as strings; normalize the candidate
    # set so integer input still compares correctly
    s_idx_set = set(str(s) for s in s_idx_ds)
    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"
    f_n_path_name = os.path.join(
        data_dir, "output", prefix + "pathway_name_candidate" + suffix + ".csv")
    p_name = np.genfromtxt(f_n_path_name, dtype=str, delimiter=',')

    path_list = []
    for idx, path in enumerate(p_name):
        # contains species only from the species list
        c_s_o_f_s_l = True
        matched_spe = re.findall(r"S(\d+)", path)
        for s_idx in matched_spe:
            if s_idx not in s_idx_set:
                c_s_o_f_s_l = False
                break
        if c_s_o_f_s_l is True:
            path_list.append(idx)

    print(path_list)
    return path_list
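
# Usage sketch (indices illustrative): collect the row indices of candidate
# pathways built exclusively from species 62, 10 and 25:
#
#     idx_list = parse_pathway_contains_species(
#         "./demo_job", s_idx_ds=[62, 10, 25], init_spe=62,
#         atom_followed="C", end_t=1.0)
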
def path_length_statistics(data_dir, init_spe=62, atom_followed="C",
                           end_t=1.0, end_spe=None):
    """
    path length statistics
    """
    # use keyword arguments so end_t and end_spe are not mis-bound to the
    # tau/end_t parameters of path_prob_terminating_with_spe
    d_f = path_prob_terminating_with_spe(data_dir, init_spe=init_spe,
                                         atom_followed=atom_followed,
                                         end_t=end_t, end_s_idx=end_spe)
    count_map = OrderedDict()
    # histogram the path lengths of the 20 most probable pathways
    for _, val in enumerate(d_f['name'][0:20]):
        count = int(parse_pattern.parse_path_length(val))
        if count in count_map:
            count_map[count] += 1
        else:
            count_map[count] = 1

    mat = []
    count_map = OrderedDict(sorted(count_map.items()))
    for key, value in count_map.items():
        mat.append([int(key), int(value)])

    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    if end_spe is not None:
        suffix += "_S" + str(end_spe)
    out_f_n = os.path.join(data_dir, "output",
                           "path_length" + suffix + ".csv")
    np.savetxt(out_f_n, mat, fmt='%d', delimiter=',', newline='\n',
               header='', footer='', comments='# ')
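
# Usage sketch (arguments illustrative):
#
#     path_length_statistics("./demo_job", init_spe=62, atom_followed="C",
#                            end_t=1.0, end_spe=25)
#     # writes output/path_length<suffix>_S25.csv with one (length, count) row
#     # per observed path length
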
def pathway_time_2_array_index(data_dir, init_spe=None, atom_followed="C",
                               end_t=1.0, species_path=False, time=1.0):
    """
    convert a pathway time to an array index; pathway times are read
    from pathway_time_candidate*
    """
    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"
    f_n_path_time = os.path.join(
        data_dir, "output", prefix + "pathway_time_candidate" + suffix + ".csv")
    p_time = np.genfromtxt(f_n_path_time, dtype=float, delimiter=',')
    # in case of two-dimensional pathway time, all rows share the same axis
    if len(np.shape(p_time)) == 2:
        p_time = p_time[0, :]

    # interpolate the index as a function of time, then clamp to a valid range
    y_idx = [float(i) for i in range(len(p_time))]
    array_idx = int(interpolation.interp1d(p_time, y_idx, time))
    if array_idx >= len(p_time):
        array_idx = len(p_time) - 1
    if array_idx < 0:
        array_idx = 0

    return array_idx
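
# Usage sketch: map t = 0.5 (in the units used by pathway_time_candidate*.csv)
# to the nearest array index (job directory and arguments illustrative):
#
#     idx = pathway_time_2_array_index("./demo_job", init_spe=62,
#                                      atom_followed="C", end_t=1.0, time=0.5)
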
def parse_spe_production_along_path(data_dir, top_n=10, spe_idx=10, init_spe=62,
                                    atom_followed="C", end_t=1.0, species_path=False,
                                    axis=0, path_branching_factor=False,
                                    s_consumption=False, s_production=True):
    """
    parse species production along a path; note the species might not
    explicitly show up on the path but can be side products of reactions
    on the pathway; only the top_n candidate pathways are used
    """
    id_tmp = ""
    if spe_idx is None or spe_idx == []:
        return
    elif isinstance(spe_idx, int):
        id_tmp = str(spe_idx)
        spe_idx = [spe_idx]
    else:
        for x_t in spe_idx:
            if id_tmp == "":
                id_tmp = str(x_t)
            else:
                id_tmp += "_" + str(x_t)

    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"
    f_n_path_name = os.path.join(
        data_dir, "output", prefix + "pathway_name_candidate" + suffix + ".csv")
    pathname_data = np.genfromtxt(
        f_n_path_name, dtype=str, delimiter=',', max_rows=top_n + 1)
    # in case of two-dimensional pathway name
    if len(np.shape(pathname_data)) == 2:
        pathname_data = pathname_data[:, axis]

    net_reactant = psri.parse_reaction_net_reactant(data_dir)
    net_product = psri.parse_reaction_net_product(data_dir)
    s_p_r_c = psri.parse_species_pair_reaction(data_dir)
    if path_branching_factor is True:
        atom_scheme = asch.get_atom_scheme(data_dir)
        s_idx_name, _ = psri.parse_spe_info(data_dir)

    s_p_c = []
    for _, p_n in enumerate(pathname_data):
        spe_consumption_count = 0
        spe_production_count = 0
        for s_i in spe_idx:
            if s_consumption is True:
                spe_consumption_count += parse_pattern.parse_species_along_path_using_reaction(
                    p_n, net_reactant, s_i, s_p_r_c)
            if s_production is True:
                spe_production_count += parse_pattern.parse_species_along_path_using_reaction(
                    p_n, net_product, s_i, s_p_r_c)

        path_branching_number = 1
        if path_branching_factor is True:
            path_branching_number = parse_pattern.calculate_path_branching_number(
                pathname=p_n, net_reactant=net_reactant, net_product=net_product,
                s_idx_name=s_idx_name, atom_scheme=atom_scheme,
                atom_followed=atom_followed)

        s_p_c.append((spe_production_count - spe_consumption_count)
                     * path_branching_number)

    if id_tmp != "":
        suffix += "_" + id_tmp
    f_n_spe_production_count = os.path.join(
        data_dir, "output",
        prefix + "pathway_species_production_count" + suffix + ".csv")
    np.savetxt(f_n_spe_production_count, s_p_c, fmt='%d')
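
# Usage sketch: net production counts of species 10 along the top 50 paths,
# weighted by each path's branching number (arguments illustrative):
#
#     parse_spe_production_along_path(
#         "./demo_job", top_n=50, spe_idx=10, init_spe=62,
#         atom_followed="C", end_t=1.0, path_branching_factor=True,
#         s_consumption=True, s_production=True)
#     # writes output/pathway_species_production_count<suffix>_10.csv
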
def init_directed_network(data_dir, path_idx=None, init_spe=None, atom_followed="C",
                          end_t=None, species_path=False, time_axis=0):
    """
    initialize a directed network without parallel edges
    return networkx.DiGraph
    """
    spe_idx_name_dict, _ = psri.parse_spe_info(data_dir)

    suffix = naming.get_suffix(data_dir, init_spe=init_spe,
                               atom_followed=atom_followed, end_t=end_t)
    prefix = ""
    if species_path is True:
        prefix = "species_"
    f_n_path_name = os.path.join(
        data_dir, "output", prefix + "pathway_name_candidate" + suffix + ".csv")
    f_n_path_prob = os.path.join(
        data_dir, "output", prefix + "pathway_prob" + suffix + ".csv")
    print(f_n_path_name, f_n_path_prob)

    p_n = np.genfromtxt(f_n_path_name, dtype=str, delimiter=',')
    p_p = np.genfromtxt(f_n_path_prob, dtype=float, delimiter=',')
    # in case of two-dimensional pathway name/probability
    if len(np.shape(p_n)) == 2:
        p_n = p_n[:, time_axis]
    if len(np.shape(p_p)) == 2:
        p_p = p_p[:, time_axis]

    # retrieve pathway name and pathway probability before sorting;
    # default to all pathways when no selection is given
    if path_idx is None:
        path_idx = range(len(p_n))
    p_n = [p_n[i] for i in path_idx]
    p_p = [p_p[i] for i in path_idx]

    # set the data types separately
    d_f_n = pd.DataFrame(p_n, columns=['name'], dtype=str)
    d_f_p = pd.DataFrame(p_p, columns=['prob'], dtype=float)
    d_f = pd.concat([d_f_n, d_f_p], axis=1)
    d_f.sort_values(by='prob', ascending=False,
                    inplace=True, na_position='last')
    d_f.reset_index(drop=True, inplace=True)
    print(d_f.head())

    # temporary directed graph
    d_g_tmp = nx.DiGraph()

    # modify labels
    spe_union_find_group = global_settings.get_union_find_group(
        data_dir, atom_followed)

    # record all nodes
    nodes = set()
    for _, val in d_f.iterrows():
        matched_spe = re.findall(r"S(\d+)", val['name'])
        for spe in matched_spe:
            nodes.add(change_spe_name(
                spe, spe_idx_name_dict, union_find=spe_union_find_group))

    for val in nodes:
        d_g_tmp.add_node(val, weight=0.0, label=str(val))

    for _, val in d_f.iterrows():
        prob = float(val['prob'])
        # get rid of pseudo-reaction segments such as R-1000003S90,
        # they are not needed here
        print(val['name'])
        path_name_tmp = re.sub(r"R-\d+S\d+", r'', val['name'])
        print(path_name_tmp)

        matched_spe = re.findall(r"S(\d+)", path_name_tmp)
        if species_path is False:
            # pathway contains both reactions and species
            matched_reaction = re.findall(r"R(\d+)", path_name_tmp)
        for idx, spe in enumerate(matched_spe):
            d_g_tmp.nodes[change_spe_name(
                spe, spe_idx_name_dict,
                union_find=spe_union_find_group)]['weight'] += 1.0 * prob
            if idx > 0:
                src = change_spe_name(matched_spe[idx - 1], spe_idx_name_dict,
                                      union_find=spe_union_find_group)
                dest = change_spe_name(spe, spe_idx_name_dict,
                                       union_find=spe_union_find_group)
                if species_path is False:
                    rxn = change_rxn_name(matched_reaction[idx - 1])
                else:
                    # species-only pathways carry no reaction label
                    rxn = '-1'
                if d_g_tmp.has_edge(src, dest):
                    d_g_tmp[src][dest]['weight'] += 1.0 * prob
                    d_g_tmp[src][dest]['reactions'].add(rxn)
                else:
                    d_g_tmp.add_edge(src, dest,
                                     reactions=set([rxn]), weight=1.0 * prob)

    # update the directed graph, for example
    # 1. reactions is originally a set, combine to get a string of reactions
    # 2. smooth and re-normalize node weight
    # 3. re-normalize edge weight
    node_weight = []
    for _, val in enumerate(d_g_tmp.nodes()):
        node_weight.append(d_g_tmp.nodes[val]['weight'])
    edge_weight = []
    for _, val in enumerate(d_g_tmp.edges()):
        edge_weight.append(d_g_tmp[val[0]][val[1]]['weight'])
    node_weight = rescale_array(node_weight, 1.0, 5.0)
    edge_weight = rescale_array(edge_weight, 3.0, 15.0)

    # final directed graph
    di_graph = nx.DiGraph()
    for idx, val in enumerate(d_g_tmp.nodes()):
        di_graph.add_node(val, weight=node_weight[idx])
    for idx, val in enumerate(d_g_tmp.edges()):
        src, dest = val
        rxn_set = d_g_tmp[src][dest]['reactions']
        rxn_set = sorted(rxn_set, key=lambda x: int(x), reverse=False)
        name = ",".join(x for x in rxn_set)
        weight = edge_weight[idx]
        di_graph.add_edge(src, dest, name=name, weight=weight)

    return di_graph
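
# Usage sketch: build the network from the 25 most probable pathways and dump
# it to GraphML for an external layout tool (file name illustrative):
#
#     d_g = init_directed_network("./demo_job", path_idx=list(range(25)),
#                                 init_spe=62, atom_followed="C", end_t=1.0)
#     nx.write_graphml(d_g, os.path.join("./demo_job", "output",
#                                        "network.graphml"))
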