def fProximity(A, B=None, zeroDiag=True):
    '''
    Return the proximity (similarity x correlation) between vectors.

    For a pair of vectors the proximity is ``dif * sim`` where ``dif`` is one
    minus the mean absolute difference and ``sim`` is the cosine similarity.
    Where ``sim`` is not finite (e.g. a zero-norm vector) ``dif`` alone is used.

    Supported call shapes:
    - ``fProximity(A)`` with 2D ``A``: 2D array of proximities between the rows
      of ``A``; only the strict upper triangle is filled, the diagonal and the
      symmetric (lower) terms are left at zero.
    - 1D ``A`` with 2D ``B`` (or 2D ``A`` with 1D ``B``): 1D array with one
      proximity per row of the 2D argument.
    - 2D ``A`` with 2D ``B``: 2D array of shape ``(len(A), len(B))``; when
      ``zeroDiag`` is true the main diagonal is zeroed.
    '''
    # Identity test: ``B == None`` on an ndarray compares element by element
    # and cannot be used as a condition. It must also come before ``B.shape``.
    if B is None:
        n_rows = A.shape[0]
        corr = zeros((n_rows, n_rows))
        for i in range(n_rows):
            corr[i, i + 1:] = fProximity(A[i], A[i + 1:])
        return corr

    sA = A.shape
    sB = B.shape
    if A.ndim == 1:
        dif = 1. - abs(A - B).sum(axis=-1) / (1. * sA[0])
        sim = B.dot(A) / (A**2).sum(axis=-1)**(0.5) / (B**2).sum(axis=-1)**(0.5)
        return where(isfinite(sim), dif * sim, dif)
    if B.ndim == 1:
        dif = 1. - abs(A - B).sum(axis=-1) / (1. * sB[0])
        sim = A.dot(B) / (A**2).sum(axis=-1)**(0.5) / (B**2).sum(axis=-1)**(0.5)
        return where(isfinite(sim), dif * sim, dif)

    corr = zeros((sA[0], sB[0]))
    for i in range(sA[0]):
        corr[i] = fProximity(A[i], B)
    # Remove the main diagonal element by element so that non-square results
    # are handled too (subtracting diag(diag(corr)) only lines up when square).
    on_diag = list(range(min(sA[0], sB[0])))
    corr[on_diag, on_diag] -= zeroDiag * corr[on_diag, on_diag]
    return corr
def get_auto_data(data):
    """
    Extract auto data.

    Keeps the rows whose 'STAT_MainState' equals 3 and whose 'GPSP_Lat' and
    'GPSP_Lon' entries are finite once converted to float.

    Raises:
        RuntimeError: if no row survives the filtering.
    """
    auto_rows = data[data['STAT_MainState'] == 3]
    # Filter one coordinate after the other so that each conversion only sees
    # the rows that passed the previous step.
    for coord in ('GPSP_Lat', 'GPSP_Lon'):
        as_float = auto_rows[coord].astype(float)
        auto_rows = auto_rows[pl.isfinite(as_float)]
    if len(auto_rows) == 0:
        raise RuntimeError('no auto mode detected')
    return auto_rows
def get_float_data(dataframe):
    """
    Get float data out of dataframe.

    Rows with a non-finite TIME_StartTime are dropped first; then only the
    columns for which isfloatarray() is true are kept and cast to float.
    """
    timed = dataframe[pl.isfinite(dataframe.TIME_StartTime)]
    float_cols = []
    for column_values in timed.values.T:
        float_cols.append(isfloatarray(column_values))
    # Select on the transposed frame so the boolean list picks columns of the
    # original orientation, then transpose back.
    transposed = timed.T
    selected = transposed[float_cols].T
    return selected.astype(float)
def apply(self, sim):
    """
    Hand out today's tests, weighting people by their current state.

    Does nothing when the current day is past the end of self.daily_tests or
    when the number of tests for the day is zero or not finite. Otherwise the
    count is added to sim.results['new_tests'] and people are drawn (with
    replacement) with weights scaled by self.sympt_test for symptomatic
    people, self.quar_test for quarantined people and set to zero for people
    already diagnosed.
    """
    day = sim.t

    # Dated test counts are converted to a plain per-day array here rather
    # than at construction, because the start day comes from the sim object.
    if isinstance(self.daily_tests, (pd.Series, pd.DataFrame)):
        first_day = sim['start_day']
        last_day = self.daily_tests.index[-1]
        all_days = pd.date_range(first_day, last_day)
        self.daily_tests = self.daily_tests.reindex(all_days, fill_value=0).to_numpy()

    # No test data left for this day
    if not (day < len(self.daily_tests)):
        return

    n_tests = self.daily_tests[day]  # Number of tests for this day
    if not n_tests or not pl.isfinite(n_tests):
        return  # No tests today, abort early
    sim.results['new_tests'][day] += n_tests

    # Everyone starts with the same testing weight
    weights = np.ones(sim.n)
    symptomatic = cvu.true(sim.people.symptomatic)
    quarantined = cvu.true(sim.people.quarantined)
    diagnosed = cvu.true(sim.people.diagnosed)
    weights[symptomatic] *= self.sympt_test
    weights[quarantined] *= self.quar_test
    weights[diagnosed] = 0.

    chosen = cvu.choose_w(probs=weights, n=n_tests, unique=False)
    sim.people.test(chosen, self.sensitivity, loss_prob=self.loss_prob, test_delay=self.test_delay)
    return
def apply(self, sim):
    """
    Hand out today's (rescaled) tests, weighting people by their current state.

    Nothing happens before self.start_day, after self.end_day (when set), once
    self.daily_tests is exhausted, or when the rescaled number of tests for
    the day is zero or not finite. Otherwise the count is added to
    sim.results['new_tests'] and people are drawn (with replacement) with
    weights scaled by self.symp_test for symptomatic people, self.quar_test
    for quarantined people and set to zero for people already diagnosed.
    """
    t = sim.t
    if t < self.start_day:
        return
    elif self.end_day is not None and t > self.end_day:
        return

    # Check that there are still tests
    rel_t = t - self.start_day
    if not (rel_t < len(self.daily_tests)):
        return

    # The finiteness check has to come before int(): int(nan) raises, so a
    # missing (NaN) count would crash instead of being treated as "no tests".
    scaled_tests = self.daily_tests[rel_t] / sim.rescale_vec[t]
    if not pl.isfinite(scaled_tests):
        return
    n_tests = int(scaled_tests)  # Number of tests for this day -- rescaled
    if not n_tests:  # If there are no tests today, abort early
        return
    sim.results['new_tests'][t] += n_tests

    test_probs = np.ones(sim.n)  # Begin by assigning equal testing probability to everyone
    symp_inds = cvu.true(sim.people.symptomatic)
    quar_inds = cvu.true(sim.people.quarantined)
    diag_inds = cvu.true(sim.people.diagnosed)
    test_probs[symp_inds] *= self.symp_test
    test_probs[quar_inds] *= self.quar_test
    test_probs[diag_inds] = 0.

    test_inds = cvu.choose_w(probs=test_probs, n=n_tests, unique=False)
    sim.people.test(test_inds, self.sensitivity, loss_prob=self.loss_prob, test_delay=self.test_delay)
    return
def pos_analysis(data):
    """
    Analyze position.

    Projects the GPS, GPOS and GPSP longitude/latitude columns of ``data``
    through the map returned by create_map() and plots the three tracks
    ('GPS', 'est' and 'cmd').

    Returns:
        dict: the local namespace of this function (``locals()``), so the
        local variable names below are part of what callers receive.
    """
    tmerc_map = create_map(data.GPS_Lon.values, data.GPS_Lat.values)
    gps_y, gps_x = tmerc_map(data.GPS_Lon.values, data.GPS_Lat.values)
    gpos_y, gpos_x = tmerc_map(data.GPOS_Lon.values, data.GPOS_Lat.values)
    # Keep only the set-points where BOTH coordinates are finite; filtering
    # each column with its own mask could pair a longitude with the latitude
    # of a different sample (or produce arrays of different lengths).
    # The mask is written inline on purpose: a new local would show up in the
    # returned locals().
    gpsp_y, gpsp_x = tmerc_map(
        data.GPSP_Lon[pl.isfinite(data.GPSP_Lon.values)
                      & pl.isfinite(data.GPSP_Lat.values)].values,
        data.GPSP_Lat[pl.isfinite(data.GPSP_Lon.values)
                      & pl.isfinite(data.GPSP_Lat.values)].values)
    pl.plot(gpos_y, gpos_x, '.', label='est')
    pl.plot(gps_y, gps_x, 'x', label='GPS')
    pl.plot(gpsp_y, gpsp_x, 'ro', label='cmd')
    pl.xlabel('E, m')
    pl.ylabel('N, m')
    pl.grid()
    pl.autoscale(True, 'both', True)
    pl.legend(loc='best')
    return locals()
def read_alberty_mathematica(self, fname):
    """
    Parse a Mathematica-style species file into pseudoisomer maps.

    example line:
    acetatesp={{-369.31,-486.01,-1,3},{-396.45,-485.76,0,4}};

    the order of values is: (dG0, dH0, z, nH)

    Lines without '=' are skipped. Names containing "coA" get 32 added to nH.

    Returns:
        (alberty_name_to_pmap, alberty_name_to_hmap): dicts keyed by the text
        before 'sp='. The first holds the dG0 entries; the second holds the
        dH0 entries and only gets an entry where dH0 parsed to a finite value.

    Raises:
        ValueError: if a line containing '=' has no 'sp=' separator, or a
            brace group does not hold exactly four comma-separated values.
    """
    alberty_name_to_pmap = {}
    alberty_name_to_hmap = {}  # same as pmap but for dH of formation
    # The context manager closes the file even when a line fails to parse.
    with open(fname, 'r') as fp:
        for line in fp:
            line = line.rstrip()
            if line.find('=') == -1:
                continue
            if 'sp=' not in line:
                # Same exception type as the failed tuple-unpacking this
                # replaces, but with a message that points at the line.
                raise ValueError("Syntax error at: " + line)
            (alberty_name, values) = line.split('sp=', 1)
            for token in re.findall(r"{([0-9\-\.\,_\s]+)}", values):
                val_list = token.split(',', 3)
                if len(val_list) != 4:
                    raise ValueError("Syntax error at: " + line)
                dG0 = float(val_list[0])
                try:
                    dH0 = float(val_list[1])
                except ValueError:
                    dH0 = float('nan')  # unparsable enthalpy: skipped below
                z = int(val_list[2])
                nH = int(val_list[3])
                if alberty_name.find("coA") != -1:
                    nH += 32
                nMg = 0
                alberty_name_to_pmap.setdefault(
                    alberty_name, pseudoisomer.PseudoisomerMap())
                alberty_name_to_pmap[alberty_name].Add(
                    nH, z, nMg, dG0, ref='Alberty 2006')
                if isfinite(dH0):
                    alberty_name_to_hmap.setdefault(
                        alberty_name, pseudoisomer.PseudoisomerMap())
                    alberty_name_to_hmap[alberty_name].Add(
                        nH, z, nMg, dH0, ref='Alberty 2006')
    return alberty_name_to_pmap, alberty_name_to_hmap
def apply(self, sim):
    """
    Hand out today's tests by looping over every person in the simulation.

    Does nothing when the current day is past the end of self.daily_tests or
    when the number of tests for the day is zero or not finite. Otherwise the
    count is added to sim.results['new_tests'], each person's
    check_diagnosed(t) result is accumulated into
    sim.results['new_diagnoses'], and the selected people are tested.
    """
    day = sim.t

    # Dated test counts are converted to a plain per-day array here rather
    # than at construction, because the start day comes from the sim object.
    if isinstance(self.daily_tests, (pd.Series, pd.DataFrame)):
        first_day = sim['start_day']
        last_day = self.daily_tests.index[-1]
        all_days = pd.date_range(first_day, last_day)
        self.daily_tests = self.daily_tests.reindex(all_days, fill_value=0).to_numpy()

    # No test data left for this day
    if not (day < len(self.daily_tests)):
        return

    n_tests = self.daily_tests[day]  # Number of tests for this day
    if not n_tests or not pl.isfinite(n_tests):
        return  # No tests today, abort early
    sim.results['new_tests'][day] += n_tests

    weights = np.ones(sim.n)
    diagnosed_today = 0
    for idx, person in enumerate(sim.people):
        diagnosed_today += person.check_diagnosed(day)

        # These are independent checks, not an if/elif chain: a person can be
        # diagnosed and symptomatic/quarantined at the same time.
        if person.symptomatic:
            weights[idx] *= self.sympt_test  # They're symptomatic
        if person.quarantine:
            weights[idx] *= self.quar_test  # They're in quarantine
        if person.diagnosed:
            weights[idx] = 0.0

    chosen = cv.choose_weighted(probs=weights, n=n_tests, normalize=True, unique=False)
    sim.results['new_diagnoses'][day] += diagnosed_today

    for person_ind in chosen:
        sim.people[person_ind].test(day, self.sensitivity, test_delay=self.test_delay)

    return
def apply(self, sim):
    """
    Hand out today's tests by looping over every person in the simulation.

    Does nothing when the current day is past the end of self.daily_tests or
    when the number of tests for the day is zero or not finite. Otherwise the
    count is added to sim.results['new_tests'], each person's
    check_diagnosed(t) result is accumulated into
    sim.results['new_diagnoses'], and the selected people are tested. Testing
    weights are scaled by self.sympt_test for symptomatic people and
    self.trace_test for people with a known contact, and set to zero for
    people already diagnosed.
    """
    t = sim.t

    # Check that there are still tests
    if not (t < len(self.daily_tests)):
        return
    n_tests = self.daily_tests[t]  # Number of tests for this day

    # If there are no tests today, abort early. This check comes BEFORE the
    # results are updated so that a NaN count cannot poison 'new_tests'.
    if not (n_tests and pl.isfinite(n_tests)):
        return
    sim.results['new_tests'][t] += n_tests

    test_probs = np.ones(sim.n)
    new_diagnoses = 0
    for i, person in enumerate(sim.people):
        new_diagnoses += person.check_diagnosed(t)

        # Adjust testing probability based on what's happened to the person
        # NB, these need to be separate if statements, because a person can
        # be both diagnosed and infectious/symptomatic
        if person.symptomatic:
            test_probs[i] *= self.sympt_test  # They're symptomatic
        if person.known_contact:
            test_probs[i] *= self.trace_test  # They've had contact with a known positive
        if person.diagnosed:
            test_probs[i] = 0.0

    test_inds = cv.choose_weighted(probs=test_probs, n=n_tests, normalize=True)
    sim.results['new_diagnoses'][t] += new_diagnoses

    for test_ind in test_inds:
        person = sim.people[test_ind]
        person.test(t, self.sensitivity, test_delay=self.test_delay)

    return
def astausgleich(ab2org, mn2org, rhoaorg):
    """shifts the branches of a dc sounding to generate a matching curve.

    The readings are grouped into branches by their unique ``mn2`` value.
    Going through the branches in ascending ``mn2`` order, each next branch is
    multiplied by the mean ratio (previous branch / next branch) of ``rhoa``
    at the ``ab2`` values both branches share. A branch is left unchanged
    when there is no shared ``ab2`` value or the factor is not finite and
    positive. Each factor is printed.

    Returns:
        A new float array with the shifted ``rhoa`` values; the inputs are
        not modified.
    """
    ab2 = P.asarray(ab2org)
    mn2 = P.asarray(mn2org)
    # Work on a float copy: asarray() would alias an ndarray passed by the
    # caller (which the in-place scaling below would then overwrite), and an
    # integer array cannot take the in-place multiplication by a float.
    rhoa = P.array(rhoaorg, dtype=float)
    um = P.unique(mn2)
    for i in range(len(um) - 1):
        r0, r1 = [], []
        ac = P.intersect1d(ab2[mn2 == um[i]], ab2[mn2 == um[i + 1]])
        for a in ac:
            r0.append(rhoa[(ab2 == a) * (mn2 == um[i])][0])
            r1.append(rhoa[(ab2 == a) * (mn2 == um[i + 1])][0])

        if len(r0) > 0:
            fak = P.mean(P.array(r0) / P.array(r1))
            print(fak)
            if P.isfinite(fak) and fak > 0.:
                rhoa[mn2 == um[i + 1]] *= fak

    return rhoa  # formerly pg as vector
def read_alberty_mathematica(self, fname):
    """
    Parse a Mathematica-style species file into pseudoisomer maps.

    example line:
    acetatesp={{-369.31,-486.01,-1,3},{-396.45,-485.76,0,4}};

    the order of values is: (dG0, dH0, z, nH)

    Lines without '=' are skipped. Names containing "coA" get 32 added to nH.

    Returns:
        (alberty_name_to_pmap, alberty_name_to_hmap): dicts keyed by the text
        before 'sp='. The first holds the dG0 entries; the second holds the
        dH0 entries and only gets an entry where dH0 parsed to a finite value.

    Raises:
        ValueError: if a line containing '=' has no 'sp=' separator, or a
            brace group does not hold exactly four comma-separated values.
    """
    alberty_name_to_pmap = {}
    alberty_name_to_hmap = {}  # same as pmap but for dH of formation
    # The context manager closes the file even when a line fails to parse.
    with open(fname, 'r') as fp:
        for line in fp:
            line = line.rstrip()
            if line.find('=') == -1:
                continue
            if 'sp=' not in line:
                # Same exception type as the failed tuple-unpacking this
                # replaces, but with a message that points at the line.
                raise ValueError("Syntax error at: " + line)
            (alberty_name, values) = line.split('sp=', 1)
            for token in re.findall(r"{([0-9\-\.\,_\s]+)}", values):
                val_list = token.split(',', 3)
                if len(val_list) != 4:
                    raise ValueError("Syntax error at: " + line)
                dG0 = float(val_list[0])
                try:
                    dH0 = float(val_list[1])
                except ValueError:
                    dH0 = float('nan')  # unparsable enthalpy: skipped below
                z = int(val_list[2])
                nH = int(val_list[3])
                if alberty_name.find("coA") != -1:
                    nH += 32
                nMg = 0
                alberty_name_to_pmap.setdefault(
                    alberty_name, pseudoisomer.PseudoisomerMap())
                alberty_name_to_pmap[alberty_name].Add(
                    nH, z, nMg, dG0, ref='Alberty 2006')
                if isfinite(dH0):
                    alberty_name_to_hmap.setdefault(
                        alberty_name, pseudoisomer.PseudoisomerMap())
                    alberty_name_to_hmap[alberty_name].Add(
                        nH, z, nMg, dH0, ref='Alberty 2006')
    return alberty_name_to_pmap, alberty_name_to_hmap
def apply(self, sim):
    """
    Hand out today's (rescaled) tests, weighting people by their current state.

    Nothing happens before self.start_day, after self.end_day (when set), once
    self.daily_tests is exhausted, or when the rescaled number of tests for
    the day is zero or not finite. Otherwise the count is added to
    sim.results['new_tests'] and people are drawn (with replacement) with
    weights built from: symptoms (including randomly assigned ILI symptoms
    when self.ili_prev is given), quarantine, optional self.subtarget
    multipliers, and zero for people already diagnosed.
    """
    t = sim.t
    if t < self.start_day:
        return
    elif self.end_day is not None and t > self.end_day:
        return

    # Check that there are still tests
    rel_t = t - self.start_day
    if not (rel_t < len(self.daily_tests)):
        return

    # The finiteness check has to come before int(): int(nan) raises, so a
    # missing (NaN) count would crash instead of being treated as "no tests".
    scaled_tests = self.daily_tests[rel_t] / sim.rescale_vec[t]
    if not pl.isfinite(scaled_tests):
        return
    n_tests = int(scaled_tests)  # Number of tests for this day -- rescaled
    if not n_tests:  # If there are no tests today, abort early
        return
    sim.results['new_tests'][t] += n_tests

    test_probs = np.ones(sim.n)  # Begin by assigning equal testing probability to everyone

    # Handle symptomatic testing, taking into account prevalence of ILI symptoms
    symp_inds = cvu.true(sim.people.symptomatic)
    if self.ili_prev is not None:
        if rel_t < len(self.ili_prev):
            n_ili = int(self.ili_prev[rel_t] * sim['pop_size'])  # Number with ILI symptoms on this day
            # Give some people some symptoms. Assuming that this is
            # independent of COVID symptomaticity...
            ili_inds = cvu.choose(sim['pop_size'], n_ili)
            # unique() so that someone in both groups is only weighted once
            symp_inds = np.unique(np.concatenate((symp_inds, ili_inds)))
    test_probs[symp_inds] *= self.symp_test

    # Handle quarantine testing
    quar_inds = cvu.true(sim.people.quarantined)
    test_probs[quar_inds] *= self.quar_test

    # Handle any other user-specified testing criteria
    if self.subtarget is not None:
        subtarget_inds, subtarget_vals = cv.get_subtargets(self.subtarget, sim)
        test_probs[subtarget_inds] = test_probs[subtarget_inds] * subtarget_vals

    # Don't re-diagnose people
    diag_inds = cvu.true(sim.people.diagnosed)
    test_probs[diag_inds] = 0.

    # Now choose who gets tested and test them
    test_inds = cvu.choose_w(probs=test_probs, n=n_tests, unique=False)
    sim.people.test(test_inds, self.sensitivity, loss_prob=self.loss_prob, test_delay=self.test_delay)
    return
def apply(self, sim):
    """
    Hand out today's (rescaled) tests, weighting people by their current state.

    Nothing happens before self.start_day, after self.end_day (when set), once
    self.daily_tests is exhausted, or when the rounded number of tests for the
    day is zero or not finite. Otherwise the count is added to
    sim.results['new_tests'], per-person testing weights are built step by
    step (symptoms, ILI symptoms, quarantine, subtargets, diagnosed), and the
    selected people are tested without replacement.
    """
    t = sim.t
    if t < self.start_day:
        return
    elif self.end_day is not None and t > self.end_day:
        return

    # Check that there are still tests
    rel_t = t - self.start_day
    if rel_t < len(self.daily_tests):
        n_tests = cvu.randround(self.daily_tests[rel_t] / sim.rescale_vec[t])  # Correct for scaling that may be applied by rounding to the nearest number of tests
        if not (n_tests and pl.isfinite(n_tests)):  # If there are no tests today, abort early
            return
        else:
            sim.results['new_tests'][t] += n_tests
    else:
        return

    test_probs = np.ones(sim.n)  # Begin by assigning equal testing weight (converted to a probability) to everyone

    # Calculate test probabilities for people with symptoms
    symp_inds = cvu.true(sim.people.symptomatic)
    symp_test = self.symp_test
    if self.pdf:  # Handle the onset to swab delay
        symp_time = cvd.default_int(t - sim.people.date_symptomatic[symp_inds])  # Find time since symptom onset
        inv_count = (np.bincount(symp_time) / len(symp_time))  # Find how many people have had symptoms of a set time and invert
        count = np.nan * np.ones(inv_count.shape)  # Initialize the count
        count[inv_count != 0] = 1 / inv_count[inv_count != 0]  # Update the counts where defined
        # NOTE(review): if self.symp_test is an ndarray this in-place multiply
        # would also modify self.symp_test (same object) -- confirm it is a scalar.
        symp_test *= self.pdf.pdf(symp_time) * count[symp_time]  # Put it all together

    test_probs[symp_inds] *= symp_test  # Update the test probabilities

    # Handle symptomatic testing, taking into account prevalence of ILI symptoms
    if self.ili_prev is not None:
        if rel_t < len(self.ili_prev):
            n_ili = int(self.ili_prev[rel_t] * sim['pop_size'])  # Number with ILI symptoms on this day
            ili_inds = cvu.choose(sim['pop_size'], n_ili)  # Give some people some symptoms. Assuming that this is independent of COVID symptomaticity...
            # Drop people already handled as symptomatic above so their weight is not scaled twice
            ili_inds = np.setdiff1d(ili_inds, symp_inds)
            # Uses self.symp_test, not the delay-adjusted local symp_test
            test_probs[ili_inds] *= self.symp_test

    # Handle quarantine testing
    quar_test_inds = get_quar_inds(self.quar_policy, sim)
    test_probs[quar_test_inds] *= self.quar_test

    # Handle any other user-specified testing criteria
    if self.subtarget is not None:
        subtarget_inds, subtarget_vals = get_subtargets(self.subtarget, sim)
        test_probs[subtarget_inds] = test_probs[subtarget_inds] * subtarget_vals

    # Don't re-diagnose people
    diag_inds = cvu.true(sim.people.diagnosed)
    test_probs[diag_inds] = 0.0

    # With dynamic rescaling, we have to correct for uninfected people outside of the population who would test
    if sim.rescale_vec[t] / sim['pop_scale'] < 1:  # We still have rescaling to do
        in_pop_tot_prob = test_probs.sum() * sim.rescale_vec[t]  # Total "testing weight" of people in the subsampled population
        out_pop_tot_prob = sim.scaled_pop_size - sim.rescale_vec[t] * sim['pop_size']  # Find out how many people are missing and assign them each weight 1
        in_frac = in_pop_tot_prob / (in_pop_tot_prob + out_pop_tot_prob)  # Fraction of tests which should fall in the sample population
        n_tests = cvu.randround(n_tests * in_frac)  # Recompute the number of tests

    # Now choose who gets tested and test them
    n_tests = min(n_tests, (test_probs != 0).sum())  # Don't try to test more people than have nonzero testing probability
    test_inds = cvu.choose_w(probs=test_probs, n=n_tests, unique=True)  # Choose who actually tests
    sim.people.test(test_inds, self.sensitivity, loss_prob=self.loss_prob, test_delay=self.test_delay)

    return
def thermodynamic_pathway_analysis(S, rids, fluxes, cids, thermodynamics, html_writer):
    """
    Run the thermodynamic optimizations for one pathway and report them as HTML.

    Two schemes are attempted, 'pCr' (find_pCr) and 'MTDF' (find_mtdf). The
    report contains a cumulative dG profile figure, one concentration figure
    per scheme and, per scheme, a compound table and a reaction table.
    Schemes whose score is None are left out of the figures and tables.

    Args:
        S: stoichiometric matrix; its shape is read as (reactions, compounds).
        rids: reaction IDs, one per row of S (formatted as 'R%05d').
        fluxes: one flux per reaction; only the absolute values are used.
        cids: compound IDs, one per column of S (formatted as 'C%05d').
        thermodynamics: object providing GetTransformedFormationEnergies(),
            bounds, c_mid and c_range.
        html_writer: object providing write() and embed_matplotlib_figure().

    Returns:
        dict mapping the scheme name to the (dG_f, conc, score) triple
        returned by its optimizer. It may be empty or partial when no
        feasible solution was found.
    """
    Nr, Nc = S.shape

    # Only the flux magnitudes are used. A list (rather than map()) keeps
    # fluxes indexable on Python 3 as well.
    fluxes = [abs(flux) for flux in fluxes]
    kegg = Kegg.getInstance()
    dG0_f = thermodynamics.GetTransformedFormationEnergies(cids)
    bounds = [thermodynamics.bounds.get(cid, (None, None)) for cid in cids]

    res = {}
    try:
        c_mid = thermodynamics.c_mid
        c_range = thermodynamics.c_range
        res['pCr'] = find_pCr(S, dG0_f, c_mid=c_mid, ratio=3.0, bounds=bounds)
        res['MTDF'] = find_mtdf(S, dG0_f, c_range=c_range, bounds=bounds)
    except LinProgNoSolutionException:
        html_writer.write('<b>No feasible solution found, cannot calculate the Margin</b>')

    # plot the profile graph
    pylab.rcParams['text.usetex'] = False
    pylab.rcParams['legend.fontsize'] = 10
    pylab.rcParams['font.family'] = 'sans-serif'
    pylab.rcParams['font.size'] = 12
    pylab.rcParams['lines.linewidth'] = 2
    pylab.rcParams['lines.markersize'] = 5
    pylab.rcParams['figure.figsize'] = [8.0, 6.0]
    pylab.rcParams['figure.dpi'] = 100

    # plot the thermodynamic profile in standard conditions
    profile_fig = pylab.figure()
    profile_fig.hold(True)

    pylab.title('Thermodynamic profile', figure=profile_fig)
    pylab.ylabel('cumulative dG [kJ/mol]', figure=profile_fig)
    pylab.xlabel('Reaction KEGG ID', figure=profile_fig)
    pylab.xticks(pylab.arange(1, Nr + 1),
                 ['R%05d' % rids[i] for i in range(Nr)],
                 fontproperties=FontProperties(size=8), rotation=30)

    dG0_r = pylab.zeros((Nr, 1))
    for r in range(Nr):
        reactants = pylab.find(S[r, :])
        dG0_r[r, 0] = pylab.dot(S[r, reactants], dG0_f[reactants])

    nan_indices = pylab.find(pylab.isnan(dG0_r))
    finite_indices = pylab.find(pylab.isfinite(dG0_r))
    if (len(nan_indices) > 0):
        # Reactions with an unknown dG0 contribute zero to the cumulative sum
        dG0_r_finite = pylab.zeros((Nr, 1))
        dG0_r_finite[finite_indices] = dG0_r[finite_indices]
        cum_dG0_r = pylab.cumsum(
            [0] + [dG0_r_finite[r, 0] * fluxes[r] for r in range(Nr)])
    else:
        cum_dG0_r = pylab.cumsum(
            [0] + [dG0_r[r, 0] * fluxes[r] for r in range(Nr)])
    pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG0_r, figure=profile_fig,
               label='Standard [1M]')

    # plot the thermodynamic profile for the different optimization schemes
    pylab.grid(True, figure=profile_fig)
    for optimization in res.keys():
        dG_f, conc, score = res[optimization]
        if score is None:
            continue

        dG_r = pylab.dot(S, dG_f)
        cum_dG_r = pylab.cumsum(
            [0] + [dG_r[i, 0] * fluxes[i] for i in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG_r, figure=profile_fig,
                   label='%s = %.1f' % (optimization, score))

    pylab.legend()
    html_writer.embed_matplotlib_figure(profile_fig, width=480, height=360)

    # plot the optimal metabolite concentrations for the different optimization schemes
    ind_nan = pylab.find(pylab.isnan(dG0_f))
    for optimization in res.keys():
        dG_f, conc, score = res[optimization]
        if score is None:
            continue

        # give all compounds with unknown dG0_f the middle concentration value
        conc[ind_nan] = thermodynamics.c_mid

        conc_fig = pylab.figure()
        conc_fig.suptitle('Concentrations (%s = %.1f)' % (optimization, score))
        pylab.xscale('log', figure=conc_fig)
        pylab.ylabel('Compound KEGG ID', figure=conc_fig)
        pylab.xlabel('Concentration [M]', figure=conc_fig)
        pylab.yticks(range(Nc, 0, -1), ["C%05d" % cid for cid in cids],
                     fontproperties=FontProperties(size=8))
        pylab.plot(conc, range(Nc, 0, -1), '*b', figure=conc_fig)

        x_min = conc.min() / 10
        x_max = conc.max() * 10
        y_min = 0
        y_max = Nc + 1

        for c in range(Nc):
            pylab.text(conc[c, 0] * 1.1, Nc - c, kegg.cid2name(cids[c]),
                       figure=conc_fig, fontsize=6, rotation=0)
            # An open bound is drawn out to the edge of the plotted range
            b_low, b_up = bounds[c]
            if b_low is None:
                b_low = x_min
            if b_up is None:
                b_up = x_max
            pylab.plot([b_low, b_up], [Nc - c, Nc - c], '-k', linewidth=0.4)

        if optimization.startswith('pCr'):
            c_range_opt = pC_to_range(score, c_mid=thermodynamics.c_mid, ratio=3.0)
            pylab.axvspan(c_range_opt[0], c_range_opt[1], facecolor='g',
                          alpha=0.3, figure=conc_fig)
        else:
            pylab.axvspan(thermodynamics.c_range[0], thermodynamics.c_range[1],
                          facecolor='r', alpha=0.3, figure=conc_fig)
        pylab.axis([x_min, x_max, y_min, y_max], figure=conc_fig)
        try:
            html_writer.embed_matplotlib_figure(conc_fig, width=420, height=360)
        except AttributeError:
            html_writer.write('<b>Failed to generate concentration figure</b>')

    # write all the results in tables as well
    for optimization in res.keys():
        (dG_f, conc, score) = res[optimization]
        # Same guard as the plotting loops: without it the '%.1f' formatting
        # below fails on a None score.
        if score is None:
            continue

        html_writer.write('<p>Biochemical Compound Formation Energies (%s = %.1f)<br>\n' % (optimization, score))
        html_writer.write('<table border="1">\n')
        html_writer.write(' ' + '<td>%s</td>' * 5 % ("KEGG CID", "Compound Name", "Concentration [M]", "dG'0_f [kJ/mol]", "dG'_f [kJ/mol]") + '\n')
        for c in range(Nc):
            cid = cids[c]
            name = kegg.cid2name(cid)

            if (pylab.isnan(dG0_f[c, 0])):
                html_writer.write('<tr><td><a href="%s">C%05d</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' %
                                  (kegg.cid2link(cid), cid, name, "N/A", "N/A", "N/A"))
            else:
                html_writer.write('<tr><td><a href="%s">C%05d</a></td><td>%s</td><td>%.2g</td><td>%.2f</td><td>%.2f</td></tr>\n' %
                                  (kegg.cid2link(cid), cid, name, conc[c, 0], dG0_f[c, 0], dG_f[c, 0]))
        html_writer.write('</table></p>\n')

        html_writer.write('<p>Biochemical Reaction Energies (%s = %.1f)<br>\n' % (optimization, score))
        html_writer.write('<table border="1">\n')
        html_writer.write(' ' + '<td>%s</td>' * 3 % ("KEGG RID", "dG'0_r [kJ/mol]", "dG'_r [kJ/mol]") + '\n')

        dG_r = pylab.dot(S, dG_f)
        for r in range(Nr):
            rid = rids[r]
            if (pylab.isnan(dG0_r[r, 0])):
                html_writer.write('<tr><td><a href="%s" title="%s">R%05d</a></td><td>%s</td><td>%.2f</td></tr>\n' %
                                  (kegg.rid2link(rid), kegg.rid2name(rid), rid, "N/A", dG_r[r, 0]))
            else:
                html_writer.write('<tr><td><a href="%s" title="%s">R%05d</a></td><td>%.2f</td><td>%.2f</td></tr>\n' %
                                  (kegg.rid2link(rid), kegg.rid2name(rid), rid, dG0_r[r, 0], dG_r[r, 0]))
        html_writer.write('</table></p>\n')

    return res
def log_probfn(theta, x, y, xerr, yerr):
    """
    Return the log-probability of ``theta``: log-prior plus log-likelihood.

    Returns minus infinity, without evaluating the likelihood, when the
    log-prior is not finite.
    """
    lp = log_prior(theta)
    if not pyl.isfinite(lp):
        return -pyl.inf
    # Reuse the prior computed above instead of evaluating it a second time.
    return lp + log_likelihood(theta, x, y, xerr, yerr)
def thermodynamic_pathway_analysis(S, rids, fluxes, cids, thermodynamics, html_writer):
    """
    Run the thermodynamic optimizations for one pathway and report them as HTML.

    Two schemes are attempted, 'pCr' (find_pCr) and 'MTDF' (find_mtdf). The
    report contains a cumulative dG profile figure, one concentration figure
    per scheme and, per scheme, a compound table and a reaction table.
    Schemes whose score is None are left out of the figures and tables.

    Args:
        S: stoichiometric matrix; its shape is read as (reactions, compounds).
        rids: reaction IDs, one per row of S (formatted as 'R%05d').
        fluxes: one flux per reaction; only the absolute values are used.
        cids: compound IDs, one per column of S (formatted as 'C%05d').
        thermodynamics: object providing GetTransformedFormationEnergies(),
            bounds, c_mid and c_range.
        html_writer: object providing write() and embed_matplotlib_figure().

    Returns:
        dict mapping the scheme name to the (dG_f, conc, score) triple
        returned by its optimizer. It may be empty or partial when no
        feasible solution was found.
    """
    Nr, Nc = S.shape

    # Only the flux magnitudes are used. A list (rather than map()) keeps
    # fluxes indexable on Python 3 as well.
    fluxes = [abs(flux) for flux in fluxes]
    kegg = Kegg.getInstance()
    dG0_f = thermodynamics.GetTransformedFormationEnergies(cids)
    bounds = [thermodynamics.bounds.get(cid, (None, None)) for cid in cids]

    res = {}
    try:
        c_mid = thermodynamics.c_mid
        c_range = thermodynamics.c_range
        res['pCr'] = find_pCr(S, dG0_f, c_mid=c_mid, ratio=3.0, bounds=bounds)
        res['MTDF'] = find_mtdf(S, dG0_f, c_range=c_range, bounds=bounds)
    except LinProgNoSolutionException:
        html_writer.write('<b>No feasible solution found, cannot calculate the Margin</b>')

    # plot the profile graph
    pylab.rcParams['text.usetex'] = False
    pylab.rcParams['legend.fontsize'] = 10
    pylab.rcParams['font.family'] = 'sans-serif'
    pylab.rcParams['font.size'] = 12
    pylab.rcParams['lines.linewidth'] = 2
    pylab.rcParams['lines.markersize'] = 5
    pylab.rcParams['figure.figsize'] = [8.0, 6.0]
    pylab.rcParams['figure.dpi'] = 100

    # plot the thermodynamic profile in standard conditions
    profile_fig = pylab.figure()
    profile_fig.hold(True)

    pylab.title('Thermodynamic profile', figure=profile_fig)
    pylab.ylabel('cumulative dG [kJ/mol]', figure=profile_fig)
    pylab.xlabel('Reaction KEGG ID', figure=profile_fig)
    pylab.xticks(pylab.arange(1, Nr + 1),
                 ['R%05d' % rids[i] for i in range(Nr)],
                 fontproperties=FontProperties(size=8), rotation=30)

    dG0_r = pylab.zeros((Nr, 1))
    for r in range(Nr):
        reactants = pylab.find(S[r, :])
        dG0_r[r, 0] = pylab.dot(S[r, reactants], dG0_f[reactants])

    nan_indices = pylab.find(pylab.isnan(dG0_r))
    finite_indices = pylab.find(pylab.isfinite(dG0_r))
    if (len(nan_indices) > 0):
        # Reactions with an unknown dG0 contribute zero to the cumulative sum
        dG0_r_finite = pylab.zeros((Nr, 1))
        dG0_r_finite[finite_indices] = dG0_r[finite_indices]
        cum_dG0_r = pylab.cumsum(
            [0] + [dG0_r_finite[r, 0] * fluxes[r] for r in range(Nr)])
    else:
        cum_dG0_r = pylab.cumsum(
            [0] + [dG0_r[r, 0] * fluxes[r] for r in range(Nr)])
    pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG0_r, figure=profile_fig,
               label='Standard [1M]')

    # plot the thermodynamic profile for the different optimization schemes
    pylab.grid(True, figure=profile_fig)
    for optimization in res.keys():
        dG_f, conc, score = res[optimization]
        if score is None:
            continue

        dG_r = pylab.dot(S, dG_f)
        cum_dG_r = pylab.cumsum(
            [0] + [dG_r[i, 0] * fluxes[i] for i in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG_r, figure=profile_fig,
                   label='%s = %.1f' % (optimization, score))

    pylab.legend()
    html_writer.embed_matplotlib_figure(profile_fig, width=480, height=360)

    # plot the optimal metabolite concentrations for the different optimization schemes
    ind_nan = pylab.find(pylab.isnan(dG0_f))
    for optimization in res.keys():
        dG_f, conc, score = res[optimization]
        if score is None:
            continue

        # give all compounds with unknown dG0_f the middle concentration value
        conc[ind_nan] = thermodynamics.c_mid

        conc_fig = pylab.figure()
        conc_fig.suptitle('Concentrations (%s = %.1f)' % (optimization, score))
        pylab.xscale('log', figure=conc_fig)
        pylab.ylabel('Compound KEGG ID', figure=conc_fig)
        pylab.xlabel('Concentration [M]', figure=conc_fig)
        pylab.yticks(range(Nc, 0, -1), ["C%05d" % cid for cid in cids],
                     fontproperties=FontProperties(size=8))
        pylab.plot(conc, range(Nc, 0, -1), '*b', figure=conc_fig)

        x_min = conc.min() / 10
        x_max = conc.max() * 10
        y_min = 0
        y_max = Nc + 1

        for c in range(Nc):
            pylab.text(conc[c, 0] * 1.1, Nc - c, kegg.cid2name(cids[c]),
                       figure=conc_fig, fontsize=6, rotation=0)
            # An open bound is drawn out to the edge of the plotted range
            b_low, b_up = bounds[c]
            if b_low is None:
                b_low = x_min
            if b_up is None:
                b_up = x_max
            pylab.plot([b_low, b_up], [Nc - c, Nc - c], '-k', linewidth=0.4)

        if optimization.startswith('pCr'):
            c_range_opt = pC_to_range(score, c_mid=thermodynamics.c_mid, ratio=3.0)
            pylab.axvspan(c_range_opt[0], c_range_opt[1], facecolor='g',
                          alpha=0.3, figure=conc_fig)
        else:
            pylab.axvspan(thermodynamics.c_range[0], thermodynamics.c_range[1],
                          facecolor='r', alpha=0.3, figure=conc_fig)
        pylab.axis([x_min, x_max, y_min, y_max], figure=conc_fig)
        try:
            html_writer.embed_matplotlib_figure(conc_fig, width=420, height=360)
        except AttributeError:
            html_writer.write('<b>Failed to generate concentration figure</b>')

    # write all the results in tables as well
    for optimization in res.keys():
        (dG_f, conc, score) = res[optimization]
        # Same guard as the plotting loops: without it the '%.1f' formatting
        # below fails on a None score.
        if score is None:
            continue

        html_writer.write('<p>Biochemical Compound Formation Energies (%s = %.1f)<br>\n' % (optimization, score))
        html_writer.write('<table border="1">\n')
        html_writer.write(' ' + '<td>%s</td>'*5 % ("KEGG CID", "Compound Name", "Concentration [M]", "dG'0_f [kJ/mol]", "dG'_f [kJ/mol]") + '\n')
        for c in range(Nc):
            cid = cids[c]
            name = kegg.cid2name(cid)

            if (pylab.isnan(dG0_f[c, 0])):
                html_writer.write('<tr><td><a href="%s">C%05d</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' %
                                  (kegg.cid2link(cid), cid, name, "N/A", "N/A", "N/A"))
            else:
                html_writer.write('<tr><td><a href="%s">C%05d</a></td><td>%s</td><td>%.2g</td><td>%.2f</td><td>%.2f</td></tr>\n' %
                                  (kegg.cid2link(cid), cid, name, conc[c, 0], dG0_f[c, 0], dG_f[c, 0]))
        html_writer.write('</table></p>\n')

        html_writer.write('<p>Biochemical Reaction Energies (%s = %.1f)<br>\n' % (optimization, score))
        html_writer.write('<table border="1">\n')
        html_writer.write(' ' + '<td>%s</td>'*3 % ("KEGG RID", "dG'0_r [kJ/mol]", "dG'_r [kJ/mol]") + '\n')

        dG_r = pylab.dot(S, dG_f)
        for r in range(Nr):
            rid = rids[r]
            if (pylab.isnan(dG0_r[r, 0])):
                html_writer.write('<tr><td><a href="%s" title="%s">R%05d</a></td><td>%s</td><td>%.2f</td></tr>\n' %
                                  (kegg.rid2link(rid), kegg.rid2name(rid), rid, "N/A", dG_r[r, 0]))
            else:
                html_writer.write('<tr><td><a href="%s" title="%s">R%05d</a></td><td>%.2f</td><td>%.2f</td></tr>\n' %
                                  (kegg.rid2link(rid), kegg.rid2name(rid), rid, dG0_r[r, 0], dG_r[r, 0]))
        html_writer.write('</table></p>\n')

    return res
def CompareMtdf(self, target_mtdf=None):
    """
    Write an MTDF report (profile and concentration figures) for every pathway.

    For each entry of self.pathways the MTDF is computed with
    FindMtdf_Regularized (when target_mtdf is given, used as both the minimum
    and the maximum) or with FindMTDF_OptimizeConcentrations otherwise.
    Pathways without a score are logged and skipped.

    NOTE(review): this block reads `dG_f`, `conc`, `kegg` and a bare
    `html_writer` that are never assigned inside it. Unless they exist as
    module-level names, those lines raise NameError -- confirm before use.
    """
    n_pathways = len(self.pathways)  # not used further in this block
    for i, (name, pathway_data) in enumerate(self.pathways.iteritems()):
        logging.info('Analyzing pathway %s', name)
        self.html_writer.write('<div margin="20px"><div><b>%s</b></div>' % name)
        self.GetConditions(pathway_data)
        S, rids, fluxes, cids = self.GetReactions(name, pathway_data)
        self.WriteReactionsToHtml(S, rids, fluxes, cids, show_cids=False)

        # Bounds on concentrations.
        bounds = [self.thermo.bounds.get(cid, (None, None)) for cid in cids]

        # All fluxes are forwards
        fluxes = map(abs, fluxes)
        dG0_f = self.thermo.GetTransformedFormationEnergies(cids)
        c_mid = self.thermo.c_mid
        c_range = self.thermo.c_range

        path = pathway_modelling.Pathway(S, dG0_f)
        if target_mtdf is not None:
            # Pinning min and max to the same value requests exactly target_mtdf
            _ln_conc, score = path.FindMtdf_Regularized(
                c_range, bounds, c_mid, min_mtdf=target_mtdf,
                max_mtdf=target_mtdf)
        else:
            _ln_conc, score = path.FindMTDF_OptimizeConcentrations(
                c_range, bounds, c_mid)
        if score is None:
            logging.error('No MTDF score for %s', name)
            # NOTE(review): the <div> opened above is not closed on this path
            continue

        Nr, Nc = S.shape
        profile_fig = pylab.figure()
        profile_fig.hold(True)
        pylab.title('Thermodynamic Profile', figure=profile_fig)
        pylab.ylabel('Cumulative dG [kJ/mol]', figure=profile_fig)
        pylab.xlabel('Reaction KEGG ID', figure=profile_fig)
        pylab.grid(True, figure=profile_fig)

        rids = ['%s' % rids[i] for i in xrange(Nr)]
        pylab.xticks(pylab.arange(1, Nr + 1), rids,
                     fontproperties=FontProperties(size=8), rotation=30)

        dG0_r = pylab.zeros((Nr, 1))
        for r in range(Nr):
            reactants = pylab.find(S[r, :])
            dG0_r[r, 0] = pylab.dot(S[r, reactants], dG0_f[reactants])

        nan_indices = pylab.find(pylab.isnan(dG0_r))
        finite_indices = pylab.find(pylab.isfinite(dG0_r))
        if (len(nan_indices) > 0):
            # Reactions with an unknown dG0 contribute zero to the cumulative sum
            dG0_r_finite = pylab.zeros((Nr, 1))
            dG0_r_finite[finite_indices] = dG0_r[finite_indices]
            cum_dG0_r = pylab.cumsum(
                [0] + [dG0_r_finite[r, 0] * fluxes[r] for r in range(Nr)])
        else:
            cum_dG0_r = pylab.cumsum(
                [0] + [dG0_r[r, 0] * fluxes[r] for r in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG0_r, 'g--',
                   label='Standard [1M]', figure=profile_fig)

        # plot the thermodynamic profile for the different optimization schemes
        # NOTE(review): dG_f is not assigned anywhere in this block
        dG_r = pylab.dot(S, dG_f)
        self.html_writer.write('<ol>')
        # The loop variable i is reused here; it shadows the pathway index of the outer loop
        for i, dG in enumerate(dG_r):
            self.html_writer.write('<li>%s: %.2f' % (rids[i], dG))
        self.html_writer.write('</ol>')

        cum_dG_r = pylab.cumsum(
            [0] + [dG_r[i, 0] * fluxes[i] for i in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG_r, figure=profile_fig,
                   label='%s MTDF = %.1f' % (name, score))

        pylab.legend(['Standard conditions', 'MTDF'], 'lower left')
        fname = '%s-profile-fig' % name
        # NOTE(review): bare html_writer here, self.html_writer elsewhere
        html_writer.embed_matplotlib_figure(profile_fig, width=640, height=480,
                                            name=fname)

        # Give all compounds with unknown dG0_f the middle concentration value
        # NOTE(review): conc is not assigned anywhere in this block, and
        # nan_indices was computed from dG0_r (per reaction), not from dG0_f
        conc[nan_indices] = self.thermo.c_mid

        # Collect the compounds that have neither a lower nor an upper bound
        unconstrained_cs = []
        unconstrained_cids = []
        for i, bound in enumerate(bounds):
            b_low, b_up = bound
            if b_low is None and b_up is None:
                unconstrained_cs.append(conc[i, 0])
                unconstrained_cids.append(cids[i])

        # Despite its name, this is the number of UNconstrained compounds
        n_constrained = len(unconstrained_cs)

        conc_fig = pylab.figure()
        conc_fig.suptitle('Concentrations %s (MTDF = %.1f)' % (name, score))
        pylab.xscale('log', figure=conc_fig)
        pylab.ylabel('Compound KEGG ID', figure=conc_fig)
        pylab.xlabel('Concentration [M]', figure=conc_fig)
        cids_names = ["C%05d" % cid for cid in unconstrained_cids]
        pylab.yticks(range(n_constrained, 0, -1), cids_names,
                     fontproperties=FontProperties(size=8))
        pylab.plot(unconstrained_cs, range(n_constrained, 0, -1), '*b',
                   figure=conc_fig)

        x_min = self.thermo.c_range[0] / 10
        x_max = self.thermo.c_range[1] * 50
        y_min = 0
        y_max = n_constrained + 1

        for i, concentration in enumerate(unconstrained_cs):
            # NOTE(review): kegg is not assigned anywhere in this block
            pylab.text(concentration * 1.1, n_constrained - i,
                       kegg.cid2name(unconstrained_cids[i]),
                       figure=conc_fig, fontsize=6, rotation=0)
            y_val = n_constrained - i
            pylab.plot([x_min, x_max], [y_val, y_val], '-k', linewidth=0.4)

        # Shade the span between the smallest and largest unconstrained concentration
        pylab.axvspan(min(unconstrained_cs), max(unconstrained_cs),
                      facecolor='g', alpha=0.3, figure=conc_fig)
        pylab.axis([x_min, x_max, y_min, y_max], figure=conc_fig)

        fname = '%s-mtdf-conc-fig' % name
        html_writer.embed_matplotlib_figure(conc_fig, width=640, height=480,
                                            name=fname)

        self.html_writer.write('</div>')
def CompareMtdf(self, target_mtdf=None):
    """Analyze every pathway and write its MTDF report section to the HTML output.

    For each ``(name, pathway_data)`` pair in ``self.pathways`` the method
    writes a header, loads the conditions and reactions, asks a
    ``pathway_modelling.Pathway`` for a score, and then draws two figures: a
    cumulative dG profile per reaction and a plot of the concentrations of
    compounds that have no bounds.

    Args:
        target_mtdf: when not None, ``FindMtdf_Regularized`` is called with
            this value as both ``min_mtdf`` and ``max_mtdf``; otherwise
            ``FindMTDF_OptimizeConcentrations`` is used.

    Returns:
        None. Pathways whose score is None are logged as errors and skipped.

    NOTE(review): this code is Python 2 only (``iteritems``, ``xrange``, and
    ``map`` must return an indexable list for ``fluxes[r]`` below).
    NOTE(review): ``dG_f`` and ``conc`` are read below but never assigned in
    this method; unless they are module-level names this raises NameError.
    Presumably they should be derived from ``_ln_conc`` -- confirm.
    """
    n_pathways = len(self.pathways)  # NOTE(review): never read in this method
    for i, (name, pathway_data) in enumerate(self.pathways.iteritems()):
        logging.info('Analyzing pathway %s', name)
        # Opens the per-pathway container; it is closed at the end of the
        # loop body, so the early ``continue`` below leaves it unclosed.
        self.html_writer.write('<div margin="20px"><div><b>%s</b></div>' % name)
        self.GetConditions(pathway_data)
        S, rids, fluxes, cids = self.GetReactions(name, pathway_data)
        self.WriteReactionsToHtml(S, rids, fluxes, cids, show_cids=False)

        # Bounds on concentrations.
        bounds = [self.thermo.bounds.get(cid, (None, None)) for cid in cids]

        # All fluxes are forwards
        fluxes = map(abs, fluxes)
        dG0_f = self.thermo.GetTransformedFormationEnergies(cids)
        c_mid = self.thermo.c_mid
        c_range = self.thermo.c_range

        path = pathway_modelling.Pathway(S, dG0_f)
        if target_mtdf is not None:
            # Equal lower and upper limits pin the MTDF to the target.
            _ln_conc, score = path.FindMtdf_Regularized(
                c_range, bounds, c_mid,
                min_mtdf=target_mtdf, max_mtdf=target_mtdf)
        else:
            _ln_conc, score = path.FindMTDF_OptimizeConcentrations(
                c_range, bounds, c_mid)
        if score is None:
            logging.error('No MTDF score for %s', name)
            continue

        Nr, Nc = S.shape
        profile_fig = pylab.figure()
        profile_fig.hold(True)
        pylab.title('Thermodynamic Profile', figure=profile_fig)
        pylab.ylabel('Cumulative dG [kJ/mol]', figure=profile_fig)
        pylab.xlabel('Reaction KEGG ID', figure=profile_fig)
        pylab.grid(True, figure=profile_fig)

        # NOTE(review): under Python 2 this comprehension rebinds the outer
        # loop variable ``i`` (as do the inner ``for i`` loops below).
        rids = ['%s' % rids[i] for i in xrange(Nr)]
        pylab.xticks(pylab.arange(1, Nr + 1), rids,
                     fontproperties=FontProperties(size=8), rotation=30)
        # Standard reaction energies: for each reaction, the dot product of
        # its non-zero stoichiometric coefficients with the formation energies.
        dG0_r = pylab.zeros((Nr, 1))
        for r in range(Nr):
            reactants = pylab.find(S[r,:])
            dG0_r[r, 0] = pylab.dot(S[r, reactants], dG0_f[reactants])

        nan_indices = pylab.find(pylab.isnan(dG0_r))
        finite_indices = pylab.find(pylab.isfinite(dG0_r))
        if (len(nan_indices) > 0):
            # Reactions with a NaN dG0 contribute zero to the cumulative sum.
            dG0_r_finite = pylab.zeros((Nr, 1))
            dG0_r_finite[finite_indices] = dG0_r[finite_indices]
            cum_dG0_r = pylab.cumsum([0] + [dG0_r_finite[r, 0] * fluxes[r] for r in range(Nr)])
        else:
            cum_dG0_r = pylab.cumsum([0] + [dG0_r[r, 0] * fluxes[r] for r in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG0_r, 'g--',
                   label='Standard [1M]', figure=profile_fig)

        # plot the thermodynamic profile for the different optimization schemes
        # NOTE(review): ``dG_f`` is not defined in this method (see docstring).
        dG_r = pylab.dot(S, dG_f)
        self.html_writer.write('<ol>')
        for i, dG in enumerate(dG_r):
            self.html_writer.write('<li>%s: %.2f' % (rids[i], dG))
        self.html_writer.write('</ol>')

        cum_dG_r = pylab.cumsum([0] + [dG_r[i, 0] * fluxes[i] for i in range(Nr)])
        pylab.plot(pylab.arange(0.5, Nr + 1), cum_dG_r, figure=profile_fig,
                   label='%s MTDF = %.1f' % (name, score))

        pylab.legend(['Standard conditions', 'MTDF'], 'lower left')
        fname = '%s-profile-fig' % name
        # NOTE(review): bare ``html_writer`` here, while the rest of the
        # method uses ``self.html_writer`` -- presumably the same object; confirm.
        html_writer.embed_matplotlib_figure(profile_fig, width=640, height=480, name=fname)

        # Give all compounds with unknown dG0_f the middle concentration value
        # NOTE(review): ``conc`` is not defined in this method, and
        # ``nan_indices`` was computed from the per-reaction ``dG0_r`` rather
        # than from per-compound values -- confirm the intended index set.
        conc[nan_indices] = self.thermo.c_mid

        # Collect the concentrations of compounds with neither bound set.
        unconstrained_cs = []
        unconstrained_cids = []
        for i, bound in enumerate(bounds):
            b_low, b_up = bound
            if b_low is None and b_up is None:
                unconstrained_cs.append(conc[i, 0])
                unconstrained_cids.append(cids[i])

        # Despite its name this is the number of *unconstrained* compounds.
        n_constrained = len(unconstrained_cs)
        conc_fig = pylab.figure()
        conc_fig.suptitle('Concentrations %s (MTDF = %.1f)' % (name, score))
        pylab.xscale('log', figure=conc_fig)
        pylab.ylabel('Compound KEGG ID', figure=conc_fig)
        pylab.xlabel('Concentration [M]', figure=conc_fig)
        cids_names = ["C%05d" % cid for cid in unconstrained_cids]
        # Compounds are laid out top to bottom: the first one gets the highest y.
        pylab.yticks(range(n_constrained, 0, -1), cids_names,
                     fontproperties=FontProperties(size=8))
        pylab.plot(unconstrained_cs, range(n_constrained, 0, -1), '*b', figure=conc_fig)

        # Horizontal extent is the allowed concentration range plus a margin.
        x_min = self.thermo.c_range[0] / 10
        x_max = self.thermo.c_range[1] * 50
        y_min = 0
        y_max = n_constrained + 1

        for i, concentration in enumerate(unconstrained_cs):
            # NOTE(review): ``kegg`` is not defined in this method;
            # presumably a module-level object -- confirm.
            pylab.text(concentration * 1.1, n_constrained - i,
                       kegg.cid2name(unconstrained_cids[i]),
                       figure=conc_fig, fontsize=6, rotation=0)
            y_val = n_constrained - i
            pylab.plot([x_min, x_max], [y_val, y_val], '-k', linewidth=0.4)

        # min()/max() raise ValueError when there are no unconstrained compounds.
        pylab.axvspan(min(unconstrained_cs), max(unconstrained_cs),
                      facecolor='g', alpha=0.3, figure=conc_fig)
        pylab.axis([x_min, x_max, y_min, y_max], figure=conc_fig)

        fname = '%s-mtdf-conc-fig' % name
        html_writer.embed_matplotlib_figure(conc_fig, width=640, height=480, name=fname)
        self.html_writer.write('</div>')
# Plot the power spectrum as a function of frequency on figure 3.
# ``freq`` and ``power`` are expected to be defined earlier in the script.
figid = pylab.figure(num=3, figsize=(8, 6))
pn = figid.add_subplot(1, 1, 1)
pn.plot(freq, power)
pn.set_title('Power Spectrum of Sunspot Data')
pn.set_xlabel('Frequency (cycles/Year)')
pn.set_ylabel('|FFT(f(t))|$^2$')

# Plot the same spectrum as a function of period on a new figure.
# Any zero entry in ``freq`` maps to an infinite period here.
period = 1. / freq
fig = pylab.figure(figsize=(8, 6))
pylab.plot(period, power)
pylab.xlim(0., 100.)
pylab.title('Power Spectrum of Sunspot Data')
pylab.xlabel('Period (Years/cycle)')
pylab.ylabel('|FFT(f(t))|^2')

# Find finite values
indFinite = pylab.where(pylab.isfinite(power))[0]

# Find index where |F|^2 is maximum
maxpower = power[indFinite].max()
ind = pylab.where(power[indFinite] == maxpower)[0]

# ``ind`` holds every position that ties for the maximum.  Use the first one
# as a single scalar index into the full arrays so that the marker and the
# '%8.3f' label always receive scalars, even when several samples tie.
peak = indFinite[ind[0]]

# Peak value
pylab.plot(period[peak], power[peak], marker='o')

# Annotation
pylab.text(period[peak] + 3, power[peak],
           'Period = %8.3f years' % period[peak], color='k')
def weightedQuantilesByGroup(pandasDF, quantilesOf, byGroup=None, weightVar='weight',
                             varPrefix='qtl_', varsByQuantile=None):
    """Add a column of weighted quantiles, computed separately within each group.

    Derived (2013 Jan) from pystata's generateRankingData(), but operating on a
    pandas DataFrame instead of Stata data.  It creates no files, computes no
    Gini coefficients and does no plotting; it only returns augmented data.

    Args:
        pandasDF: DataFrame containing the columns named by ``quantilesOf``,
            ``weightVar`` and ``byGroup``.
        quantilesOf: name of the single column to compute quantiles of.
        byGroup: column name(s) to group by, either a list or one
            space-separated string (e.g. ``'year PRuid'``).  ``None`` or an
            empty value treats the whole frame as one group.
        weightVar: name of the column holding the sample weights.
        varPrefix: prefix of the new column, which is named
            ``varPrefix + quantilesOf``.
        varsByQuantile: accepted for interface compatibility but currently
            ignored; per-quantile statistics of other variables are not
            implemented.

    Returns:
        A DataFrame with the additional column ``varPrefix + quantilesOf``
        holding whatever ``weightedQuantile`` returns for each group.

    Side effects:
        The new column is also added to ``pandasDF`` itself, filled with NaN.
        A progress message is written to stdout.

    Raises:
        AssertionError: if ``quantilesOf`` is not a column of ``pandasDF``, or
            if ``weightedQuantile`` returns something that is neither
            ``np.nan`` nor of the same length as its group.
    """
    import sys
    import numpy as np

    df = pandasDF
    assert quantilesOf in df

    if isinstance(byGroup, str):
        # split() without an argument also drops empty names, so '' becomes [].
        byGroup = byGroup.split()
    ungrouped = byGroup is None or (isinstance(byGroup, (list, tuple)) and not byGroup)

    newvar = varPrefix + quantilesOf
    df[newvar] = np.nan

    def getq(adf):
        # Plain arrays are passed on purpose: without .values the result
        # fails to preserve row order.
        ww = weightedQuantile(adf[quantilesOf].values, adf[weightVar].values)
        adf[newvar] = ww
        assert ww is np.nan or len(ww) == len(adf)
        return adf

    # Written without a newline so that ' [Done]' ends up on the same line.
    sys.stdout.write('Calculating quantiles... ')
    if ungrouped:
        # groupby(None) raises; treat the whole frame as one group.  Work on a
        # copy so that, as in the grouped case, the result is a new frame.
        withquantiles = getq(df.copy())
    else:
        withquantiles = df.groupby(byGroup, group_keys=False).apply(getq)
    print(' [Done]')
    return withquantiles
def filter_finite(data):
    """Return a per-row boolean mask that is True where every value is finite.

    Args:
        data: pandas DataFrame whose entries can all be cast to float.

    Returns:
        Boolean Series with the same index as ``data``; an entry is True when
        no value in that row is NaN or +/-inf.  It can be used directly as a
        row selector, e.g. ``data[filter_finite(data)]``.
    """
    # Reduce the finiteness test itself along each row.  Indexing the frame
    # with the 2-D mask and reducing the result would test the truthiness of
    # the remaining values instead of their finiteness.
    finite = pl.isfinite(data.astype(float))
    return finite.all(axis=1)
def apply(self, sim):
    """Distribute this time step's tests over the population and administer them.

    Nothing happens when ``sim.t`` is before ``self.start_day``, after
    ``self.end_day`` (if set), past the end of ``self.daily_tests``, or when
    the rescaled number of tests for the day is zero or not finite.
    Otherwise the number of tests is added to ``sim.results['new_tests'][t]``,
    a per-person testing weight is built, and the people to test are drawn
    with ``cvu.choose_w`` and passed to ``sim.people.test``.

    Args:
        sim: the simulation object; this method reads ``t``, ``n``,
            ``rescale_vec``, ``results``, ``people`` and ``sim['pop_size']``.

    Returns:
        None.
    """
    t = sim.t
    if t < self.start_day:
        return
    elif self.end_day is not None and t > self.end_day:
        return

    # Check that there are still tests
    rel_t = t - self.start_day
    if rel_t >= len(self.daily_tests):
        return

    # Number of tests for this day -- rescaled.  Finiteness is checked before
    # the int() conversion because int(nan) and int(inf) raise.
    scaled_tests = self.daily_tests[rel_t] / sim.rescale_vec[t]
    if not np.isfinite(scaled_tests):
        return
    n_tests = int(scaled_tests)
    if not n_tests:  # If there are no tests today, abort early
        return
    sim.results['new_tests'][t] += n_tests

    test_probs = np.ones(sim.n)  # Begin by assigning equal testing probability to everyone

    # Calculate test probabilities for people with symptoms
    symp_inds = cvu.true(sim.people.symptomatic)
    symp_test = self.symp_test
    if self.pdf:  # Handle the onset to swab delay
        symp_time = cvd.default_int(t - sim.people.date_symptomatic[symp_inds])  # Find time since symptom onset
        inv_count = np.bincount(symp_time) / len(symp_time)  # Fraction of symptomatic people at each time since onset
        count = np.nan * np.ones(inv_count.shape)  # Initialize the count
        count[inv_count != 0] = 1 / inv_count[inv_count != 0]  # Update the counts where defined
        # Build a new object rather than using *=, so self.symp_test is never
        # modified in place should it be an array.
        symp_test = symp_test * (self.pdf.pdf(symp_time) * count[symp_time])  # Put it all together

    test_probs[symp_inds] *= symp_test  # Update the test probabilities

    # Handle symptomatic testing, taking into account prevalence of ILI symptoms
    if self.ili_prev is not None:
        if rel_t < len(self.ili_prev):
            n_ili = int(self.ili_prev[rel_t] * sim['pop_size'])  # Number with ILI symptoms on this day
            # Give some people some symptoms. Assuming that this is
            # independent of COVID symptomaticity...
            ili_inds = cvu.choose(sim['pop_size'], n_ili)
            # People already symptomatic were weighted above; do not weight them twice.
            ili_inds = np.setdiff1d(ili_inds, symp_inds)
            test_probs[ili_inds] *= self.symp_test

    # Handle quarantine testing
    quar_inds = get_quar_inds(self.quar_policy, sim)
    test_probs[quar_inds] *= self.quar_test

    # Handle any other user-specified testing criteria
    if self.subtarget is not None:
        subtarget_inds, subtarget_vals = get_subtargets(self.subtarget, sim)
        test_probs[subtarget_inds] = test_probs[subtarget_inds] * subtarget_vals

    # Don't re-diagnose people
    diag_inds = cvu.true(sim.people.diagnosed)
    test_probs[diag_inds] = 0.

    # Now choose who gets tested and test them
    test_inds = cvu.choose_w(probs=test_probs, n=n_tests, unique=False)
    sim.people.test(test_inds, self.sensitivity, loss_prob=self.loss_prob,
                    test_delay=self.test_delay)
    return
data[i_scan] = pl.append(data[i_scan], 0) lentghs = map(len, data) dims = set(lentghs) dims = map(pl.shape, data) data = pl.vstack(data).T channels = pl.arange(data.shape[0]) if doBGcorrect: for l in xrange(data.shape[1]): indf = data[:,l] < (pl.median(data[:,l])) poly = rt.PolynomialFit(channels, data[:,l], indf=indf) data[:,l] -= poly data /= mon if "3d" in colname and not saveonly: if "-log" in colname: imdata = pl.log(data) ind = pl.isfinite(imdata) else: imdata = data ind = slice(None) vmin[j] = 0#min(vmin[j], imdata[ind].min()) vmax[j] = max(vmax[j], imdata[ind].max()) thisax.imshow(pl.flipud(imdata), aspect="auto", vmin=vmin[j], vmax=vmax[j], extent=(x[0], x[-1], 0, data.shape[0]-1)) thisax.set_title("#%i: %s"%(i, scan.command())) thisax.grid(False) elif "2d" in colname: if "roi" in colname: imax = data.sum(1).argmax() ind = slice(imax - nint, imax + nint) print("Channels in roi: %i...%i"%(imax - nint, imax + nint))