def compare_masses():
    """Compare the integrated fitted masses with the Shipley catalog masses.

    Reads the two tables from ``output/tables``, drops the rows whose
    fitted ``logM`` is NaN, fits ``core.linear`` to the mass offset
    (fit - Shipley) as a function of the Shipley mass, and then draws four
    figures through the project ``plt`` helpers:

    1. the mass offset versus the Shipley mass, with the fitted line,
    2. the fitted mass versus the offset-corrected Shipley mass, with the
       line of equality,
    3. histograms of both mass samples,
    4. a histogram of the fitted ``avglogZ`` column.
    """
    shipley_table = Table.read('output/tables/nbCGs_with-Shipley-mass.fits')
    fitted_table = Table.read(
        'output/tables/nbCGs_integrated-logM-logZ-from-fitting.fits')

    # keep only the rows for which the fitting produced a finite mass
    has_fit = ~np.isnan(fitted_table['logM'])
    log_m_shipley = shipley_table['logM'][has_fit]
    log_m_fit = fitted_table['logM'][has_fit]

    # model the offset between the two mass estimates with `core.linear`
    offset = log_m_fit - log_m_shipley
    best_fit, _ = curve_fit(core.linear, log_m_shipley, offset)

    grid = np.linspace(8.3, 11.5, 1000)
    model_offset = core.linear(grid, *best_fit)

    # figure 1: offset versus Shipley mass, with the fitted relation
    offset_label = (r'$\log(M/M_{\odot})_{\rm fit} - '
                    r'\log(M/M_{\odot})_{\rm DeepSpace}$')
    plt.plot_simple_multi(
        [log_m_shipley, grid], [offset, model_offset],
        ['', 'fit'], ['k', 'r'], ['o', ''], ['', '-'],
        xlabel=r'$\log(M/M_{\odot})_{\rm DeepSpace}$',
        ylabel=offset_label,
        scale='linear')

    # figure 2: fitted mass versus the Shipley mass after applying the
    # correction predicted by the fit above (which is linear)
    corrected_shipley = log_m_shipley + core.linear(log_m_shipley, *best_fit)
    plt.plot_simple_multi(
        [corrected_shipley, grid], [log_m_fit, grid],
        ['', 'equality'], ['k', 'r'], ['o', ''], ['', '--'],
        xlabel=r'$\log(M/M_{\odot})_{\rm DeepSpace, corrected}$',
        ylabel=r'$\log(M/M_{\odot})_{\rm fit}$',
        scale='linear',
        xmin=8.3, xmax=11.5, ymin=8.3, ymax=11.5)

    # figure 3: distributions of the two mass estimates
    plt.histogram_multi(
        [log_m_fit, log_m_shipley], r'$\log(M/M_{\odot})$', [20, 20],
        colors=['k', 'r'], labels=['fit', 'DeepSpace'],
        styles=['-', '-'], histtype='step', loc=2)

    # figure 4: distribution of the integrated fitted metallicities
    plt.histogram(
        fitted_table['avglogZ'],
        r'$\langle \log(Z/Z_{\odot}) \rangle_{L}$',
        bins=20, histtype='step')
def main():
    """Run the three search strategies and plot a histogram for each.

    Random search uses 1000 runs; hill climbing (learning rate 0.1) and
    policy gradient use 100 runs each. A heading is printed before each
    strategy is run.
    """
    print('Random search:')
    random_results = run_random_search(num_runs=1000)
    histograms(random_results, 'random_search')

    learn_rate = 0.1
    print('\nHill climbing (learn_rate={}):'.format(learn_rate))
    climbing_results = run_hill_climbing(learn_rate=learn_rate, num_runs=100)
    histograms(climbing_results, 'hill_climbing', fix_axes=False)

    print('Policy gradient:')
    gradient_results = run_policy_gradient(num_runs=100)
    # the policy-gradient results go through `histogram` (singular), which
    # takes an extra title argument
    histogram(gradient_results, 'policy_gradient', 'Policy gradient')
def simulate(n, M, H):
    """Run ``evolve(M, H)`` ``n`` times and test the final values for normality.

    The last element of each ``evolve`` result is collected, the sample is
    plotted with ``plotting.histogram``, a normal distribution is fitted
    with ``ss.norm.fit``, and ``ss.kstest`` is run against that fitted
    normal. A message (in Spanish) reports whether the p-value is below
    0.01.

    Parameters
    ----------
    n : number of independent simulations to run.
    M, H : forwarded unchanged to ``evolve``.
    """
    final_values = [evolve(M, H)[-1] for _ in range(n)]
    plotting.histogram(final_values)

    mean, std = ss.norm.fit(final_values)
    _, p_value = ss.kstest(final_values, 'norm', args=(mean, std))

    # reject normality when the p-value falls below the 1 % level
    if p_value >= 0.01:
        print("Se puede ajustar a una normal con confianza de 99 %")
    else:
        print("No se ajusta a una normal con confianza de 99 %")
def plot_posteriors(param, minmax=False, percentiles=False):
    """Plot a histogram of one column of the subsample posteriors table.

    Parameters
    ----------
    param : name of the column of ``boneyard/subsample_posteriors.fits``
        to plot; also used as the axis label.
    minmax : if true, print the minimum, median and maximum of the column.
    percentiles : if true, print the 0.15th, 50th and 99.85th percentiles
        of the column.
    """
    posteriors = Table.read('boneyard/subsample_posteriors.fits')
    samples = posteriors[param]

    axis_label = '{}'.format(param)
    plt.histogram(samples, axis_label, bins=50, histtype='step')

    if percentiles:
        summary = np.percentile(samples, [0.15, 50, 99.85])
        print(summary)

    if minmax:
        lowest = np.min(samples)
        median = np.percentile(samples, 50)
        highest = np.max(samples)
        print(lowest, median, highest)
def check_distributions():
    """Plot redshift, stellar-mass and size histograms for the 'Q' galaxies.

    The combined table from ``concatenate_all`` is restricted to the rows
    with ``pop == 'Q'``. The number of bins is the rounded square root of
    the sample size. The redshift histogram is drawn with one labelled
    vertical line per cluster.
    """
    sample = concatenate_all()
    sample = sample[sample['pop'] == 'Q']

    # square-root rule for the number of bins
    n_bins = int(np.round(np.sqrt(len(sample))))

    # (label, vertical-line position, colour) for each cluster
    cluster_lines = [
        ('A2744', 0.308, 'purple'),
        ('A370', 0.375, 'g'),
        ('M416', 0.396, 'gold'),
        ('M717', 0.545, 'r'),
        ('M1149', 0.543, 'orange'),
        ('AS1063', 0.348, 'b'),
    ]
    line_labels = [entry[0] for entry in cluster_lines]
    line_positions = [entry[1] for entry in cluster_lines]
    line_colours = [entry[2] for entry in cluster_lines]

    plt.histogram(sample['z_spec'], r'$z_{\rm spec}$', bins=n_bins,
                  histtype='step', vlines=line_positions,
                  colors=line_colours, labels=line_labels)

    plt.histogram(sample['lmass'], r'$\log(M_{*}/M_{\odot})$', bins=n_bins,
                  histtype='step')

    log_radius = np.log10(sample['flux_radius'])
    plt.histogram(log_radius, r'$\log(R_{\rm e}/{\rm pix})$', bins=n_bins,
                  histtype='step')
# Outdated or alternative human protein names mapped to the symbol used in
# the output. Checked before the prefix rules in `_current_human_symbol`.
_HUMAN_SYMBOL_ALIASES = {
    "A2BP1": "RBFOX1",
    "FOX1": "RBFOX1",
    "SFRS13A": "SRSF10",
    "CUGBP": "CELF1",
    "Fusip1": "SRSF10",
    "HuR": "ELAVL1",
    "MBNL": "MBNL1",
    "PTB": "PTBP1",
    "QK1": "QKI",
    "RBM9": "RBFOX2",
    "STAR-PAP": "TUT1",
    "YB-1": "YBX1",
    "hnRNPK": "HNRNPK",
    "hnRNPLL": "HNRNPLL",
    "HNRPLL": "HNRNPLL",
}


def _current_human_symbol(name):
    '''Return the output symbol for a human protein name (unchanged if unknown).'''
    if name in _HUMAN_SYMBOL_ALIASES:
        return _HUMAN_SYMBOL_ALIASES[name]
    if name[:6] == "BRUNOL":
        # BRUNOL<n> -> CELF<n>, using the final character as the number
        return "CELF{0}".format(name[-1])
    if name[:4] == "SFRS":
        return "SRSF{0}".format(name[4:])
    return name


def _numeric_pssm(rows):
    '''Drop the header row and the first column of a PSSM and convert the rest to floats.'''
    return [[float(value) for value in row[1:]] for row in rows[1:]]


def main():
    '''
    Read in a series of input files on the sequence specificities of RBPs,
    filter the data and write a set of motifs for each RBP.

    The records collected from RBPDB, RBPmap, SFmap and CIS-BP RNA are kept
    as lists of the form [protein, source, PMID, experiment type, motif].
    Motifs are trimmed of flanking Ns and filtered by length, protein
    identifiers are cleaned up, identical motifs of one protein are merged
    and the motifs are finally written out (one ">protein" line followed by
    one "|"-separated line of motifs) together with a histogram of the
    motif set sizes.

    Arguments (see Methods for further details on the input data files):
    upper_threshold, lower_threshold: the longest and shortest a motif is
        allowed to be, respectively
    RBPDB_experiments: path to RBPDB experiments file
    RBPDB_proteins: path to RBPDB proteins file
    RBPDB_PWMs: path to file containing RBPDB PWM identifier to RBP mapping
    pwm_dir: path to directory containing RBPDB PWMs
    RBPmap_PSSMs: path to directory containing RBPmap PSSMs
    SFmap_proteins: path to file containing motifs from SFmap
    RNAcompete_information: path to summary file from CIS-BP RNA
    RNAcompete_PWMs: path to directory containing CIS-BP RNA PWMs
    final_motifs_file_name: name for output file
    plot_name: file for plot displaying the distribution of motif set sizes
    species: the species for which motifs are required
    '''
    # NOTE(review): the human species literal appears as "H**o sapiens"
    # throughout this function; confirm that this is really the value found
    # in the RBPDB proteins file and passed on the command line.
    description = "Compile a set of motifs putatively recognized by RNA-binding proteins."
    args = parse_arguments(description, ["upper_threshold", "lower_threshold", "RBPDB_experiments", "RBPDB_proteins", "RBPDB_PWMs", "pwm_dir", "RBPmap_PSSMs", "SFmap_proteins", "RNAcompete_information", "RNAcompete_PWMs", "final_motifs_file_name", "plot_name", "species"], ints = [0, 1])
    upper_threshold = args.upper_threshold
    lower_threshold = args.lower_threshold
    RBPDB_experiments = args.RBPDB_experiments
    RBPDB_proteins = args.RBPDB_proteins
    RBPDB_PWMs = args.RBPDB_PWMs
    pwm_dir = args.pwm_dir
    RBPmap_PSSMs = args.RBPmap_PSSMs
    SFmap_proteins = args.SFmap_proteins
    RNAcompete_information = args.RNAcompete_information
    RNAcompete_PWMs = args.RNAcompete_PWMs
    final_motifs_file_name = args.final_motifs_file_name
    plot_name = args.plot_name
    species = args.species

    # --- RBPDB experiments -------------------------------------------------
    # skip the header row
    db_fields = rw.read_many_fields(RBPDB_experiments, ",")[1:]
    print("There are {0} RBPDB experiments.".format(len(db_fields)))

    db_proteins = rw.read_many_fields(RBPDB_proteins, ",")
    #species is "H**o sapiens" or "Mus musculus"
    species_proteins = {i[4] for i in db_proteins if i[6] == species}
    db_fields = [i for i in db_fields if i[3] in species_proteins]
    protein_number_before = len({i[3] for i in db_fields})
    print("{0} were performed in {1}.\n".format(len(db_fields), species))

    db_fields = [i for i in db_fields if i[2] != ""]
    protein_number_after = len({i[3] for i in db_fields})
    db_fields = [[i[3], "RBPDB", i[0], i[1], i[2]] for i in db_fields]
    print("After removing experiments with no reported motif, {0} proteins remain of the initial {1}.\n".format(protein_number_after, protein_number_before))

    # --- RBPDB SELEX PWMs --------------------------------------------------
    bases = np.array(["A", "C", "G", "U"])
    db_pwm_list = rw.read_many_fields(RBPDB_PWMs, "\t")
    for pwm_record in db_pwm_list:
        if pwm_record[1] not in species_proteins:
            continue
        current_file_name = "{0}/{1}.pwm".format(pwm_dir, pwm_record[0])
        current_PWM = rw.read_many_fields(current_file_name, delimiter = " ")
        # splitting on single spaces leaves empty fields behind; drop them
        current_PWM = [[float(k) for k in row if k != ""] for row in current_PWM]
        consensus = nc.consensus_from_PWM(current_PWM, bases, 0)
        # the second "_"-separated part of the PWM identifier is used as PMID
        PMID = pwm_record[0].split("_")[1]
        db_fields.append([pwm_record[1], "RBPDB_PWM", PMID, "SELEX", consensus])
    protein_number_after = len({i[0] for i in db_fields})
    print("After adding additional sequences from SELEX PWMs (RBPDB), there are {0} proteins.\n".format(protein_number_after))

    # --- RBPmap PSSMs ------------------------------------------------------
    if species == "Mus musculus":
        RBPmap_proteins = rw.read_many_fields("RBP/RBPmap_proteins.csv", ",")
        RBPmap_proteins = list_to_dict(RBPmap_proteins, 0, 1)
        # proteins whose RBPmap entry mentions "23846655" are skipped below
        RNAc_source = [i for i in RBPmap_proteins if "23846655" in RBPmap_proteins[i]]
    else:
        RNAc_source = []
    for file_name in os.listdir(RBPmap_PSSMs):
        #RBPmap and SFmap don't distinguish between human and mouse motifs
        if "human" not in file_name:
            continue
        protein_name = file_name.split("_")[0]
        if protein_name in RNAc_source:
            continue
        initial_pssm = rw.read_many_fields(os.path.join(RBPmap_PSSMs, file_name), delimiter = "\t")
        current_pssm = _numeric_pssm(initial_pssm)
        consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
        # names starting with SRSF are stored with an SFRS prefix here
        if protein_name[:4] == "SRSF":
            protein_name = "SFRS" + protein_name[4:]
        db_fields.append([protein_name, "RBPmap_PWM", "NULL", "various", consensus])
    protein_number_after = len({i[0] for i in db_fields})
    print("After adding additional sequences from RBPmap PSSMs, there are {0} proteins.\n".format(protein_number_after))

    # --- SFmap -------------------------------------------------------------
    SFmap_data = rw.read_many_fields(SFmap_proteins, delimiter = ",")
    for sfmap_record in SFmap_data:
        # upper-case the motifs and turn ", "-separated lists into
        # ";"-separated ones
        motifs = ";".join([j.upper() for j in sfmap_record[1].split(", ")])
        db_fields.append([sfmap_record[0], "SFmap", "NULL", "various", motifs])
    protein_number_after = len({i[0] for i in db_fields})
    print("After adding motifs from SFmap, there are {0} proteins.\n".format(protein_number_after))

    # --- CIS-BP RNA --------------------------------------------------------
    RNAc = rw.read_many_fields(RNAcompete_information, delimiter = "\t")
    # skip the header row and any empty rows
    RNAc = [i for i in RNAc[1:] if i]
    if species == "H**o sapiens":
        RNAc = [i for i in RNAc if i[3] != "." and i[8] == "D"]
    if species == "Mus musculus":
        RNAc = [i for i in RNAc if i[3] != "."]
    PSSM_folder = RNAcompete_PWMs
    for record in RNAc:
        motif_name = record[3]
        initial_pssm = rw.read_many_fields(os.path.join(PSSM_folder, "{0}.txt".format(motif_name)), delimiter = "\t")
        if not initial_pssm:
            # empty PSSMs are reported unless they come from the RBPDB paper
            if record[19] != "21036867":
                print(record)
            continue
        current_pssm = _numeric_pssm(initial_pssm)
        consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
        db_fields.append([record[6], "CIS-BP_RNA_PWM", record[19], record[14], consensus])
    protein_number_after = len({i[0] for i in db_fields})
    print("After adding motifs from CIS-BP RNA, there are {0} proteins.\n".format(protein_number_after))

    # --- split multi-motif records and filter by length ----------------------
    # A record keeps the first acceptable motif; further acceptable motifs
    # of the same record become new records placed after all the original
    # ones (the same order as appending them to the end of the list).
    kept_records = []
    extra_records = []
    for record in db_fields:
        if ";" in record[4]:
            separator = "; " if "; " in record[4] else ";"
            candidates = record[4].split(separator)
        else:
            candidates = [record[4]]
        candidates = [j.upper().strip("N") for j in candidates]
        candidates = [j for j in candidates if lower_threshold <= len(j) <= upper_threshold and "(" not in j]
        if not candidates:
            continue
        record[4] = candidates[0]
        kept_records.append(record)
        for j in candidates[1:]:
            extra_records.append([record[0], record[1], record[2], record[3], j])
    db_fields = kept_records + extra_records
    protein_number_after = len({i[0] for i in db_fields})
    print("After only keeping motifs of length {0}-{1} bp, {2} proteins remain.\n".format(lower_threshold, upper_threshold, protein_number_after))

    # --- clean up protein identifiers ---------------------------------------
    if species == "Mus musculus":
        protein_names_file = "RBP/RBP_names_for_checking.txt"
        with open(protein_names_file, "w") as names_file:
            for name in sorted({i[0] for i in db_fields}):
                names_file.write("{0}\n".format(name))
        MGI_file = "RBP/MGI_correspondances.txt"
        MGI = rw.read_many_fields(MGI_file, "\t")
        MGI_names_all = {i[0] for i in MGI[1:]}
        # names for which column 0 and column 3 of the MGI file agree
        found = {i[0] for i in MGI if i[0] == i[3]}
        MGI = {i[0]: i[3] for i in MGI[1:] if i[0] not in found}

    cleaned_records = []
    for record in db_fields:
        if species == "Mus musculus":
            # capitalise the first letter and lower-case the rest
            record[0] = "".join([record[0][0].upper(), record[0][1:].lower()])
            #will get rid of Hnrnpcl1, which didn't return anything in the MGI search.
            if record[0] not in MGI_names_all:
                continue
            if record[0] not in found:
                record[0] = MGI[record[0]]
        elif species == "H**o sapiens":
            record[0] = _current_human_symbol(record[0])
        cleaned_records.append(record)
    db_fields = cleaned_records
    protein_number_after = len({i[0] for i in db_fields})
    print("After cleaning up protein IDs, {0} proteins remain.\n".format(protein_number_after))

    # --- group by protein ----------------------------------------------------
    protein_dict = {}
    for record in db_fields:
        protein_dict.setdefault(record[0], []).append(record)

    # The human branch previously compared against a misspelled species
    # name and therefore never ran. Missing proteins are skipped rather
    # than raising a KeyError.
    if species == "H**o sapiens":
        for name in ("PPIE", "MIR1236", "PABPC4"):
            protein_dict.pop(name, None)
        print("After removing PPIE, PABPC4 and MIR1236, {0} proteins remain.\n".format(len(protein_dict)))
    elif species == "Mus musculus":
        protein_dict.pop("Pabpc4", None)
        print("After removing Pabpc4, {0} proteins remain.\n".format(len(protein_dict)))

    # --- add synthetic motifs and merge identical motifs -----------------------
    for protein in protein_dict:
        if protein == "ELAVL1":
            protein_dict[protein].append(['ELAVL1', 'synthetic', 'synthetic', 'synthetic', 'UUWGDUU'])
        elif protein == "ELAVL2":
            protein_dict[protein].append(['ELAVL2', 'synthetic', 'synthetic', 'synthetic', 'RWUUYAUUUWR'])
        merged = []
        for record in sorted(protein_dict[protein], key = lambda x: x[4]):
            if merged and merged[-1][4] == record[4]:
                # same motif as the previous record: fold the source, PMID
                # and experiment type of the previous record into this one
                for k in range(1, 4):
                    record[k] = ",".join([record[k], merged[-1][k]])
                merged[-1] = record
            else:
                merged.append(record)
        protein_dict[protein] = merged

    # reorder the fields to [protein, motif, source, PMID, experiment type]
    for protein in protein_dict:
        protein_dict[protein] = [[j[0], j[4], j[1], j[2], j[3]] for j in protein_dict[protein]]

    # --- write the motifs ----------------------------------------------------
    print("\n")
    print("Writing motifs to {0}.\n".format(final_motifs_file_name))
    motif_numbers = []
    with open(final_motifs_file_name, "w") as final_motifs_file:
        for protein in sorted(protein_dict):
            final_motifs_file.write(">{0}\n".format(protein))
            current_motifs = [j[1] for j in protein_dict[protein]]
            DNA_motifs = [nc.DNA_RNA_conversion(j) for j in current_motifs]
            unravelled_motifs = flatten([nc.unravel_consensus(j) for j in DNA_motifs])
            # remove duplicates; sort so that the output file is reproducible
            unravelled_motifs = sorted(set(unravelled_motifs))
            print("Writing {0} motifs for {1}.".format(len(unravelled_motifs), protein))
            motif_numbers.append(len(unravelled_motifs))
            final_motifs_file.write("{0}\n".format("|".join(unravelled_motifs)))

    plt.figure(1)
    plotting.histogram(motif_numbers, 50, x_lab = "Motif number", y_lab = "Frequency", title = None)
    plotting.save_and_show([10, 10], 100, plot_name)
def compare_bouton_responses(
        exptGrp, ax, stimuli, comp_method='angle', plot_method='cdf',
        channel='Ch2', label=None, roi_filter=None, **response_kwargs):
    """Compare various pairs of boutons, based on several conventions:

    'bouton' in label of bouton ROIs
    boutons targeting a cell soma are tagged with the cell number they are
        targeting, i.e. 'cell1', 'cell2', etc.
    boutons on an axons are tagged with the fiber number they are on,
        i.e. 'fiber1', 'fiber2', etc.
    boutons with no tags have no information about their axon or target

    Every pair of ROIs from the same (mouse, location) is scored with the
    chosen similarity measure, using only the stimuli for which both
    responses are finite. Pairs whose score is NaN are skipped. Each score
    is then sorted into one or more categories according to the 'fiber' and
    'cell' tags of the two ROIs.

    Parameters
    ----------
    exptGrp : iterable of experiments; passed to ``ia.response_matrix`` and
        iterated to look up the ROI tags.
    ax : axis to plot on; its x label is set to name the measure.
    stimuli : passed to ``ia.response_matrix``.
    comp_method : {'angle', 'abs angle', 'corr', 'abs corr', 'mean diff'}
        Similarity measure applied to each pair of response vectors.
    plot_method : 'cdf' or 'hist'; any other value plots nothing.
    channel, label, roi_filter : ROI selection, passed to
        ``ia.response_matrix`` and ``expt.rois``.
    **response_kwargs : passed to ``ia.response_matrix``.

    Returns
    -------
    dict mapping each category name to the list of its scores.

    Raises
    ------
    ValueError
        If ``comp_method`` is not one of the recognized methods.
    """
    response_matrix, rois = ia.response_matrix(
        exptGrp, stimuli, channel=channel, label=label,
        roi_filter=roi_filter, return_full=True, **response_kwargs)

    data = {}
    data['mouse'] = [roi[0] for roi in rois]
    data['loc'] = [roi[1] for roi in rois]
    data['label'] = [roi[2] for roi in rois]

    # Collect the tags of every ROI from all matching experiments.
    # `zip` and `next` are used instead of `it.izip` and `.next()` so that
    # this function runs on both Python 2 and Python 3.
    tags = []
    for mouse, loc, name in zip(data['mouse'], data['loc'], data['label']):
        roi_tags = set()
        for expt in exptGrp:
            if expt.parent != mouse \
                    or expt.get('uniqueLocationKey') != loc:
                continue
            for roi in expt.rois(
                    channel=channel, label=label, roi_filter=roi_filter):
                if roi.label == name:
                    # NOTE: Taking the union of all tags,
                    # so mis-matched tags will just be combined
                    roi_tags = roi_tags.union(roi.tags)
        tags.append(roi_tags)
    data['tags'] = tags
    data['responses'] = list(response_matrix)

    df = pd.DataFrame(data)

    def cosine(roi1, roi2):
        return np.dot(roi1, roi2) / np.linalg.norm(roi1) \
            / np.linalg.norm(roi2)

    def correlation(roi1, roi2):
        return np.corrcoef(roi1, roi2)[0, 1]

    comparators = {
        'angle': cosine,
        'abs angle': lambda roi1, roi2: np.abs(cosine(roi1, roi2)),
        'corr': correlation,
        'abs corr': lambda roi1, roi2: np.abs(correlation(roi1, roi2)),
        'mean diff': lambda roi1, roi2: np.abs(roi1 - roi2).mean(),
    }
    try:
        compare = comparators[comp_method]
    except (KeyError, TypeError):
        raise ValueError('Unrecognized compare method argument')
    ax.set_xlabel('Response similarity ({})'.format(comp_method))

    same_fiber = []
    fiber_with_not = []
    same_soma = []
    soma_with_not = []
    bouton_with_fiber = []
    diff_all = []
    for _, group in df.groupby(['mouse', 'loc']):
        for (_, row1), (_, row2) in it.combinations(group.iterrows(), 2):
            r1_responses = row1['responses']
            r2_responses = row2['responses']
            # only compare stimuli for which both ROIs have a finite response
            non_nan = np.isfinite(r1_responses) & np.isfinite(r2_responses)
            comp = compare(r1_responses[non_nan], r2_responses[non_nan])
            if np.isnan(comp):
                continue

            fiber1 = set(tag for tag in row1['tags'] if 'fiber' in tag)
            fiber2 = set(tag for tag in row2['tags'] if 'fiber' in tag)
            cell1 = set(tag for tag in row1['tags'] if 'cell' in tag)
            cell2 = set(tag for tag in row2['tags'] if 'cell' in tag)
            shared_fiber = bool(fiber1.intersection(fiber2))
            shared_cell = bool(cell1.intersection(cell2))

            if shared_fiber:
                same_fiber.append(comp)
            elif fiber1 or fiber2:
                fiber_with_not.append(comp)

            if shared_cell:
                same_soma.append(comp)
            elif cell1 or cell2:
                soma_with_not.append(comp)

            # one ROI's label appears among the other ROI's fiber tags
            if (fiber1 and row2['label'] in fiber1) \
                    or (fiber2 and row1['label'] in fiber2):
                bouton_with_fiber.append(comp)
            elif not shared_fiber and not shared_cell:
                diff_all.append(comp)

    # (legend label, values) in plotting order
    plot_series = [
        ('same fiber', same_fiber),
        ('same soma', same_soma),
        ('bouton with fiber', bouton_with_fiber),
        ('fiber with not', fiber_with_not),
        ('soma with not', soma_with_not),
        ('diff all', diff_all),
    ]
    if plot_method == 'cdf':
        for series_label, values in plot_series:
            plotting.cdf(ax, values, bins='exact', label=series_label)
    elif plot_method == 'hist':
        colors = lab.plotting.color_cycle()
        for series_label, values in plot_series:
            plotting.histogram(
                ax, values, bins=50, color=next(colors), normed=True,
                label=series_label)

    # ax.legend()

    # the keys deliberately keep their original (mixed) spelling
    return {
        'same fiber': same_fiber, 'same soma': same_soma,
        'bouton_with_fiber': bouton_with_fiber,
        'fiber_with_not': fiber_with_not, 'soma_with_not': soma_with_not,
        'diff all': diff_all
    }