def baseline_cutter(mol):
    """Cut a vis file.

    It seems like doing this with uvaver is no good because it drops the
    SPECSYS keyword from the header, so the cut is now implemented with
    CASA in the early split and this function is no longer used.

    This one uses Popen with cwd (change working directory) because the
    path was getting longer than the buffer's 64-character limit. That
    approach could be translated to the other functions as well, but
    would take some work.
    """
    filepath = './data/' + mol + '/'
    min_baseline = lines[mol]['baseline_cutoff']
    name = mol
    new_name = name + '-short' + str(min_baseline)

    print("\nCompleted uvaver; starting fits uvout\n")
    sp.call(['fits', 'op=uvout',
             'in={}.vis'.format(new_name),
             'out={}.uvf'.format(new_name)],
            cwd=filepath)

    # Now clean that out file.
    print("\nCompleted fits uvout; starting ICR\n\n")
    icr(filepath + new_name, mol)

    # For some reason icr() is returning without these getting deleted.
    # Fix later. (Note that cwd=filepath is needed here too, since
    # new_name is a relative path.)
    sp.Popen('rm -rf {}.bm'.format(new_name), shell=True, cwd=filepath)
    sp.Popen('rm -rf {}.cl'.format(new_name), shell=True, cwd=filepath)
    sp.Popen('rm -rf {}.mp'.format(new_name), shell=True, cwd=filepath)

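# For reference: a minimal sketch of the CASA-side baseline cut that
# replaced this function (per the docstring above, since uvaver drops
# SPECSYS). This would run inside a CASA session; the filenames and the
# 20-klambda cutoff are illustrative placeholders, not the pipeline's
# actual values.
split(vis='raw_data/hco.ms',               # hypothetical input MS
      outputvis='data/hco/hco-short20.ms',
      uvrange='>20klambda',                # keep only baselines past the cutoff
      datacolumn='data')
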
def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    This version works out of a symlinked scratch directory.

    Args:
        mol (str): the name of the line (and core data file) to pull.
        niters (int): how many clean iterations to use.
        baselines (list of ints): the baseline cutoffs to check over.
        remake_all (bool): if True, rebuild the run directory from scratch.
    """
    # Set up the symlink
    run_dir = './baselines/baseline_' + mol + str(int(niters)) + '/'
    scratch_dir = '/scratch/jonas/' + run_dir
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all or not already_exists(new_vis):
        remove(scratch_dir)
        # [:-1] because a symlink with a deleted root isn't a directory anymore
        remove(run_dir[:-1])

        sp.call('mkdir {}'.format(scratch_dir), shell=True)
        sp.call(['ln', '-s', scratch_dir, './baselines/'])
        sp.call(['cp', '-r', '{}.vis'.format(orig_vis), '{}/'.format(run_dir)])
        print("Made symlinked directory, copied core .vis over.\n\n")

    data_list = []
    for b in baselines:
        print('\n\n\n    NEW ITERATION\nBaseline: ', b, '\n')
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm'):
            print("File already exists; going straight to imstat.")
        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
        mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms, 'Mean': mean, 'Baseline': b}
        data_list.append(step_output)
        print(step_output)

    return pd.DataFrame(data_list)

def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    Args:
        mol (str): the name of the line (and core data file) to pull.
        niters (int): how many clean iterations to use.
        baselines (list of ints): the baseline cutoffs to check over.
        remake_all (bool): if True, rebuild the run directory from scratch.
    """
    # Set up the run directory
    run_dir = './data/' + mol + '/baseline_testing/'
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all or not already_exists(new_vis):
        sp.call('mkdir {}'.format(run_dir), shell=True)
        sp.call(['cp', '-r', '{}.vis'.format(orig_vis), '{}'.format(run_dir)])

    data_list = []
    for b in baselines:
        print('\n\n\n    NEW ITERATION\nBaseline: ', b, '\n')
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm'):
            print("File already exists; going straight to imstat.")
        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
        mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms, 'Mean': mean, 'Baseline': b}
        data_list.append(step_output)
        print(step_output)

    return pd.DataFrame(data_list)

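# A minimal sketch of how the DataFrame returned above might be used to
# pick a cutoff: plot image RMS against the baseline cutoff and flag the
# minimum. The plotting choices here are illustrative, not part of the
# pipeline itself.
import matplotlib.pyplot as plt

def plot_baseline_rmss(mol, niters=1e4):
    """Plot image RMS vs. baseline cutoff and return the best row."""
    df = get_baseline_rmss(mol, niters=niters)
    best = df.loc[df['RMS'].idxmin()]

    fig, ax = plt.subplots()
    ax.plot(df['Baseline'], df['RMS'], marker='o')
    ax.axvline(best['Baseline'], linestyle='--',
               label='min RMS at cutoff {}'.format(best['Baseline']))
    ax.set_xlabel('Baseline cutoff')
    ax.set_ylabel('Image RMS')
    ax.legend()
    plt.show()
    return best
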
def run_full_pipeline():
    """Run the whole thing.

    Note that this no longer produces both cut and uncut output; since the
    cut happens much earlier, it now only produces one or the other
    (depending on whether or not cut_baselines is True).

    The process:
        - casa_sequence():
            - cvel the continuum-subtracted dataset from jonas/raw_data to here.
            - split out the 50 channels around the rest frequency.
            - convert that .ms to a .uvf.
        - var_vis(): pull in that .uvf and add variances, yielding another .uvf.
        - convert that to a .vis.
        - icr that .vis to get a .cm.
        - convert the .cm to .fits; now we have mol.{uvf, vis, fits, cm}.
        - delete the clutter files: _split, _cvel, _exportuvfits, bm, cl, mp.
    """
    t0 = time.time()
    mol = input('Which line (HCN, HCO, CS, or CO)?\n').lower()
    cut = input('Cut baselines for better signal (y/n)?\n').lower()
    cut_baselines = (cut == 'y')
    remake = input('Remake everything (y/n)?\n')
    remake_all = (remake.lower() == 'y')

    # Paths to the data
    jonas = '/Volumes/disks/jonas/'
    raw_data_path = jonas + 'raw_data/'
    final_data_path = jonas + 'modeling/data/' + mol + '/'
    name = mol
    if cut_baselines is True:
        name += '-short' + str(lines[mol]['baseline_cutoff'])

    # Establish a string for the log file to be made at the end
    log = 'Files created on ' + today + '\n\n'

    if remake_all is True:
        # This doesn't work yet.
        print("Remaking everything; emptying line dir and starting fresh.")
        remove(final_data_path + '*')
        log += "Full remake occurred; all files are fresh.\n\n"
    else:
        log += "Some files already existed and so were not remade.\n"
        log += "Watch out for inconsistencies.\n\n"

    print("Now processing data....")
    casa_sequence(mol, raw_data_path, final_data_path + name, cut_baselines)

    print("Running var_vis....\n\n")
    if already_exists(final_data_path + name + '.uvf') is False:
        # Note that var_vis takes in mol_exportuvfits and returns mol.uvf
        var_vis(final_data_path + name)
    print("Finished var_vis; converting uvf to vis now....\n\n")

    # Note that this is different from lines[mol]['chan0_freq'] because
    # it's dealing with the chopped vis set.
    restfreq = lines[mol]['restfreq']
    f = fits.getheader(final_data_path + name + '.uvf')
    # chan0_freq = (f['CRVAL4'] - (f['CRPIX4'] - 1) * f['CDELT4']) * 1e-9
    # Using the same math as in lines 130-135:
    # chan0_vel = c * (chan0_freq - restfreq) / restfreq

    data, header = fits.getdata(final_data_path + name + '.uvf', header=True)
    header['RESTFREQ'] = restfreq * 1e9
    fits.writeto(final_data_path + name + '.uvf', data, header, overwrite=True)

    if already_exists(final_data_path + name + '.vis') is False:
        sp.Popen(['fits', 'op=uvin',
                  'in={}.uvf'.format(name),
                  # DON'T PUT THIS BACK IN.
                  # Or if you do, flip the sign of chan0_vel to positive:
                  # 'velocity=lsr,{},1'.format(chan0_vel),
                  'out={}.vis'.format(name)],
                 cwd=final_data_path).wait()

    print("Convolving data to get image, converting output to .fits\n\n")
    if already_exists(final_data_path + name + '.cm') is False:
        icr(final_data_path + name, mol=mol)

    print("Deleting the junk process files...\n\n")
    fpath = final_data_path + name
    files_to_remove = [fpath + '.bm', fpath + '_split.*',
                       fpath + '.cl', fpath + '_cvel.*',
                       fpath + '.mp', fpath + '_exportuvfits.*',
                       'casa*.log', '*.last']
    remove(files_to_remove)

    tf = time.time()
    t_total = (tf - t0) / 60
    log += '\nThis processing took ' + str(t_total) + ' minutes.'
    with open(final_data_path + 'file_log.txt', 'w') as f:
        f.write(log)
    print("All done! This processing took " + str(t_total) + " minutes.")

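# The pipeline leans on two small helpers that aren't defined in this
# section: already_exists() and remove(). These are minimal sketches of
# what they might look like, inferred only from the call sites above
# (remove() accepts a single glob pattern or a list of them, and
# already_exists() must handle MIRIAD outputs, which are directories, as
# well as plain files); the real implementations may differ.
import glob
import subprocess as sp

def already_exists(path):
    """Return True if path (a file or a MIRIAD directory) exists."""
    return len(glob.glob(path)) > 0

def remove(paths):
    """rm -rf a glob pattern, or a list of them."""
    if isinstance(paths, str):
        paths = [paths]
    for p in paths:
        sp.call('rm -rf {}'.format(p), shell=True)
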
def fullRun(diskAParams, diskBParams, mol,
            use_a_previous_result=False, cut_central_chans=False):
    """Run it all.

    diskXParams are fed in from full_run.py, where the parameter
    selections are made.
    """
    t0 = time.time()

    # Calculate the number of steps and the consequent runtime
    na = 1
    for a in diskAParams:
        na *= len(diskAParams[a])
    nb = 1
    for b in diskBParams:
        nb *= len(diskBParams[b])
    n, dt = na + nb, 2.1
    t = n * dt
    if t <= 60:
        t = str(round(t, 2)) + " minutes."
    elif t <= 1440:
        t = str(round(t / 60, 2)) + " hours."
    else:
        t = str(round(t / 1440, 2)) + " days."

    # Resize the chi2 containers to fit this run.
    diskA_shape = [len(diskAParams[p]) for p in param_names]
    diskB_shape = [len(diskBParams[p]) for p in param_names]
    global diskARawX2, diskARedX2, diskBRawX2, diskBRedX2
    diskARawX2 = np.zeros(diskA_shape)
    diskARedX2 = np.zeros(diskA_shape)
    diskBRawX2 = np.zeros(diskB_shape)
    diskBRedX2 = np.zeros(diskB_shape)

    # Get the directory paths lined up, avoiding collisions with old runs.
    this_run_basename = today + '_' + mol
    this_run = this_run_basename
    modelPath = './gridsearch_runs/' + this_run
    run_counter = 2
    # while already_exists_old(modelPath) is True:
    # while already_exists('/'.join(modelPath.split('/')[:-1])) is True:
    while already_exists(modelPath) is True:
        this_run = this_run_basename + '-' + str(run_counter)
        modelPath = './gridsearch_runs/' + this_run
        run_counter += 1
    # Add the file base name onto the path.
    modelPath += '/' + this_run

    # Parameter check:
    print("\nThis run will fit for", mol.upper())
    print("It will iterate through these parameters for Disk A:")
    for p in diskAParams:
        print(p, ': ', diskAParams[p])
    print("\nAnd these values for Disk B:")
    for p in diskBParams:
        print(p, ': ', diskBParams[p])
    print("\nThis run will take", n, "steps, spanning about", t)
    print("Output will be in", modelPath, '\n')
    response = input('Sound good? (Enter to begin, anything else to stop)\n')
    if response != "":
        return "\nGo fix whatever you don't like and try again.\n\n"
    print("Sounds good!\n")

    new_dir = '/Volumes/disks/jonas/modeling/gridsearch_runs/' + this_run
    sp.call(['mkdir', 'gridsearch_runs/' + this_run])

    # CHECK FOR REUSE
    """This looks a little janky but makes sense. Since we treat the two
    disks as independent, if one run finds good fits for a disk (no edge
    values), it doesn't make sense to run that grid again; it's better to
    grab the relevant information from that run and only fit the disk
    that still needs fitting. That's what this is for."""
    to_skip = ''
    if use_a_previous_result is True:
        response2 = input(
            'Please enter the path to the .fits file to use from a previous '
            'run (should be ./models/date/run_date/datefitted_[A/B].fits)\n')
        if 'A' in response2:
            to_skip = 'fitted_A'
        elif 'B' in response2:
            to_skip = 'fitted_B'
        else:
            print("Bad path; must have 'fitted_A' or 'fitted_B' in it. "
                  "Try again.")
            return

    # STARTING THE RUN #
    # Make the initial static model (B), just with the first parameter values.
    dBInit = {}
    for p in diskBParams:
        dBInit[p] = diskBParams[p][0]

    # Grid search over Disk A, retrieving the resulting pd.DataFrame.
    if to_skip != 'A':
        df_A_fit = gridSearch(diskAParams, dBInit, mol, 0, modelPath, n,
                              cut_central_chans=cut_central_chans)

    # Find where the chi2 is minimized and save it.
    # (NOTE: if to_skip is 'A', nothing ever loads the previous fit, so
    # df_A_fit would be undefined here; that reuse path is unfinished.)
    idx_of_BF_A = df_A_fit.index[df_A_fit['Reduced Chi2'] == np.min(
        df_A_fit['Reduced Chi2'])][0]
    print("Index of Best Fit, A is ", idx_of_BF_A)

    # Make a dict of those parameters to pass to the next round of grid
    # searching.
    fit_A_params = {}
    for param in df_A_fit.columns:
        fit_A_params[param] = df_A_fit[param][idx_of_BF_A]
    print("First disk has been fit\n")

    # Now search over the other disk.
    df_B_fit = gridSearch(diskBParams, fit_A_params, mol, 1, modelPath, n,
                          steps_so_far=na,
                          cut_central_chans=cut_central_chans)
    idx_of_BF_B = df_B_fit.index[df_B_fit['Reduced Chi2'] == np.min(
        df_B_fit['Reduced Chi2'])][0]
    fit_B_params = {}
    for param in df_B_fit.columns:
        fit_B_params[param] = df_B_fit[param][idx_of_BF_B]

    # Bind the data frames and output them.
    # Reiterated in tools.py/depickler(), but we can unwrap these vals with:
    # full_log.loc['A', :] to get all the columns for disk A, or
    # full_log.loc[:, 'Incl.'] to see which inclinations both disks tried.
    full_log = pd.concat([df_A_fit, df_B_fit], keys=['A', 'B'], names=['Disk'])
    # Pickle the step log df.
    pickle.dump(full_log, open('{}_step-log.pickle'.format(modelPath), "wb"))
    # To read the pickle:
    # f = pickle.load(open('{}_step-log.pickle'.format(modelPath), "rb"))

    # Finally, create the final best-fit model and residuals.
    print("\n\nCreating best fit model now")
    sample_model_in_uvplane(modelPath + '_bestFit', mol=mol)
    sample_model_in_uvplane(modelPath + '_bestFit', option='subtract', mol=mol)
    icr(modelPath + '_bestFit', mol=mol)
    icr(modelPath + '_bestFit_resid', mol=mol)
    print("Best-fit model created: " + modelPath + "_bestFit.im\n\n")

    # Calculate and present the final chi2 values.
    finalX2s = chiSq(modelPath + '_bestFit', mol)
    print("Final Raw Chi-Squared Value: ", finalX2s[0])
    print("Final Reduced Chi-Squared Value: ", finalX2s[1])

    # Clock out
    t1 = time.time()
    t_total = (t1 - t0) / 60
    # n + 4 to account for best-fit model making and the static disks in
    # the grid search.
    t_per = str(t_total / (n + 4))
    with open(modelPath + '_stepDurations.csv', 'w') as f:
        wr = csv.writer(f)
        wr.writerows(times)
    print("\n\nFinal run duration was", t_total / 60, ' hours')
    print('with each step taking on average', t_per, ' minutes')

    # Log file with best-fit values, ranges queried, and best chi2s.
    with open(modelPath + '_summary.log', 'w') as f:
        s0 = '\nLOG FOR RUN ON ' + today + ' FOR THE ' + mol + ' LINE'
        s1 = '\nBest Chi-Squared values [raw, reduced]:\n' + str(finalX2s)
        s2 = '\n\n\nParameter ranges queried:\n'
        s3 = '\nDisk A:\n'
        for p in diskAParams:
            s3 += p + ': ' + str(diskAParams[p]) + '\n'
        s4 = '\nDisk B:\n'
        for p in diskBParams:
            s4 += p + ': ' + str(diskBParams[p]) + '\n'
        s5 = '\n\n\nBest-fit values (Tatm, Tqq, Xmol, outerR, PA, Incl):'
        s6 = '\nDisk A:\n' + str(fit_A_params)
        s7 = '\nDisk B:\n' + str(fit_B_params)
        s8 = '\n\n\nFinal run duration was ' + str(t_total / 60) + ' hours'
        s9 = '\nwith each step taking on average ' + t_per + ' minutes'
        s10 = '\n\nData file used was ' + dataPath
        f.write(s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10)

    run = GridSearch_Run(modelPath, save_all_plots=True)
    print("Successfully finished everything.")

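# A minimal sketch of how fullRun() might be invoked from full_run.py.
# The parameter names follow the DataFrame columns used in this module;
# the grid values themselves are illustrative placeholders, and the real
# ranges must line up with the module-level param_names.
import numpy as np

diskAParams = {'Atms Temp': np.arange(100, 500, 100),
               'Temp Struct': np.array([-0.5, 0., 0.5]),
               'Molecular Abundance': np.array([1e-10, 1e-9, 1e-8]),
               'Outer Radius': np.arange(100, 500, 100),
               'Pos. Angle': np.array([70.]),
               'Incl.': np.array([65.]),
               'Offset X': np.array([0.]),
               'Offset Y': np.array([0.]),
               'Systemic Velocity': np.array([10.])}
# Disk B would get its own ranges over the same keys; reusing A's here
# just keeps the sketch short.
diskBParams = dict(diskAParams)

fullRun(diskAParams, diskBParams, mol='hco', cut_central_chans=False)
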
def fullRun(diskAParams, diskBParams, use_a_previous_result=False):
    """Run it all (older version; superseded by the one above).

    diskXParams are fed in from full_run.py, where the parameter
    selections are made. Here they are lists of value-lists rather than
    dicts. Note that this version relies on the module-level `mol`,
    `param_names`, and `dataPath`.
    """
    t0 = time.time()

    # Calculate the number of steps and the consequent runtime
    na = 1
    for a in diskAParams:
        na *= len(a)
    nb = 1
    for b in diskBParams:
        nb *= len(b)
    n, dt = na + nb, 2.1
    t = n * dt
    t = str(t) + " minutes." if t < 60 else str(t / 60) + " hours."

    # Begin setting up the symlink and get the directory paths lined up
    scratch_home = '/scratch/jonas/'
    this_run_basename = today
    run_counter = 2
    this_run = this_run_basename
    while already_exists_old(scratch_home + this_run) is True:
        this_run = this_run_basename + '-' + str(run_counter)
        run_counter += 1

    # Cool. Now we know where we're symlinking to.
    scratch_dir = scratch_home + this_run
    modelPath = './models/' + this_run + '/' + this_run

    # Parameter check:
    print("This run will iterate through these parameters for Disk A:")
    print(diskAParams)
    print("\nAnd these values for Disk B:\n", diskBParams)
    print("\nThis run will take", n, "steps, spanning about", t)
    print("\nOutput will be in", modelPath, '\n')
    response = input('Sound good? (Enter to begin, anything else to stop)\n')
    if response != "":
        return "\nGo fix whatever you don't like and try again.\n\n"
    print("Sounds good!\n")

    # Make the symlink:
    # remove(modelPath)
    # remove(scratch_dir)
    sp.call(['mkdir', scratch_dir])
    sp.call(['ln', '-s', scratch_dir, './models/'])

    # CHECK FOR REUSE
    """This looks a little janky but makes sense. Since we treat the two
    disks as independent, if one run finds good fits for a disk (no edge
    values), it doesn't make sense to run that grid again; it's better to
    grab the relevant information from that run and only fit the disk
    that still needs fitting. That's what this is for."""
    to_skip = ''
    if use_a_previous_result is True:
        response2 = input(
            'Please enter the path to the .fits file to use from a previous '
            'run (should be ./models/date/run_date/datefitted_[A/B].fits)\n')
        if 'A' in response2:
            to_skip = 'fitted_A'
        elif 'B' in response2:
            to_skip = 'fitted_B'
        else:
            print("Bad path; must have 'fitted_A' or 'fitted_B' in it. "
                  "Try again.")
            return

    # STARTING THE RUN #
    # Make the initial static model (B), just with the first parameter values.
    dBInit = [i[0] for i in diskBParams]

    # Grid search over Disk A, retrieving the resulting pd.DataFrame.
    if to_skip != 'A':
        df_A_fit = gridSearch(diskAParams, dBInit, 0, modelPath, n,
                              cut_central_chans=False)

    # Find where the chi2 is minimized and save it.
    idx_of_BF_A = df_A_fit.index[df_A_fit['Reduced Chi2'] == np.min(
        df_A_fit['Reduced Chi2'])][0]
    print("Index of Best Fit, A is ", idx_of_BF_A)

    # Make a list of those parameters to pass to the next round of grid
    # searching.
    Ps_A = [df_A_fit['Atms Temp'][idx_of_BF_A],
            df_A_fit['Temp Struct'][idx_of_BF_A],
            df_A_fit['Molecular Abundance'][idx_of_BF_A],
            df_A_fit['Outer Radius'][idx_of_BF_A],
            df_A_fit['Pos. Angle'][idx_of_BF_A],
            df_A_fit['Incl.'][idx_of_BF_A],
            df_A_fit['Offset X'][idx_of_BF_A],
            df_A_fit['Offset Y'][idx_of_BF_A],
            df_A_fit['Systemic Velocity'][idx_of_BF_A]]
    fit_A_params = np.array(Ps_A)
    print("First disk has been fit\n")

    # Now search over the other disk.
    df_B_fit = gridSearch(diskBParams, fit_A_params, 1, modelPath, n,
                          steps_so_far=na, cut_central_chans=False)
    idx_of_BF_B = df_B_fit.index[df_B_fit['Reduced Chi2'] == np.min(
        df_B_fit['Reduced Chi2'])][0]
    Ps_B = [df_B_fit['Atms Temp'][idx_of_BF_B],
            df_B_fit['Temp Struct'][idx_of_BF_B],
            df_B_fit['Molecular Abundance'][idx_of_BF_B],
            df_B_fit['Outer Radius'][idx_of_BF_B],
            df_B_fit['Pos. Angle'][idx_of_BF_B],
            df_B_fit['Incl.'][idx_of_BF_B],
            df_B_fit['Offset X'][idx_of_BF_B],
            df_B_fit['Offset Y'][idx_of_BF_B],
            df_B_fit['Systemic Velocity'][idx_of_BF_B]]
    fit_B_params = np.array(Ps_B)

    # Bind the data frames and output them.
    # Reiterated in tools.py/depickler(), but we can unwrap these vals with:
    # full_log.loc['A', :] to get all the columns for disk A, or
    # full_log.loc[:, 'Incl.'] to see which inclinations both disks tried.
    full_log = pd.concat([df_A_fit, df_B_fit], keys=['A', 'B'], names=['Disk'])
    # Pickle the step log df.
    pickle.dump(full_log, open('{}_step-log.pickle'.format(modelPath), "wb"))
    # To read the pickle:
    # f = pickle.load(open('{}_step-log.pickle'.format(modelPath), "rb"))

    # Finally, create the final best-fit model.
    print("\n\nCreating best fit model now")
    sample_model_in_uvplane(modelPath + '_bestFit', mol=mol)
    icr(modelPath + '_bestFit', mol=mol)
    print("Best-fit model created: " + modelPath + "_bestFit.im\n\n")

    # Calculate and present the final chi2 values.
    finalX2s = chiSq(modelPath + '_bestFit')
    print("Final Raw Chi-Squared Value: ", finalX2s[0])
    print("Final Reduced Chi-Squared Value: ", finalX2s[1])

    # Clock out
    t1 = time.time()
    t_total = (t1 - t0) / 60
    # n + 4 to account for best-fit model making and the static disks in
    # the grid search.
    t_per = str(t_total / (n + 4))
    with open(modelPath + '_stepDurations.csv', 'w') as f:
        wr = csv.writer(f)
        wr.writerows(times)
    print("\n\nFinal run duration was", t_total / 60, ' hours')
    print('with each step taking on average', t_per, ' minutes')

    # Log file with best-fit values, ranges queried, and best chi2s.
    with open('run_' + today + 'summary.log', 'w') as f:
        s0 = '\nLOG FOR RUN ON ' + today + ' FOR THE ' + mol + ' LINE'
        s1 = '\nBest Chi-Squared values [raw, reduced]:\n' + str(finalX2s)
        s2 = '\n\n\nParameter ranges queried:\n'
        s3 = '\nDisk A:\n'
        for i, ps in enumerate(diskAParams):
            s3 += param_names[i] + str(ps) + '\n'
        s4 = '\nDisk B:\n'
        for i, ps in enumerate(diskBParams):
            s4 += param_names[i] + str(ps) + '\n'
        s5 = '\n\n\nBest-fit values (Tatm, Tqq, Xmol, outerR, PA, Incl):'
        s6 = '\nDisk A:\n' + str(fit_A_params)
        s7 = '\nDisk B:\n' + str(fit_B_params)
        s8 = '\n\n\nFinal run duration was ' + str(t_total / 60) + ' hours'
        s9 = '\nwith each step taking on average ' + t_per + ' minutes'
        s10 = '\n\nData file used was ' + dataPath
        f.write(s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10)

    plot_gridSearch_log(modelPath, show=False)
    plot_step_duration(modelPath, show=False)
    plot_fits(modelPath + '_bestFit.fits', show=False)

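# A short sketch of reloading and slicing the pickled step log, following
# the comments above (also reiterated in tools.py/depickler()). The
# modelPath here is a placeholder for a real run's path.
import pickle

modelPath = './models/run_date/run_date'
full_log = pickle.load(open('{}_step-log.pickle'.format(modelPath), 'rb'))

# All the columns for disk A:
disk_a_steps = full_log.loc['A', :]
# Which inclinations both disks tried:
incls = full_log.loc[:, 'Incl.']
# The single best step across both disks:
best_step = full_log.loc[full_log['Reduced Chi2'].idxmin()]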