Example #1
def baseline_cutter(mol):
    """Cut a vis file.

    It seems like doing this with uvaver is no good because it drops the
    SPECSYS keyword from the header, so now implementing it with CASA in the
    split early on, so this is no longer used.

    This one uses Popen and cwd (change working directory) because the path was
    getting to be longer than buffer's 64-character limit. Could be translated
    to other funcs as well, but would just take some work.
    """
    filepath = './data/' + mol + '/'
    min_baseline = lines[mol]['baseline_cutoff']
    name = mol
    new_name = name + '-short' + str(min_baseline)

    print("\nCompleted uvaver; starting fits uvout\n")
    sp.call([
        'fits', 'op=uvout', 'in={}.vis'.format(new_name),
        'out={}.uvf'.format(new_name)
    ],
            cwd=filepath)

    # Now clean that out file.
    print("\nCompleted fits uvout; starting ICR\n\n")
    icr(filepath + new_name, mol)

    # icr() keeps returning without deleting these, so clean them up here.
    # Note the filepath prefix: icr ran on filepath + new_name, so the
    # artifact files live in the data directory, not the cwd.
    for ext in ['.bm', '.cl', '.mp']:
        sp.Popen(['rm -rf {}{}'.format(filepath + new_name, ext)], shell=True)
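
# The functions in these examples lean on a handful of project helpers
# (already_exists, remove, icr, imstat) that are defined elsewhere in the
# module. As a rough sketch of the two filesystem helpers, with bodies
# inferred from how they're called here rather than taken from the project's
# actual implementation:

import os
import subprocess as sp


def already_exists(path):
    """Sketch: report whether a file or directory is already present."""
    return os.path.exists(path)


def remove(paths):
    """Sketch: rm -rf one path, or each path in a list (globs included)."""
    if isinstance(paths, str):
        paths = [paths]
    for p in paths:
        sp.call(['rm -rf {}'.format(p)], shell=True)
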
def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    Args:
        mol (str): line name; selects the core data file to pull.
        niters (int): number of clean iterations to hand to icr.
        baselines (list of ints): the baseline cutoffs to check over.
        remake_all (bool): if True, rebuild the working directory from scratch.
    """
    # Set up the symlink
    run_dir = './baselines/baseline_' + mol + str(int(niters)) + '/'
    scratch_dir = '/scratch/jonas/' + run_dir
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all or not already_exists(new_vis):
        remove(scratch_dir)
        # Strip the trailing '/': once its target is deleted, the symlink no
        # longer resolves as a directory, so remove it by its bare name.
        remove(run_dir[:-1])
        sp.call(['mkdir {}'.format(scratch_dir)], shell=True)
        sp.call(['ln', '-s', scratch_dir, './baselines/'])

        sp.call(['cp', '-r', '{}.vis'.format(orig_vis),
                 '{}/'.format(run_dir)])

        print "Made symlinked directory, copied core .vis over.\n\n"

    data_list = []
    for b in baselines:
        print('\n\n\n    NEW ITERATION\nBaseline: {}\n'.format(b))
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm'):
            print("File already exists; going straight to imstat")
            mean, rms = imstat(name, ext='.cm')

        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
            mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms,
                       'Mean': mean,
                       'Baseline': b}

        data_list.append(step_output)
        print(step_output)

    data_pd = pd.DataFrame(data_list)
    return data_pd
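
# A minimal usage sketch for get_baseline_rmss: run the sweep, then pick the
# cutoff with the lowest noise. The 'hco' line choice and the printout are
# illustrative only.
if __name__ == '__main__':
    df = get_baseline_rmss('hco', niters=1e4)
    best = df.loc[df['RMS'].idxmin()]
    print('Lowest RMS ({}) found at baseline cutoff {}'
          .format(best['RMS'], best['Baseline']))
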
Example #3
def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    Args:
        mol (str): line name; selects the core data file to pull.
        niters (int): number of clean iterations to hand to icr.
        baselines (list of ints): the baseline cutoffs to check over.
        remake_all (bool): if True, rebuild the working directory from scratch.
    """
    # Set up the symlink
    run_dir = './data/' + mol + '/baseline_testing/'
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all or not already_exists(new_vis):
        sp.call(['mkdir {}'.format(run_dir)], shell=True)

        sp.call(['cp', '-r', '{}.vis'.format(orig_vis), '{}'.format(run_dir)])

    data_list = []
    for b in baselines:
        print('\n\n\n    NEW ITERATION\nBaseline: ', b, '\n')
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm'):
            print("File already exists; going straight to imstat")
            mean, rms = imstat(name, ext='.cm')

        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
            mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms, 'Mean': mean, 'Baseline': b}

        data_list.append(step_output)
        print(step_output)

    data_pd = pd.DataFrame(data_list)
    return data_pd
Example #4
def run_full_pipeline():
    """Run the whole thing.

    Note that this no longer produces both cut and uncut output; since the
    cut happens much earlier, it now produces one or the other, depending on
    whether cut_baselines is True.
    The Process:
        - casa_sequence():
            - cvel the cont-sub'ed dataset from jonas/raw_data to here.
            - split out the 50 channels around restfreq
            - convert that .ms to a .uvf
        - var_vis(): pull in that .uvf, add variances, resulting in another uvf
        - convert that to a vis
        - icr that vis to get a cm
        - cm to fits; now we have mol.{uvf, vis, fits, cm}
        - delete the clutter files: _split, _cvel, _exportuvfits, bm, cl, mp
    """
    t0 = time.time()
    mol = input('Which line (HCN, HCO, CS, or CO)?\n').lower()
    cut = input('Cut baselines for better signal (y/n)?\n').lower()
    cut_baselines = (cut == 'y')
    remake = input('Remake everything (y/n)?\n')
    remake_all = (remake.lower() == 'y')

    # Paths to the data
    jonas = '/Volumes/disks/jonas/'
    raw_data_path = jonas + 'raw_data/'
    final_data_path = jonas + 'modeling/data/' + mol + '/'
    name = mol
    if cut_baselines:
        name += '-short' + str(lines[mol]['baseline_cutoff'])

    # Establish a string for the log file to be made at the end
    log = 'Files created on ' + today + '\n\n'

    if remake_all:
        # This doesn't work yet.
        print("Remaking everything; emptying the line dir and starting fresh.")
        remove(final_data_path + '*')
        log += "Full remake occurred; all files are fresh.\n\n"
    else:
        log += "Some files already existed and so were not remade.\n"
        log += "Careful for inconsistencies.\n\n"

    print("Now processing data....")
    casa_sequence(mol, raw_data_path, final_data_path + name, cut_baselines)

    print("Running varvis....\n\n")
    if not already_exists(final_data_path + name + '.uvf'):
        # Note that var_vis takes in mol_exportuvfits, returns mol.uvf
        var_vis(final_data_path + name)
    print("Finished varvis; converting uvf to vis now....\n\n")

    # Note: this differs from lines[mol]['chan0_freq'] because we're dealing
    # with the chopped vis set.
    restfreq = lines[mol]['restfreq']
    # chan0_freq = (header['CRVAL4'] - (header['CRPIX4'] - 1)
    #               * header['CDELT4']) * 1e-9
    # chan0_vel = c * (chan0_freq - restfreq) / restfreq
    data, header = fits.getdata(final_data_path + name + '.uvf', header=True)
    header['RESTFREQ'] = restfreq * 1e9
    fits.writeto(final_data_path + name + '.uvf', data, header, overwrite=True)
    if not already_exists(final_data_path + name + '.vis'):
        sp.Popen(
            [
                'fits',
                'op=uvin',
                'in={}.uvf'.format(name),
                # DON'T PUT THIS BACK IN
                # (or, if you do, flip the sign of chan0_vel to positive)
                # 'velocity=lsr,{},1'.format(chan0_vel),
                'out={}.vis'.format(name)
            ],
            cwd=final_data_path).wait()

    print("Convolving data to get image, converting output to .fits\n\n")
    if not already_exists(final_data_path + name + '.cm'):
        icr(final_data_path + name, mol=mol)

    print("Deleting the junk process files...\n\n")
    fpath = final_data_path + name
    files_to_remove = [
        fpath + '.bm', fpath + '_split.*', fpath + '.cl', fpath + '_cvel.*',
        fpath + '.mp', fpath + '_exportuvfits.*', 'casa*.log', '*.last'
    ]
    remove(files_to_remove)

    tf = time.time()
    t_total = (tf - t0) / 60
    log += '\nThis processing took ' + str(t_total) + ' minutes.'
    with open(final_data_path + 'file_log.txt', 'w') as f:
        f.write(log)
    print("All done! This processing took " + str(t_total) + " minutes.")
Example #5
def fullRun(diskAParams,
            diskBParams,
            mol,
            use_a_previous_result=False,
            cut_central_chans=False):
    """Run it all.

    diskXParams are fed in from full_run.py,
    where the parameter selections are made.
    """
    t0 = time.time()

    # Calculate the number of steps and consequent runtime
    na = 1
    for a in diskAParams:
        na *= len(diskAParams[a])

    nb = 1
    for b in diskBParams:
        nb *= len(diskBParams[b])

    n, dt = na + nb, 2.1
    t = n * dt
    if t <= 60:
        t = str(round(t, 2)) + " minutes."
    elif t <= 1440:
        t = str(round(t / 60, 2)) + " hours."
    else:
        t = str(round(t / 1440, 2)) + " days."

    # Update the chi2 containers to be the right sizes.
    diskA_shape = [len(diskAParams[p]) for p in param_names]
    diskB_shape = [len(diskBParams[p]) for p in param_names]
    global diskARawX2, diskARedX2, diskBRawX2, diskBRedX2
    diskARawX2 = np.zeros(diskA_shape)
    diskARedX2 = np.zeros(diskA_shape)
    diskBRawX2 = np.zeros(diskB_shape)
    diskBRedX2 = np.zeros(diskB_shape)

    # Begin setting up symlink and get directory paths lined up
    this_run_basename = today + '_' + mol
    this_run = this_run_basename
    modelPath = './gridsearch_runs/' + this_run
    run_counter = 2
    while already_exists(modelPath):
        this_run = this_run_basename + '-' + str(run_counter)
        modelPath = './gridsearch_runs/' + this_run
        run_counter += 1
    # Add on the file base name to the path.
    modelPath += '/' + this_run

    # Parameter Check:
    print("\nThis run will fit for", mol.upper())
    print("It will iterate through these parameters for Disk A:")
    for p in diskAParams:
        print(p, ': ', diskAParams[p])
    print("\nAnd these values for Disk B:")
    for p in diskBParams:
        print(p, ': ', diskBParams[p])

    print("\nThis run will take", n, "steps, spanning about", t)
    print("Output will be in", modelPath, '\n')
    response = input('Sound good? (Enter to begin, anything else to stop)\n')
    if response != "":
        return "\nGo fix whatever you don't like and try again.\n\n"
    else:
        print("Sounds good!\n")

    sp.call(['mkdir', 'gridsearch_runs/' + this_run])

    # CHECK FOR REUSE
    """This is a little bit janky looking but makes sense. Since we are
    treating the two disks as independent, then if, in one run, we find good
    fits (no edge values), then it doesn't make sense to run that grid again;
    it would be better to just grab the relevant information from that run
    and only fit the disk that needs fitting. That's what this is for."""
    to_skip = ''
    if use_a_previous_result:
        response2 = input('Please enter the path to the .fits file to use '
                          'from a previous run (should be '
                          './models/date/run_date/datefitted_[A/B].fits)\n')
        if 'A' in response2:
            to_skip = 'fitted_A'
        elif 'B' in response2:
            to_skip = 'fitted_B'
        else:
            print("Bad path; must have 'fitted_A' or 'fitted_B' in it. "
                  "Try again.")
            return

    # STARTING THE RUN #
    # Make the initial static model (B), just with the first parameter values
    dBInit = {p: diskBParams[p][0] for p in diskBParams}

    # Grid search over Disk A, retrieve the resulting pd.DataFrame.
    # (If a previous fit is being reused, df_A_fit would still need to be
    # loaded from that run; that path isn't implemented here.)
    if to_skip != 'fitted_A':
        df_A_fit = gridSearch(diskAParams,
                              dBInit,
                              mol,
                              0,
                              modelPath,
                              n,
                              cut_central_chans=cut_central_chans)

    # Find where the chi2 is minimized and save it
    idx_of_BF_A = df_A_fit.index[df_A_fit['Reduced Chi2'] == np.min(
        df_A_fit['Reduced Chi2'])][0]
    print("Index of Best Fit, A is ", idx_of_BF_A)

    # Make a list of those parameters to pass the next round of grid searching.
    fit_A_params = {}
    for param in df_A_fit.columns:
        fit_A_params[param] = df_A_fit[param][idx_of_BF_A]

    print("First disk has been fit\n")

    # Now search over the other disk
    df_B_fit = gridSearch(diskBParams,
                          fit_A_params,
                          mol,
                          1,
                          modelPath,
                          n,
                          steps_so_far=na,
                          cut_central_chans=cut_central_chans)

    idx_of_BF_B = df_B_fit.index[df_B_fit['Reduced Chi2'] == np.min(
        df_B_fit['Reduced Chi2'])][0]

    fit_B_params = {}
    for param in df_B_fit.columns:
        fit_B_params[param] = df_B_fit[param][idx_of_BF_B]

    # Bind the data frames, output them.
    # Reiterated in tools.py/depickler(), but we can unwrap these vals with:
    # full_log.loc['A', :] to get all the columns for disk A, or
    # full_log[:, 'Incl.'] to see which inclinations both disks tried.
    full_log = pd.concat([df_A_fit, df_B_fit], keys=['A', 'B'], names=['Disk'])
    # Pickle the step log df.
    pickle.dump(full_log, open('{}_step-log.pickle'.format(modelPath), "wb"))
    # To read the pickle:
    # f = pickle.load(open('{}_step-log.pickle'.format(modelPath), "rb"))

    # Finally, Create the final best-fit model and residuals
    print("\n\nCreating best fit model now")
    sample_model_in_uvplane(modelPath + '_bestFit', mol=mol)
    sample_model_in_uvplane(modelPath + '_bestFit', option='subtract', mol=mol)
    icr(modelPath + '_bestFit', mol=mol)
    icr(modelPath + '_bestFit_resid', mol=mol)
    print("Best-fit model created: " + modelPath + "_bestFit.im\n\n")

    # Calculate and present the final X2 values.
    finalX2s = chiSq(modelPath + '_bestFit', mol)
    print("Final Raw Chi-Squared Value: ", finalX2s[0])
    print("Final Reduced Chi-Squared Value: ", finalX2s[1])

    # Clock out
    t1 = time.time()
    t_total = (t1 - t0) / 60
    # n+4 to account for best-fit model making and static disks in grid search
    t_per = str(t_total / (n + 4))

    with open(modelPath + '_stepDurations.csv', 'w') as f:
        wr = csv.writer(f)
        wr.writerows(times)

    print("\n\nFinal run duration was", t_total / 60, ' hours')
    print('with each step taking on average', t_per, ' minutes')

    # log file w/ best fit vals, range queried, indices of best vals, best chi2
    with open(modelPath + '_summary.log', 'w') as f:
        s0 = '\nLOG FOR RUN ON ' + today + ' FOR THE ' + mol + ' LINE'
        s1 = '\nBest Chi-Squared values [raw, reduced]:\n' + str(finalX2s)
        s2 = '\n\n\nParameter ranges queried:\n'
        s3 = '\nDisk A:\n'
        for p in diskAParams:
            s3 += p + ': ' + str(diskAParams[p]) + '\n'
        s4 = '\nDisk B:\n'
        for p in diskBParams:
            s4 += p + ': ' + str(diskBParams[p]) + '\n'
        s5 = '\n\n\nBest-fit values (Tatm, Tqq, Xmol, outerR, PA, Incl):'
        s6 = '\nDisk A:\n' + str(fit_A_params)
        s7 = '\nDisk B:\n' + str(fit_B_params)
        s8 = '\n\n\nFinal run duration was ' + str(t_total / 60) + ' hours'
        s9 = '\nwith each step taking on average ' + t_per + ' minutes'
        s10 = '\n\nData file used was ' + dataPath
        s = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
        f.write(s)

    run = GridSearch_Run(modelPath, save_all_plots=True)
    print("Successfully finished everything.")
def fullRun(diskAParams, diskBParams, mol, use_a_previous_result=False):
    """Run it all.

    diskXParams are fed in from full_run.py,
    where the parameter selections are made.
    """
    t0 = time.time()

    # Calculate the number of steps and consequent runtime
    na = 1
    for a in diskAParams:
        na *= len(a)

    nb = 1
    for b in diskBParams:
        nb *= len(b)

    n, dt = na + nb, 2.1
    t = n * dt
    t = str(n * dt) + " minutes." if t < 60 else str(n * dt / 60) + " hours."

    # Begin setting up symlink and get directory paths lined up
    scratch_home = '/scratch/jonas/'
    this_run_basename = today

    run_counter = 2
    this_run = this_run_basename
    while already_exists_old(scratch_home + this_run):
        this_run = this_run_basename + '-' + str(run_counter)
        run_counter += 1

    # Cool. Now we know where we're symlinking to.
    scratch_dir = scratch_home + this_run
    modelPath = './models/' + this_run + '/' + this_run

    # Parameter Check:
    print "This run will iterate through these parameters for Disk A:"
    print diskAParams
    print "\nAnd these values for Disk B:\n", diskBParams
    print "\nThis run will take", n, "steps, spanning about", t
    print "\nOutput will be in", modelPath, '\n'
    response = raw_input(
        'Sound good? (Enter to begin, anything else to stop)\n')
    if response != "":
        return "\nGo fix whatever you don't like and try again.\n\n"
    else:
        print "Sounds good!\n"

    # Make the symlink:
    sp.call(['mkdir', scratch_dir])
    sp.call(['ln', '-s', scratch_dir, './models/'])

    # CHECK FOR REUSE
    """This is a little bit janky looking but makes sense. Since we are
    treating the two disks as independent, then if, in one run, we find good
    fits (no edge values), then it doesn't make sense to run that grid again;
    it would be better to just grab the relevant information from that run
    and only fit the disk that needs fitting. That's what this is for."""
    to_skip = ''
    if use_a_previous_result:
        response2 = raw_input('Please enter the path to the .fits file to '
                              'use from a previous run (should be '
                              './models/date/run_date/datefitted_[A/B].fits)\n')
        if 'A' in response2:
            to_skip = 'fitted_A'
        elif 'B' in response2:
            to_skip = 'fitted_B'
        else:
            print "Bad path; must have 'fitted_A' or 'fitted_B' in it. Try again."
            return

    # STARTING THE RUN #
    # Make the initial static model (B), just with the first parameter values
    dBInit = [p[0] for p in diskBParams]

    # Grid search over Disk A, retrieve the resulting pd.DataFrame
    if to_skip != 'fitted_A':
        df_A_fit = gridSearch(diskAParams,
                              dBInit,
                              0,
                              modelPath,
                              n,
                              cut_central_chans=False)

    # Find where the chi2 is minimized and save it
    idx_of_BF_A = df_A_fit.index[df_A_fit['Reduced Chi2'] == np.min(
        df_A_fit['Reduced Chi2'])][0]
    print "Index of Best Fit, A is ", idx_of_BF_A

    # Make a list of those parameters to pass the next round of grid searching.
    Ps_A = [
        df_A_fit['Atms Temp'][idx_of_BF_A],
        df_A_fit['Temp Struct'][idx_of_BF_A],
        df_A_fit['Molecular Abundance'][idx_of_BF_A],
        df_A_fit['Outer Radius'][idx_of_BF_A],
        df_A_fit['Pos. Angle'][idx_of_BF_A], df_A_fit['Incl.'][idx_of_BF_A],
        df_A_fit['Offset X'][idx_of_BF_A], df_A_fit['Offset Y'][idx_of_BF_A],
        df_A_fit['Systemic Velocity'][idx_of_BF_A]
    ]
    fit_A_params = np.array(Ps_A)

    print "First disk has been fit\n"

    # Now search over the other disk
    df_B_fit = gridSearch(diskBParams,
                          fit_A_params,
                          1,
                          modelPath,
                          n,
                          steps_so_far=na,
                          cut_central_chans=False)

    idx_of_BF_B = df_B_fit.index[df_B_fit['Reduced Chi2'] == np.min(
        df_B_fit['Reduced Chi2'])][0]

    Ps_B = [
        df_B_fit['Atms Temp'][idx_of_BF_B],
        df_B_fit['Temp Struct'][idx_of_BF_B],
        df_B_fit['Molecular Abundance'][idx_of_BF_B],
        df_B_fit['Outer Radius'][idx_of_BF_B],
        df_B_fit['Pos. Angle'][idx_of_BF_B], df_B_fit['Incl.'][idx_of_BF_B],
        df_B_fit['Offset X'][idx_of_BF_B], df_B_fit['Offset Y'][idx_of_BF_B],
        df_B_fit['Systemic Velocity'][idx_of_BF_B]
    ]
    fit_B_params = np.array(Ps_B)

    # Bind the data frames, output them.
    # Reiterated in tools.py/depickler(), but we can unwrap these vals with:
    # full_log.loc['A', :] to get all the columns for disk A, or
    # full_log[:, 'Incl.'] to see which inclinations both disks tried.
    full_log = pd.concat([df_A_fit, df_B_fit], keys=['A', 'B'], names=['Disk'])
    # Pickle the step log df.
    pickle.dump(full_log, open('{}_step-log.pickle'.format(modelPath), "wb"))
    # To read the pickle:
    # f = pickle.load(open('{}_step-log.pickle'.format(modelPath), "rb"))

    # Finally, Create the final best-fit model.
    print "\n\nCreating best fit model now"
    sample_model_in_uvplane(modelPath + '_bestFit', mol=mol)
    icr(modelPath + '_bestFit', mol=mol)
    print "Best-fit model created: " + modelPath + "_bestFit.im\n\n"

    # Calculate and present the final X2 values.
    finalX2s = chiSq(modelPath + '_bestFit')
    print "Final Raw Chi-Squared Value: ", finalX2s[0]
    print "Final Reduced Chi-Squared Value: ", finalX2s[1]

    # Clock out
    t1 = time.time()
    t_total = (t1 - t0) / 60
    # n+4 to account for best-fit model making and static disks in grid search
    t_per = str(t_total / (n + 4))

    with open(modelPath + '_stepDurations.csv', 'w') as f:
        wr = csv.writer(f)
        wr.writerows(times)

    print "\n\nFinal run duration was", t_total / 60, ' hours'
    print 'with each step taking on average', t_per, ' minutes'

    # log file w/ best fit vals, range queried, indices of best vals, best chi2
    with open('run_' + today + '_summary.log', 'w') as f:
        s0 = '\nLOG FOR RUN ON ' + today + ' FOR THE ' + mol + ' LINE'
        s1 = '\nBest Chi-Squared values [raw, reduced]:\n' + str(finalX2s)
        s2 = '\n\n\nParameter ranges queried:\n'
        s3 = '\nDisk A:\n'
        for i, ps in enumerate(diskAParams):
            s3 = s3 + param_names[i] + str(ps) + '\n'
        s4 = '\nDisk B:\n'
        for i, ps in enumerate(diskBParams):
            s4 = s4 + param_names[i] + str(ps) + '\n'
        s5 = '\n\n\nBest-fit values (Tatm, Tqq, Xmol, outerR, PA, Incl):'
        s6 = '\nDisk A:\n' + str(fit_A_params)
        s7 = '\nDisk B:\n' + str(fit_B_params)
        s8 = '\n\n\nFinal run duration was ' + str(t_total / 60) + ' hours'
        s9 = '\nwith each step taking on average ' + t_per + ' minutes'
        s10 = '\n\nData file used was ' + dataPath
        s = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
        f.write(s)

    plot_gridSearch_log(modelPath, show=False)
    plot_step_duration(modelPath, show=False)
    plot_fits(modelPath + '_bestFit.fits', show=False)
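
# The docstrings above say diskAParams and diskBParams are fed in from
# full_run.py. A minimal sketch of such a driver for the newer, dict-keyed
# fullRun; every grid value below is illustrative, not the project's real
# search ranges.
import numpy as np

if __name__ == '__main__':
    diskAParams = {'Atms Temp': np.arange(100, 500, 100),
                   'Outer Radius': np.arange(200, 500, 100)}
    diskBParams = {'Atms Temp': np.arange(100, 500, 100),
                   'Outer Radius': np.arange(200, 500, 100)}
    fullRun(diskAParams, diskBParams, 'hco',
            use_a_previous_result=False, cut_central_chans=False)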