def update_holding_list(new_entry, processed_list):
    """
    update <house_keeping>/keep_entry list
    Input:  new_entry      --- a list of obsids submitted for processing
            processed_list --- a list of obsids actually processed
    Output: <house_keeping>/keep_entry list (overwritten; emptied when
            every obsid was processed)
    """
#
#--- find whether any of the obsids were not processed
#
    missing = mcf.find_missing_elem(new_entry, processed_list)

    ofile = house_keeping + 'keep_entry'        # avoid shadowing builtin 'file'
#
#--- the file is always (re)opened in 'w' mode so that a fully processed
#--- run truncates keep_entry to empty; 'with' guarantees the handle closes
#
    with open(ofile, 'w') as f:
        if len(missing) > 0:
#
#--- if so, print them out (one obsid per line, duplicates removed)
#
            missing = mcf.removeDuplicate(missing, chk=0)
            for ent in missing:
                f.write(ent)
                f.write('\n')
def get_amp_avg_data(new_entry):
    """
    extract amp_avg information from a fits file for a given obsid
    Input:  new_entry      --- a list of obsids
    Output: amp_data_list  --- a list of avg_amp lines kept in the house_keeping dir
                               (format: 2013-10-27T06:11:52 0.218615253807107 53275)
            processed_list --- a list of obsids which were actually used to
                               generate avg_amp
    """
#
#--- remove duplicated entries of obsid
#
    new_entry = mcf.removeDuplicate(new_entry, chk=0)

    processed_list = []
    amp_data_list = []
    for obsid in new_entry:
#
#--- extract fits file(s)
#
        fits_list = extract_stat_fits_file(obsid, out_dir=temp_dir)
        for fits in fits_list:
#
#--- read header entry; bug fix: str.strip() returns a new string ---
#--- the old code called date.strip() and discarded the result
#
            dout = pyfits.open(fits)
            date = dout[0].header['DATE-OBS'].strip()
            dout.close()                     # close the HDU list (was leaked)
#
#--- extract column data for ccd_id and drop_amp
#
            data = pyfits.getdata(fits, 1)
            ccdid = data.field('ccd_id')
            drop_amp = data.field('drop_amp')

            amp_data = []
            total = 0                        # renamed: 'sum' shadowed the builtin
            for i in range(0, len(ccdid)):
#
#--- amp data is computed only from ccd 7 drop_amp
#
                if int(ccdid[i]) == 7:
                    val = float(drop_amp[i])
                    amp_data.append(val)
                    total += val

            if len(amp_data) > 0:
#
#--- 0.00323 is given by cgrant (03/07/05)
#
                norm_avg = 0.00323 * total / float(len(amp_data))
                line = date + '\t' + str(norm_avg) + '\t' + str(obsid) + '\n'
            else:
#
#--- no ccd 7 rows: record a sentinel value instead of an average
#
                line = date + '\t' + '999999' + '\t' + str(obsid) + '\n'

            processed_list.append(obsid)
            amp_data_list.append(line)

    return [processed_list, amp_data_list]
def cleanUp(cdir):
    """
    sort and remove duplicated lines in all files in a given data directory
    Input:  cdir       --- directory name
    Output: cdir/files --- cleaned up files
    """
    if os.listdir(cdir) != []:
#
#--- list the directory contents into the scratch file, then read it back
#
        cmd = 'ls ' + cdir + '/* > ' + zspace
        os.system(cmd)
        data = mcf.readFile(zspace)
        mcf.rm_file(zspace)

        for ent in data:                    # renamed: 'file' shadowed the builtin
#
#--- avoid html and png files: skip any name containing a '.'
#--- (raw string so '\.' is a literal dot, not an invalid escape)
#
            m = re.search(r'\.', ent)
            if m is None:
                mcf.removeDuplicate(ent, chk=1, dosort=1)
#
#--- NOTE(review): this is a verbatim duplicate of the cleanUp defined just
#--- above; at import time this second definition silently replaces the first.
#--- Kept (with the same fixes) to preserve file behavior; one copy should be
#--- deleted once callers are confirmed.
#
def cleanUp(cdir):
    """
    sort and remove duplicated lines in all files in a given data directory
    Input:  cdir       --- directory name
    Output: cdir/files --- cleaned up files
    """
    if os.listdir(cdir) != []:
#
#--- list the directory contents into the scratch file, then read it back
#
        cmd = 'ls ' + cdir + '/* > ' + zspace
        os.system(cmd)
        data = mcf.readFile(zspace)
        mcf.rm_file(zspace)

        for ent in data:                    # renamed: 'file' shadowed the builtin
#
#--- avoid html and png files: skip any name containing a '.'
#--- (raw string so '\.' is a literal dot, not an invalid escape)
#
            m = re.search(r'\.', ent)
            if m is None:
                mcf.removeDuplicate(ent, chk=1, dosort=1)
def clean_cti_data_table(dir):
    """
    remove data points which are extreme outliers and then clean up the
    output data tables
    Input:  dir --- the directory (under data_dir) where the data files are kept
    Output: updated data files in the directory <dir>
            <dir>/dropped_data    --- log of the dropped data lines
            <dir>/bad_data_obsid  --- list of obsids of the dropped data
    """
    dropped = data_dir + dir + '/dropped_data'
    dropped_obsids = []

    with open(dropped, 'w') as fo:
        for elm in elm_list:
            fo.write('ELM: ' + elm + '\n')

            for ccd in range(0, 10):
#
#--- drop_factor sets the boundary of the outlier: how many sigma away?
#--- (back-illuminated ccds 5 and 7 get a looser cut)
#
                if ccd == 5 or ccd == 7:
                    drop_factor = 5.0
                else:
                    drop_factor = 4.0
#
#--- check that the input file exists and is not empty
#
                dname = data_dir + dir + '/' + elm + '_ccd' + str(ccd)
                chk = mcf.isFileEmpty(dname)
                if chk > 0:
                    fo.write('CCD: ' + str(ccd) + '\n')

                    with open(dname, 'r') as f:
                        data = [line.strip() for line in f.readlines()]
#
#--- separate data into separate array data sets:
#--- columns 0-3 are the four quad cti values, 8 is the time, 10 the obsid
#
                    dcolumns = separate_data(data)
                    cti = ['' for x in range(4)]
                    cti[0] = dcolumns[0]
                    cti[1] = dcolumns[1]
                    cti[2] = dcolumns[2]
                    cti[3] = dcolumns[3]
                    obsid = dcolumns[10]

                    dom = []
                    for ent in dcolumns[8]:
                        time_list = tcnv.dateFormatConAll(ent)
                        dom.append(time_list[7])
#
#--- go around quads
#
                    drop_list = []
                    for i in range(0, 4):
                        fo.write("QUAD" + str(i) + '\n')
#
#--- fit a linear line
#
                        (intc, slope) = linear_fit(dom, cti[i])
#
#--- compute deviations from the fitted line
#
                        sq_sum = 0          # renamed: 'sum' shadowed the builtin
                        diff_save = []
                        for j in range(0, len(dom)):
                            diff = float(cti[i][j]) - (intc + slope * float(dom[j]))
                            diff_save.append(diff)
                            sq_sum += diff * diff
                        sigma = math.sqrt(sq_sum / len(dom))
#
#--- find outliers; NOTE(review): only positive deviations are cut
#--- (one-sided test) --- confirm this asymmetry is intended
#
                        out_val = drop_factor * sigma
                        for j in range(0, len(dom)):
                            if diff_save[j] > out_val:
                                drop_list.append(j)
                                fo.write(data[j])
                                fo.write('\n')
#
#--- clean up the list; removing duplicated entries
#
                    drop_list = mcf.removeDuplicate(drop_list, chk=0)
#
#--- keep only the rows whose index is not in drop_list
#
                    cleaned_data = []
                    for i in range(0, len(dom)):
                        chk = 0
                        for comp in drop_list:
                            if i == comp:
                                chk = 1
                                break
                        if chk == 0:
                            cleaned_data.append(data[i])

                    cleaned_data = mcf.removeDuplicate(cleaned_data, chk=0)

                    for ent in drop_list:
                        dropped_obsids.append(obsid[ent])
#
#--- overwrite the data file with the cleaned rows
#
                    with open(dname, 'w') as f:
                        for ent in cleaned_data:
                            f.write(ent)
                            f.write('\n')
#
#--- record the obsids of the dropped rows
#--- (bug fix: this handle was never closed in the original)
#
    dropped_obsids = mcf.removeDuplicate(dropped_obsids, chk=0)
    out = data_dir + dir + '/bad_data_obsid'
    with open(out, 'w') as f:
        for ent in dropped_obsids:
            f.write(ent)
            f.write('\n')
def cleanup_amp_list():
    """
    remove duplicated obsid entries: keep the newest entry only
    Input:  read from: <house_keeping>/amp_avg_list
    Output: updated <house_keeping>/amp_avg_list
    """
    ifile = house_keeping + 'amp_avg_list'  # avoid shadowing builtin 'file'
    with open(ifile, 'r') as f:
        data = [line.strip() for line in f.readlines()]
#
#--- nothing to clean in an empty file (also avoids obsidlist[0] IndexError)
#
    if len(data) == 0:
        return
#
#--- reverse the list so that we can check from the newest entry
#
    data.reverse()
#
#--- find out which obsids are listed multiple times
#--- (the obsid is the third column of each line)
#
    obsidlist = []
    for ent in data:
        atemp = re.split(r'\s+|\t+', ent)
        obsidlist.append(int(atemp[2]))

    obsidlist.sort()
    obsidmulti = []
    comp = obsidlist[0]
    for i in range(1, len(obsidlist)):
        if comp == obsidlist[i]:
            obsidmulti.append(obsidlist[i])
        else:
            comp = obsidlist[i]
#
#--- if there are multiple obsid entries, keep the newest one and remove older ones
#
    cleaned = []
    if len(obsidmulti) > 0:
        obsidmulti = mcf.removeDuplicate(obsidmulti)
#
#--- "marked" indicates whether a specific obsid is already listed;
#--- bug fix: the original assigned marked[i] before 'marked' existed
#--- (NameError whenever duplicates were found)
#
        marked = [0] * len(obsidmulti)

        for ent in data:
            atemp = re.split(r'\s+', ent)
            obsid = int(atemp[2])
            chk = 0
            for i in range(0, len(obsidmulti)):
                if (obsid == obsidmulti[i]) and (marked[i] == 0):
                    marked[i] = 1           # first (newest) occurrence: keep it
                    break
                elif (obsid == obsidmulti[i]) and (marked[i] > 0):
                    chk = 1                 # already kept a newer one: drop
                    break
            if chk == 0:
                cleaned.append(ent)
    else:
        cleaned = data
#
#--- reverse back to the original order
#
    cleaned.reverse()
#
#--- print out the cleaned list
#
    with open(ifile, 'w') as f:
        for ent in cleaned:
            f.write(ent)
            f.write('\n')