Пример #1
0
def sd2drf(local_dir, remote_dir, filetype='png', keepfolder=False):
    '''function: Upload the files under one local folder (including all files
    under subfolders) that are not yet present in the specified remote folder.
    input:
        local_dir: local directory; a leading '/' is added when it is missing
        remote_dir: remote directory, the folder in the student drifters;
                    a leading '/' is added when it is missing
        filetype: extension(s) to upload, without the dot. It is tested with
                  `in`, so a string or a container of strings both work.
                  '**' uploads every file.
        keepfolder: True keeps the subdirectory layout below remote_dir,
                    False puts every file directly into remote_dir
    output:
        0 when there is nothing to upload, otherwise None'''
    # NOTE(review): the FTP host and credentials are hard-coded in source;
    # consider moving them to configuration.
    host, user, password = '66.114.154.52', 'huanxin', '123321'

    if not local_dir.startswith('/'):
        local_dir = '/' + local_dir
    if not remote_dir.startswith('/'):
        remote_dir = '/' + remote_dir

    cdflist = zl.list_all_files(local_dir)
    if filetype == '**':  # upload all files
        files = cdflist
    else:  # keep only the requested file type(s)
        files = []
        for file in cdflist:
            # Take the real extension of the file name: splitting the whole
            # path on '.' picks the wrong token when a directory or the file
            # name holds another dot, and fails when there is no dot at all.
            ext = os.path.splitext(file)[1][1:]
            if ext and ext in filetype:
                files.append(file)

    # The `with` block closes the connection even when the listing fails.
    with ftplib.FTP(host, user, password) as ftp:
        drifterlist = list_ftp_allfiles(remote_dir, ftp)

    if keepfolder:  # keep subdirectory
        # Translate every remote path into the local path it mirrors; only the
        # leading directory is swapped (count=1).
        existing = {
            path.replace(remote_dir, local_dir, 1) for path in drifterlist
        }
        upflist = sorted(set(files) - existing)
        print(len(upflist))
    else:  # just upload files, cancel subfolder
        # Only the bare file names matter when everything lands in one folder.
        existing = {os.path.split(path)[1] for path in drifterlist}
        upflist = [
            file for file in files if os.path.split(file)[1] not in existing
        ]
        print('the number of upload files:' + str(len(upflist)))

    if not upflist:  # nothing to upload
        return 0

    for file in upflist:
        fpath, fname = os.path.split(file)
        if keepfolder:
            remote_path = fpath.replace(local_dir, remote_dir,
                                        1).replace('//', '/')
            remote_dir_file = file.replace(local_dir, remote_dir, 1)
        else:
            remote_path = remote_dir
            remote_dir_file = remote_dir.rstrip('/') + '/' + fname
        # A fresh connection is opened for every file, as the original did.
        with ftplib.FTP(host, user, password) as ftp:
            mkds(remote_path, ftp)  # create the remote folder when missing
            ftp_upload(file, remote_dir_file, ftp)
Пример #2
0
def sd2drf_update(local_dir, remote_dir):
    '''function: upload every file found under the local directory into one
    remote folder of the student drifters, without checking what is already
    there.
    input:
        local_dir: local directory; a leading '/' is added when it is missing
        remote_dir: remote directory, the folder in the student drifters;
                    a leading '/' is added when it is missing
    output:
        0 when there is no file to upload, otherwise None'''
    # make sure both paths start with '/'
    if not local_dir.startswith('/'):
        local_dir = '/' + local_dir
    if not remote_dir.startswith('/'):
        remote_dir = '/' + remote_dir
    # every file path and name below the local directory
    upflist = zl.list_all_files(local_dir)
    print('the number of upload files:' + str(len(upflist)))
    if not upflist:  # there is no file to upload
        return 0
    for file in upflist:
        fname = os.path.split(file)[1]
        # All files land directly in remote_dir, whatever their local folder.
        remote_dir_file = remote_dir.rstrip('/') + '/' + fname
        # One connection per file; `with` closes it even when the upload fails.
        # NOTE(review): host and credentials are hard-coded in source.
        with ftplib.FTP('66.114.154.52', 'huanxin', '123321') as ftp:
            mkds(remote_dir, ftp)  # create the remote folder when missing
            ftp_upload(file, remote_dir_file, ftp)  # upload file
def check_reformat_data(indir,
                        outdir,
                        startt,
                        endt,
                        pstatus,
                        lack_data,
                        rdnf,
                        LSN2='7a',
                        similarity=0.7,
                        mindepth=10,
                        min_minutes=timedelta(minutes=10),
                        percentage_acceptable=0.25):
    """
    Check the raw logger files found under indir and write corrected copies
    under outdir; rejected files are reported through rdm.Write_Text.

    input:
        indir: input directory. Only '.csv' files are used; the 3rd and 4th
            '_'-separated fields of the file name must be the date (YYYYmmdd)
            and time (HHMMSS) of the file.
        outdir: output directory, replaces indir in the path of every file
        startt, endt: datetime limits; a file is processed when the time in
            its name lies within [startt, endt]
        pstatus: telemetry status file, read with rdm.read_telemetrystatus
            (the 'Boat' and 'Vessel#' columns are used)
        lack_data: passed to rdm.Write_Text together with every rejected file
            and the reason of the rejection
        rdnf: tab separated file that includes the VP_NUM, HULL_NUM and
            VESSEL_NAME
        LSN2: the first two letters in lowell_sn, for example: Lowell_SN is
            '7a4c', the LSN2 is '7a'; inserted into file names whose serial
            number has only 2 characters
        similarity: a VESSEL_NAME of rdnf matches the vessel when
            zl.str_similarity_ratio is above this value; the best match wins
        mindepth: depth threshold used by the test-file / logger checks
        min_minutes: minimum time span (timedelta) between first and last record
        percentage_acceptable: fraction of len(data) used by the pressure check
    function:
        check: vessel name, vessel number, serial number, lat, lon
        add VP_NUM
        fix the format of value, below is the right format
        the header like this:
            Probe Type      Lowell
            Serial Number   c572
            Vessel Number   28
            VP_NUM          310473
            Vessel Name     Dawn_T
            Date Format     YYYY-MM-DD
            Time Format     HH24:MI:SS
            Temperature     C
            Depth           m
        the value like this:
            HEADING  Datet(GMT)           Lat        Lon        Temperature(C)  Depth(m)
            DATA     2019-03-30 10:37:00  4002.1266  7006.9986  7.71            0.79
            DATA     2019-03-30 10:38:30  4002.1289  7006.9934  7.76            24.2
            DATA     2019-03-30 10:40:00  4002.1277  7006.9933  7.79            1.20
        a file with no depth above mindepth is reported as a test file;
        a file whose first six depths are all above mindepth is reported as a
        logger issue
        the number of accepted files per boat is written to
        outdir + 'items_number.txt'
    Paths are split and joined with backslash separators, as in the original.
    """
    #Read telemetry status file and raw data name file
    telemetrystatus_df = rdm.read_telemetrystatus(pstatus)
    raw_data_name_df = pd.read_csv(rdnf, sep='\t')
    #dataframe used to count the accepted files per boat, plus a 'Total' row
    total_df = pd.concat([
        telemetrystatus_df.loc[:, ['Boat']][:],
        pd.DataFrame(data=[['Total']], columns=['Boat'])
    ],
                         ignore_index=True)
    total_df.insert(1, 'file_total', 0)
    total_df['Boat'] = total_df['Boat'].map(lambda x: x.replace(' ', '_'))
    #get all the files under the input folder and keep the '.csv' files whose
    #name time is inside the requested interval
    allfile_lists = zl.list_all_files(indir)
    file_lists = []
    for file in allfile_lists:
        # Filter on the extension first, so that other files with a different
        # name layout cannot break the time parsing below.
        if file[len(file) - 4:] != '.csv':
            continue
        fpath, fname = os.path.split(file)  #get the file's path and name
        time_str = fname.split('.')[0].split('_')[2] + ' ' + fname.split(
            '.')[0].split('_')[3]
        time_gmt = datetime.strptime(time_str, "%Y%m%d %H%M%S")
        if startt <= time_gmt <= endt:
            file_lists.append(file)
    #start check the data and save in the output_dir
    for file in file_lists:
        fpath, fname = os.path.split(file)  #get the file's path and name
        #the file name is the last backslash-separated part of the path
        fname = file.split('\\')[len(file.split('\\')) - 1]
        if len(fname.split('_')
               [1]) == 2:  # if the serial number is only 2 digits make it 4
            new_fname = fname[:3] + LSN2 + fname[3:]
        else:
            new_fname = fname
        #read header and data
        try:
            df_head = zl.nrows_len_to(file, 2, name=['key', 'value'])
            df = zl.skip_len_to(file, 2)  #data
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            print("worthless file:" + file)
            continue
        #the vessel name is the second to last part of the folder path
        vessel_name = fpath.split('\\')[-2:-1][0]
        #check the format of the data
        if len(df.iloc[0]
               ) == 5:  # some files lack the "DATA" in the first column
            df.insert(0, 'HEADING', 'DATA')
        df.columns = [
            'HEADING', 'Datet(GMT)', 'Lat', 'Lon', 'Temperature(C)', 'Depth(m)'
        ]  #rename the columns of the data
        df['Depth(m)'] = df['Depth(m)'].map(
            lambda x: '{0:.2f}'.format(float(x)))  #keep two decimal fraction
        #pressure check on the differences between consecutive depths
        dfs = df['Depth(m)'].map(
            lambda x: float(x))  #change type of str to float
        diffs = np.diff(dfs)
        u, c = np.unique(diffs, return_counts=True)
        # NOTE(review): len(c[np.where(u == 0)]) is 1 when at least one zero
        # difference exists and 0 otherwise - it is not the number of zero
        # differences. If the count was intended, c[u == 0].sum() gives it.
        # Left unchanged because it changes which files are rejected - confirm.
        if len(c[np.where(u == 0)]) > len(df) * percentage_acceptable:
            print('NOTE: pressure problem in ' + fname + ' from ' +
                  vessel_name)
            rdm.Write_Text(lack_data, file, reason='pressure problem')
            continue
        #reject the files that cover less than min_minutes
        dts = pd.to_datetime(df['Datet(GMT)'])
        total_diffs = dts[len(dts) - 1] - dts[0]
        if total_diffs < min_minutes:
            print('NOTE: Haul less than 10 minutes for ' + fname + ' from ' +
                  vessel_name)
            rdm.Write_Text(lack_data,
                           file,
                           reason='bad data! time not more than 10 minutes'
                           )  #record the name of the file that has a problem
            continue
        datacheck, count = 1, 0
        for i in range(
                len(df)):  #the value of count is 0 if the data is test data
            count = count + (float(df['Depth(m)'][i]) > mindepth
                             )  # keep track of # of depths>mindepth
            if count > 5:
                # count == i + 1 means every record so far was deeper than
                # mindepth, i.e. the logger was already recording at depth
                if count == i + 1:
                    print('please change the file:' + file +
                          ' make sure the logger is work well!')
                    datacheck = 0
                break
        if datacheck == 0:
            print(vessel_name + ':logger have issue:' + file)
            rdm.Write_Text(lack_data, file, reason='logger have issue'
                           )  #record the name of the file that has a problem
            continue
        if count == 0:  #if the file is test file,print it
            print("test file:" + file)
            rdm.Write_Text(lack_data, file, reason="test file")
            continue
        try:
            df['Temperature(C)'] = df['Temperature(C)'].map(
                lambda x: '{0:.2f}'.format(float(x))
            )  #keep two decimal fraction
            #pass lat and lon through rdm.format_lat_lon, keep four decimals
            df['Lon'] = df['Lon'].map(
                lambda x: '{0:.4f}'.format(float(rdm.format_lat_lon(x))))
            df['Lat'] = df['Lat'].map(lambda x: '{0:.4f}'.format(
                float(rdm.format_lat_lon(x))))
        except Exception:
            rdm.Write_Text(lack_data, file, reason='data is not enough')
            continue
        #find the row of rdnf whose VESSEL_NAME is the best match above the
        #similarity threshold. The best ratio is tracked across the whole loop
        #and reset for every file, so a match of a previous file is never reused.
        loc_vp_file = None
        ratio_best = 0
        for i in raw_data_name_df.index:
            ratio = zl.str_similarity_ratio(
                vessel_name.lower(),
                raw_data_name_df['VESSEL_NAME'][i].lower())
            if ratio > similarity and ratio > ratio_best:
                ratio_best = ratio
                loc_vp_file = i
        if loc_vp_file is None:
            # Without a match there is no VP_NUM to write; report the file
            # like the other rejected files instead of failing on an unset
            # index or writing the number of another vessel.
            print('NOTE: no VP_NUM found for ' + fname + ' from ' +
                  vessel_name)
            rdm.Write_Text(lack_data, file, reason='VP_NUM not found')
            continue
        #Check if the header file contains all the information, and if it is wrong, fix it.
        for j in range(len(df_head)):  #check and fix the vessel number
            if df_head['key'][j].lower() == 'Vessel Number'.lower():
                for i in range(len(telemetrystatus_df)):
                    if telemetrystatus_df['Boat'][i].lower(
                    ) == vessel_name.lower():
                        df_head.loc[j, 'value'] = str(
                            telemetrystatus_df['Vessel#'][i])
                        break
                break
        header_file_fixed_key = [
            'Date Format', 'Time Format', 'Temperature', 'Depth'
        ]
        header_file_fixed_value = ['YYYY-MM-DD', 'HH24:MI:SS', 'C', 'm']
        # NOTE(review): EXIST is never reset and is set as soon as one header
        # row does not match, so the header is cut at the first fixed key that
        # is found (or at its end) and the fixed rows are appended after it.
        # This looks like it relies on the fixed keys being the last header
        # rows - confirm before changing; the logic is kept as it was.
        EXIST, loc = 0, 0
        for fixed_t in header_file_fixed_key:
            for k in range(len(df_head['key'])):
                if fixed_t.lower() == df_head['key'][k].lower():
                    break
                else:
                    EXIST = 1
                    count = k + 1
            if EXIST == 1:
                df_head = pd.concat([
                    df_head[:count],
                    pd.DataFrame(data=[[fixed_t, header_file_fixed_value[loc]]
                                       ],
                                 columns=['key', 'value'])
                ],
                                    ignore_index=True)
            loc += 1
        for i in range(len(total_df)
                       ):  #count this file for its vessel
            if total_df['Boat'][i].lower() == vessel_name.lower():
                total_df.loc[i, 'file_total'] = total_df['file_total'][i] + 1
        #if the vessel name and serial number exist, fix their values
        vessel_name_EXIST, S_number_EXIST = 0, 0
        for k in df_head.index:
            if df_head['key'][k].lower() == 'Vessel Name'.lower():
                vessel_name_EXIST = 1
                df_head.loc[k, 'value'] = vessel_name
            if df_head['key'][k].lower() == 'Serial Number'.lower():
                df_head.loc[k, 'value'] = df_head['value'][k].replace(':', '')
                S_number_EXIST = 1
        #add the serial number and the vessel name when they are missing
        if S_number_EXIST == 0:
            df_head = pd.concat([
                df_head[:1],
                pd.DataFrame(data=[['Serial Number',
                                    new_fname.split('_')[1]]],
                             columns=['key', 'value']), df_head[1:]
            ],
                                ignore_index=True)
        if vessel_name_EXIST == 0:
            df_head = pd.concat([
                df_head[:2],
                pd.DataFrame(data=[['Vessel Name', vessel_name]],
                             columns=['key', 'value']), df_head[2:]
            ],
                                ignore_index=True)
        #VP_NUM goes right after the 'Vessel Number' row; when the header has
        #no such row it is appended at the end instead of reusing the position
        #found for a previous file
        loc_vp_header = len(df_head)
        for i in df_head.index:
            if df_head['key'][i].lower() == 'Vessel Number'.lower():
                loc_vp_header = i + 1
                break
        df_head = pd.concat([
            df_head[:loc_vp_header],
            pd.DataFrame(
                data=[['VP_NUM', raw_data_name_df['VP_NUM'][loc_vp_file]]],
                columns=['key', 'value']), df_head[loc_vp_header:]
        ],
                            ignore_index=True)
        #create the path of the new file
        output_path = fpath.replace(indir, outdir)
        if not os.path.exists(
                output_path
        ):  #make the output folder when it does not exist
            os.makedirs(output_path)
        #write the header, then append the data (with its column names) to
        #the same file; no temporary file or shell command is needed
        df_head.to_csv(output_path + '\\' + new_fname, index=0, header=0)
        df.to_csv(output_path + '\\' + new_fname, mode='a', index=0)

    #calculate the total of all files and save the counts as a file
    try:
        last = len(total_df) - 1
        total_df.loc[last, 'file_total'] = total_df['file_total'][
            last] + total_df['file_total'].iloc[:last].sum()
        total_df.to_csv(outdir + '\\items_number.txt', index=0)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        print("no valuable file!")
Пример #4
0
def sd2drf(local_dir, remote_dir, filetype='**', keepfolder=False):
    '''function: Upload the files under one local folder (including all files
    under subfolders) that are not yet present in the specified remote folder.
    input:
        local_dir: local directory; a leading '/' is added when it is missing
        remote_dir: remote directory, the folder in the student drifters;
                    a leading '/' is added when it is missing
        filetype: extension(s) to upload, without the dot. It is tested with
                  `in`, so a string or a container of strings both work.
                  '**' (the default) uploads every file.
        keepfolder: whether to keep the subdirectory layout below remote_dir;
                    True keeps it, False puts every file directly into
                    remote_dir
    output:
        0 when there is nothing to upload, otherwise None'''
    # NOTE(review): the FTP host and credentials are hard-coded in source;
    # consider moving them to configuration.
    host, user, password = '66.114.154.52', 'huanxin', '123321'

    if not local_dir.startswith('/'):
        local_dir = '/' + local_dir
    if not remote_dir.startswith('/'):
        remote_dir = '/' + remote_dir

    cdflist = zl.list_all_files(local_dir)
    if filetype == '**':  # upload all files
        files = cdflist
    else:  # filter out the specified format file
        files = []
        for file in cdflist:
            # Take the real extension of the file name: splitting the whole
            # path on '.' picks the wrong token when a directory or the file
            # name holds another dot, and fails when there is no dot at all.
            ext = os.path.splitext(file)[1][1:]
            if ext and ext in filetype:
                files.append(file)

    # get all file paths in the student drifters; the `with` block closes the
    # connection even when the listing fails
    with ftplib.FTP(host, user, password) as ftp:
        drifterlist = list_ftp_allfiles(remote_dir, ftp)

    if keepfolder:  # keep subdirectory
        # Translate every remote path into the local path it mirrors, to find
        # the files that do not exist remotely; only the leading directory is
        # swapped (count=1).
        existing = {
            path.replace(remote_dir, local_dir, 1) for path in drifterlist
        }
        upflist = sorted(set(files) - existing)
        print(len(upflist))
    else:  # just upload files, cancel subfolder
        # Only the bare file names matter when everything lands in one folder.
        existing = {os.path.split(path)[1] for path in drifterlist}
        upflist = [
            file for file in files if os.path.split(file)[1] not in existing
        ]
        print('the number of upload files:' + str(len(upflist)))

    if not upflist:  # return 0 if there is no file that needs upload
        return 0

    for file in upflist:  # loop over the files that need to be uploaded
        fpath, fname = os.path.split(file)
        if keepfolder:
            # remote folder and remote file that mirror the local ones
            remote_path = fpath.replace(local_dir, remote_dir,
                                        1).replace('//', '/')
            remote_dir_file = file.replace(local_dir, remote_dir, 1)
        else:
            remote_path = remote_dir
            remote_dir_file = remote_dir.rstrip('/') + '/' + fname
        # A fresh connection is opened for every file, as the original did.
        with ftplib.FTP(host, user, password) as ftp:
            mkds(remote_path, ftp)  # create the remote folder when missing
            ftp_upload(file, remote_dir_file, ftp)  # upload file
Пример #5
0
                                          'lon','lat','dum1','dum2','depth','rangedepth','timerange','temp','stdtemp','year'])
#screen out the valuable data of telemetry in interval
valuable_tele_df = pd.DataFrame(
    data=None,
    columns=['vessel_n', 'esn', 'time', 'lon', 'lat', 'depth',
             'temp'])  #use to save the data during start time and end time
for i in range(len(tele_df)):
    tele_time=datetime.datetime.strptime(str(tele_df['year'].iloc[i])+'-'+str(tele_df['month'].iloc[i])+'-'+str(tele_df['day'].iloc[i])+' '+\
                                         str(tele_df['Hours'].iloc[i])+':'+str(tele_df['minates'].iloc[i])+':'+'00','%Y-%m-%d %H:%M:%S')
    if zl.local2utc(start_time_local) <= tele_time < zl.local2utc(
            end_time_local):
        valuable_tele_df=valuable_tele_df.append(pd.DataFrame(data=[[tele_df['vessel_n'][i],tele_df['esn'][i],tele_time,tele_df['lon'][i],tele_df['lat'][i],tele_df['depth'][i],\
                                                       tele_df['temp'][i]]],columns=['vessel_n','esn','time','lon','lat','depth','temp']))
valuable_tele_df.index = range(len(valuable_tele_df))
#get the path and name of the file that need to match
allfile_lists = zl.list_all_files(input_directory)
######################
file_lists = []
for file in allfile_lists:
    if file[len(file) - 4:] == '.csv':
        file_lists.append(file)
#whether the data of file and telemetry is exist
if len(valuable_tele_df) == 0 and len(file_lists) == 0:
    print(
        'please check the data website of telementry and the directory of raw_data is exist!'
    )
elif len(valuable_tele_df) == 0:
    print('please check the data website of telementry!')
elif len(file_lists) == 0:
    print('please check the directory raw_data is exist!')
else:
        os.makedirs(Hours_save +
                    vessel_lists[i].split('/')[6].split('_hours')[0] + '/')
    plt.savefig(
        os.path.join(Hours_save +
                     vessel_lists[i].split('/')[6].split('_hours')[0] + '/') +
        vessel_lists[i].split('/')[6].split('_hours')[0] + '_hours.ps',
        dpi=dpi,
        orientation='landscape')
    plt.savefig(
        os.path.join(Hours_save +
                     vessel_lists[i].split('/')[6].split('_hours')[0] + '/') +
        vessel_lists[i].split('/')[6].split('_hours')[0] + '_hours.png',
        dpi=dpi,
        orientation='portait')
    plt.show()


#main
hours_lists = zl.list_all_files(Hours_save)
vessel_lists = []  #store the path of every vessel's file
#collect every vessel's '*hours.csv' file first
for file in hours_lists:
    if file.endswith('hours.csv'):
        vessel_lists.append(file)
#then plot every vessel exactly once. The plotting loop used to sit inside the
#collecting loop, which re-read and re-plotted all the vessels found so far
#each time another file was added.
#NOTE(review): the loop name `i` is kept on purpose - the plotting code visible
#above indexes vessel_lists[i]; confirm whether plot() reads this module-level name.
for i in range(len(vessel_lists)):
    vessel_df = pd.read_csv(vessel_lists[i])
    plot(vessel_lists=vessel_lists,
         Hours_save=Hours_save,
         new_df=vessel_df,
         dpi=300)
Пример #7
0
import os
import pandas as pd
import zlconversions as zl
from datetime import datetime, timedelta
from pylab import mean, std

#Hard-coded settings
input_dir = '/home/jmanning/leizhao/programe/raw_data_match/result/checked/'
Hours_save = '/home/jmanning/Mingchao/result/Hours_data/'
#the time window ends now (naive UTC) and starts 170 days earlier
end_time = datetime.utcnow()
start_time = end_time - timedelta(days=170)

#main
allfile_lists = zl.list_all_files(input_dir)
hoursfile_lists = zl.list_all_files(Hours_save)
#keep only the '.csv' raw files of every vessel
file_lists = []
for file in allfile_lists:
    if not file.endswith('.csv'):
        continue
    file_lists.append(file)
try:
    for file in file_lists:  # loop raw files
        fpath, fname = os.path.split(file)  #get the file's path and name
        time_str = fname.split('.')[0].split('_')[2] + ' ' + fname.split(
            '.')[0].split('_')[3]
        #GMT time to local time of file
        time_gmt = datetime.strptime(time_str, "%Y%m%d %H%M%S")
        if time_gmt < start_time or time_gmt > end_time:
            continue
Пример #8
0
def match_tele_raw(
        input_dir,
        path_save,
        telemetry_status,
        start_time,
        end_time,
        telemetry_path='https://www.nefsc.noaa.gov/drifter/emolt.dat',
        accept_minutes_diff=20,
        acceptable_distance_diff=2,
        dpi=300):
    """
    match the file and telementy.
    we can known how many file send to the satallite and output the figure
    """

    #read the file of the telementry_status
    telemetrystatus_df = read_telemetrystatus(telemetry_status)
    #st the record file use to write minmum maxmum and average of depth and temperature,the numbers of file, telemetry and successfully matched
    record_file_df=telemetrystatus_df.loc[:,['Boat','Vessel#']].reindex(columns=['Boat','Vessel#','matched_number','file_number','tele_num','max_diff_depth',\
                                      'min_diff_depth','average_diff_depth','max_diff_temp','min_diff_temp','average_diff_temp','sum_diff_depth','sum_diff_temp',\
                                      'min_lat','max_lat','min_lon','max_lon'],fill_value=None)
    #transfer the time format of string to datetime
    start_time_local = datetime.strptime(start_time, '%Y-%m-%d')
    end_time_local = datetime.strptime(end_time, '%Y-%m-%d')
    allfile_lists = zl.list_all_files(input_dir)
    ######################
    file_lists = []
    for file in allfile_lists:
        if file[len(file) - 4:] == '.csv':
            file_lists.append(file)
    #download the data of telementry
    tele_df = read_telemetry(telemetry_path)
    #screen out the data of telemetry in interval
    valuable_tele_df = pd.DataFrame(
        data=None,
        columns=['vessel_n', 'esn', 'time', 'lon', 'lat', 'depth',
                 'temp'])  #use to save the data during start time and end time
    for i in range(len(tele_df)):
        tele_time=datetime.strptime(str(tele_df['year'].iloc[i])+'-'+str(tele_df['month'].iloc[i])+'-'+str(tele_df['day'].iloc[i])+' '+\
                                         str(tele_df['Hours'].iloc[i])+':'+str(tele_df['minates'].iloc[i])+':'+'00','%Y-%m-%d %H:%M:%S')
        if zl.local2utc(start_time_local) <= tele_time < zl.local2utc(
                end_time_local):
            valuable_tele_df=valuable_tele_df.append(pd.DataFrame(data=[[tele_df['vessel_n'][i],tele_df['esn'][i],tele_time,tele_df['lon'][i],tele_df['lat'][i],tele_df['depth'][i],\
                                                       tele_df['temp'][i]]],columns=['vessel_n','esn','time','lon','lat','depth','temp']))
    valuable_tele_df.index = range(len(valuable_tele_df))
    #whether the data of file and telemetry is exist
    if len(valuable_tele_df) == 0 and len(file_lists) == 0:
        print(
            'please check the data website of telementry and the directory of raw_data is exist!'
        )
        sys.exit()
    elif len(valuable_tele_df) == 0:
        print('please check the data website of telementry!')
        sys.exit()
    elif len(file_lists) == 0:
        print('please check the directory raw_data is exist!')
        sys.exit()
    #match the file
    index = telemetrystatus_df['Boat']  #set the index for dictionary
    raw_dict = {
    }  #the dictinary about raw data, use to write the data about 'time','filename','mean_temp','mean_depth'
    tele_dict = {
    }  #the dictionary about telementry data,use to write the data about'time','mean_temp','mean_depth'
    for i in range(len(index)):  #loop every boat
        raw_dict[index[i]] = pd.DataFrame(data=None,
                                          columns=[
                                              'time', 'filename', 'mean_temp',
                                              'mean_depth', 'mean_lat',
                                              'mean_lon'
                                          ])
        tele_dict[index[i]] = pd.DataFrame(data=None,
                                           columns=[
                                               'time', 'mean_temp',
                                               'mean_depth', 'mean_lat',
                                               'mean_lon'
                                           ])
    for file in file_lists:  # loop raw files
        fpath, fname = os.path.split(file)  #get the file's path and name
        # now, read header and data of every file
        header_df = zl.nrows_len_to(file, 2, name=['key',
                                                   'value'])  #only header
        data_df = zl.skip_len_to(file, 2)  #only data

        #caculate the mean temperature and depth of every file
        value_data_df = data_df.ix[(
            data_df['Depth(m)'] >
            0.85 * mean(data_df['Depth(m)']))]  #filter the data
        value_data_df = value_data_df.ix[
            2:]  #delay several minutes to let temperature sensor record the real bottom temp
        value_data_df=value_data_df.ix[(value_data_df['Temperature(C)']>mean(value_data_df['Temperature(C)'])-3*std(value_data_df['Temperature(C)'])) & \
                   (value_data_df['Temperature(C)']<mean(value_data_df['Temperature(C)'])+3*std(value_data_df['Temperature(C)']))]  #Excluding gross error
        value_data_df.index = range(len(value_data_df))  #reindex
        for i in range(len(value_data_df['Lat'])):
            value_data_df['Lat'][i], value_data_df['Lon'][i] = cv.dm2dd(
                value_data_df['Lat'][i], value_data_df['Lon'][i])
        min_lat = min(value_data_df['Lat'].values)
        max_lat = max(value_data_df['Lat'].values)
        min_lon = min(value_data_df['Lon'].values)
        max_lon = max(value_data_df['Lon'].values)
        mean_lat = str(round(mean(value_data_df['Lat'].values), 4))
        mean_lon = str(round(mean(value_data_df['Lon'].values),
                             4))  #caculate the mean depth
        mean_temp = str(
            round(mean(value_data_df['Temperature(C)'][1:len(value_data_df)]),
                  2))
        mean_depth = str(
            abs(int(round(mean(value_data_df['Depth(m)'].values))))).zfill(
                3)  #caculate the mean depth

        #get the vessel number of every file
        for i in range(len(header_df)):
            if header_df['key'][i].lower() == 'vessel number'.lower():
                vessel_number = int(header_df['value'][i])
                break
        #caculate the number of raw files in every vessel,and min,max of lat and lon
        for i in range(len(record_file_df)):
            if record_file_df['Vessel#'][i] == vessel_number:
                if record_file_df['file_number'].isnull()[i]:
                    record_file_df['min_lat'][i] = min_lat
                    record_file_df['max_lat'][i] = max_lat
                    record_file_df['min_lon'][i] = min_lon
                    record_file_df['max_lon'][i] = max_lon
                    record_file_df['file_number'][i] = 1
                else:
                    record_file_df['file_number'][i] = int(
                        record_file_df['file_number'][i] + 1)
                    if record_file_df['min_lat'][i] > min_lat:
                        record_file_df['min_lat'][i] = min_lat
                    if record_file_df['max_lat'][i] < max_lat:
                        record_file_df['max_lat'][i] = max_lat
                    if record_file_df['min_lon'][i] > min_lon:
                        record_file_df['min_lon'][i] = min_lon
                    if record_file_df['max_lon'][i] < max_lon:
                        record_file_df['max_lon'][i] = max_lon

        #match rawdata and telementry data
        time_str = fname.split('.')[0].split('_')[2] + ' ' + fname.split(
            '.')[0].split('_')[3]
        #GMT time to local time of file
        time_local = zl.gmt_to_eastern(time_str[0:4] + '-' + time_str[4:6] +
                                       '-' + time_str[6:8] + ' ' +
                                       time_str[9:11] + ':' + time_str[11:13] +
                                       ':' + time_str[13:15])
        time_gmt = datetime.strptime(time_str, "%Y%m%d %H%M%S")
        #transfer the format latitude and longitude
        lat, lon = value_data_df['Lat'][
            len(value_data_df) - 1], value_data_df['Lon'][len(value_data_df) -
                                                          1]
        #write the data of raw file to dict
        for i in range(len(telemetrystatus_df)):
            if telemetrystatus_df['Vessel#'][i] == vessel_number:
                raw_dict[telemetrystatus_df['Boat'][i]]=raw_dict[telemetrystatus_df['Boat'][i]].append(pd.DataFrame(data=[[time_local,\
                                    fname,float(mean_temp),float(mean_depth),float(mean_lat),float(mean_lon)]],columns=['time','filename','mean_temp','mean_depth','mean_lat','mean_lon']).iloc[0],ignore_index=True)
        #caculate the numbers of successful matchs and the minimum,maximum and average different of temperature and depth, and write this data to record file
        for i in range(len(valuable_tele_df)):
            if valuable_tele_df['vessel_n'][i].split('_')[1] == str(
                    vessel_number):
                if abs(valuable_tele_df['time'][i] - time_gmt) <= timedelta(
                        minutes=accept_minutes_diff):  #time match
                    if zl.dist(lat1=lat,
                               lon1=lon,
                               lat2=float(valuable_tele_df['lat'][i]),
                               lon2=float(valuable_tele_df['lon'][i])
                               ) <= acceptable_distance_diff:  #distance match
                        for j in range(len(record_file_df)):
                            if record_file_df['Vessel#'][j] == vessel_number:
                                diff_temp = round(
                                    (float(mean_temp) -
                                     float(valuable_tele_df['temp'][i])), 4)
                                diff_depth = round(
                                    (float(mean_depth) -
                                     float(valuable_tele_df['depth'][i])), 4)
                                if record_file_df['matched_number'].isnull(
                                )[j]:
                                    record_file_df['matched_number'][j] = 1
                                    record_file_df['sum_diff_temp'][
                                        j] = diff_temp
                                    record_file_df['max_diff_temp'][
                                        j] = diff_temp
                                    record_file_df['min_diff_temp'][
                                        j] = diff_temp
                                    record_file_df['sum_diff_depth'][
                                        j] = diff_depth
                                    record_file_df['max_diff_depth'][
                                        j] = diff_depth
                                    record_file_df['min_diff_depth'][
                                        j] = diff_depth
                                    break
                                else:
                                    record_file_df['matched_number'][j] = int(
                                        record_file_df['matched_number'][j] +
                                        1)
                                    record_file_df['sum_diff_temp'][
                                        j] = record_file_df['sum_diff_temp'][
                                            j] + diff_temp
                                    record_file_df['sum_diff_depth'][
                                        j] = record_file_df['sum_diff_depth'][
                                            j] + diff_depth
                                    if record_file_df['max_diff_temp'][
                                            j] < diff_temp:
                                        record_file_df['max_diff_temp'][
                                            j] = diff_temp
                                    if record_file_df['min_diff_temp'][
                                            j] > diff_temp:
                                        record_file_df['min_diff_temp'][
                                            j] = diff_temp
                                    if record_file_df['max_diff_depth'][
                                            j] < diff_depth:
                                        record_file_df['max_diff_depth'][
                                            j] = diff_depth
                                    if record_file_df['min_diff_depth'][
                                            j] > diff_depth:
                                        record_file_df['min_diff_depth'][
                                            j] = diff_depth
                                    break

    #write 'time','mean_temp','mean_depth' of the telementry to tele_dict
    for i in range(
            len(valuable_tele_df)
    ):  #valuable_tele_df is the valuable telemetry data during start time and end time
        for j in range(len(telemetrystatus_df)):
            if int(valuable_tele_df['vessel_n'][i].split('_')
                   [1]) == telemetrystatus_df['Vessel#'][j]:
                #count the numbers by boats
                if record_file_df['tele_num'].isnull()[j]:
                    record_file_df['tele_num'][j] = 1
                else:
                    record_file_df['tele_num'][
                        j] = record_file_df['tele_num'][j] + 1
                if record_file_df['max_lat'].isnull()[j]:
                    record_file_df['min_lat'][j] = valuable_tele_df['lat'][i]
                    record_file_df['max_lat'][j] = valuable_tele_df['lat'][i]
                    record_file_df['min_lon'][j] = valuable_tele_df['lon'][i]
                    record_file_df['max_lon'][j] = valuable_tele_df['lon'][i]
                else:
                    if record_file_df['min_lat'][j] > valuable_tele_df['lat'][
                            i]:
                        record_file_df['min_lat'][j] = valuable_tele_df['lat'][
                            i]
                    if record_file_df['max_lat'][j] < valuable_tele_df['lat'][
                            i]:
                        record_file_df['max_lat'][j] = valuable_tele_df['lat'][
                            i]
                    if record_file_df['min_lon'][j] > valuable_tele_df['lon'][
                            i]:
                        record_file_df['min_lon'][j] = valuable_tele_df['lon'][
                            i]
                    if record_file_df['max_lon'][j] < valuable_tele_df['lon'][
                            i]:
                        record_file_df['max_lon'][j] = valuable_tele_df['lon'][
                            i]
                #write 'time','mean_temp','mean_depth' of the telementry to tele_dict
                tele_dict[telemetrystatus_df['Boat'][j]]=tele_dict[telemetrystatus_df['Boat'][j]].append(pd.DataFrame(data=[[valuable_tele_df['time'][i],\
                         float(valuable_tele_df['temp'][i]),float(valuable_tele_df['depth'][i]),float(valuable_tele_df['lat'][i]),float(valuable_tele_df['lon'][i])]],columns=['time','mean_temp','mean_depth','mean_lat','mean_lon']).iloc[0],ignore_index=True)
    print("finish the calculate of min_lat and min_lon!")
    for i in range(len(record_file_df)):
        if not record_file_df['matched_number'].isnull()[i]:
            record_file_df['average_diff_depth'][i] = round(
                record_file_df['sum_diff_depth'][i] /
                record_file_df['matched_number'][i], 4)
            record_file_df['average_diff_temp'][i] = round(
                record_file_df['sum_diff_temp'][i] /
                record_file_df['matched_number'][i], 4)
        else:
            record_file_df['matched_number'][i] = 0
        if record_file_df['tele_num'].isnull()[i]:
            record_file_df['tele_num'][i] = 0
        if record_file_df['file_number'].isnull()[i]:
            record_file_df['file_number'][i] = 0

    for i in index:  #loop every boat,  i represent the name of boat
        raw_dict[i] = raw_dict[i].sort_values(by=['time'])
        raw_dict[i].index = range(len(raw_dict[i]))

    record_file_df = record_file_df.drop(['sum_diff_depth', 'sum_diff_temp'],
                                         axis=1)
    #save the record file
    record_file_df.to_csv(path_save + '/' + start_time + '_' + end_time +
                          ' statistics.csv',
                          index=0)
    return raw_dict, tele_dict, record_file_df, index, start_time_local, end_time_local, path_save
Пример #9
0
def _find_header_row(df_head, key):
    """Return the position of the first header row whose 'key' equals *key*.

    The comparison is case-insensitive. Returns None when no row matches.
    """
    wanted = key.lower()
    for pos, name in enumerate(df_head['key']):
        if str(name).lower() == wanted:
            return pos
    return None


def check_reformat_data(input_dir,
                        output_dir,
                        telemetry_status_file,
                        raw_data_name_file,
                        Lowell_SN_2='7a',
                        similarity=0.7,
                        mindepth=10):
    """Check and repair the raw '.csv' files under input_dir, writing the
    fixed copies to the mirrored location under output_dir.

    For every '.csv' file the function:
      * pads a 2-character serial number in the file name with Lowell_SN_2,
      * normalises the data columns (Lat/Lon to 4 decimals, temperature and
        depth to 2 decimals),
      * skips test files (no depth greater than mindepth, or vessel number 99),
      * repairs the header: vessel number, vessel name, serial number, the
        fixed format/unit rows, and inserts a 'VP_NUM' row after
        'Vessel Number',
      * writes header + data as one file.
    Finally a per-boat file count (plus a 'Total' row) is written to
    output_dir + '/items_number.txt'.

    input:
        input_dir: folder that is searched (via zl.list_all_files) for '.csv'
            files; the name of the folder directly holding a file is used as
            the vessel name
        output_dir: folder that receives the fixed files
        telemetry_status_file: passed to read_telemetrystatus(); the result
            must provide the columns 'Boat' and 'Vessel#'
        raw_data_name_file: tab separated file with the columns
            'VESSEL_NAME' and 'VP_NUM'
        Lowell_SN_2: text inserted after the first 3 characters of the file
            name when the second '_'-separated field has only 2 characters
        similarity: a 'VESSEL_NAME' is only accepted as a match for the vessel
            name when zl.str_similarity_ratio() is greater than this value
        mindepth: a file with no depth greater than this value is a test file
    return:
        None
    """
    #rows every header must contain, and the value used when a row is missing
    header_file_fixed_key = [
        'Date Format', 'Time Format', 'Temperature', 'Depth'
    ]
    header_file_fixed_value = ['YYYY-MM-DD', 'HH24:MI:SS', 'C', 'm']

    #read the file of the vessel_number
    telemetrystatus_df = read_telemetrystatus(telemetry_status_file)
    raw_data_name_df = pd.read_csv(raw_data_name_file, sep='\t')

    #produce a dataframe that is used to count the files of every boat
    total_df = pd.concat([
        telemetrystatus_df.loc[:, ['Boat']],
        pd.DataFrame(data=[['Total']], columns=['Boat'])
    ],
                         ignore_index=True)
    total_df.insert(1, 'file_total', 0)

    #keep only the '.csv' files found under the input folder
    file_lists = [
        file for file in zl.list_all_files(input_dir) if file.endswith('.csv')
    ]

    #start check the data and save in the output_dir
    for file in file_lists:
        fpath, fname = os.path.split(file)  #get the file's path and name
        name_parts = fname.split('_')
        if len(name_parts) < 2:
            #the serial number can not be read from the name
            print("unvaluable file:" + file)
            continue
        if len(name_parts[1]) == 2:
            #if the serial number is only 2 digits make it 4
            new_fname = fname[:3] + Lowell_SN_2 + fname[3:]
        else:
            new_fname = fname

        #now, read header and data
        #NOTE(review): presumably nrows_len_to returns the two-column header
        #rows and skip_len_to the data rows below them - confirm in zl
        try:
            df_head = zl.nrows_len_to(file, 2, name=['key', 'value'])
            df = zl.skip_len_to(file, 2)  #data
        except Exception:
            print("unvaluable file:" + file)
            continue
        #positions are used as labels below, so make them identical
        df_head = df_head.reset_index(drop=True)

        #the standard data have 6 columns, some files lack the first
        #("DATA") column; add it when it is missing
        if df.shape[1] == 5:
            df.insert(0, 'HEADING', 'DATA')
        if df.shape[1] != 6:
            print("unvaluable file:" + file)
            continue
        df.columns = [
            'HEADING', 'Datet(GMT)', 'Lat', 'Lon', 'Temperature(C)', 'Depth(m)'
        ]  #rename the columns of the data

        #keep the lat and lon data format right, with four decimals
        df['Lat'] = df['Lat'].map(format_lat_lon).map(
            lambda x: '{0:.4f}'.format(float(x)))
        df['Lon'] = df['Lon'].map(format_lat_lon).map(
            lambda x: '{0:.4f}'.format(float(x)))
        #keep two decimals for depth and temperature
        df['Depth(m)'] = df['Depth(m)'].map(
            lambda x: '{0:.2f}'.format(float(x)))
        df['Temperature(C)'] = df['Temperature(C)'].map(
            lambda x: '{0:.2f}'.format(float(x)))

        #the file is test data when no (rounded) depth is deeper than mindepth
        is_test = not (df['Depth(m)'].astype(float) > mindepth).any()

        vessel_name = fpath.split('/')[-1]  #get the vessel name
        vessel_lower = vessel_name.lower()

        #check and fix the vessel number
        loc_v_number = _find_header_row(df_head, 'Vessel Number')
        if loc_v_number is None:
            print("unvaluable file:" + file)
            continue
        if is_test:
            #the value of the vessel number is 99 if the data is test data
            df_head.at[loc_v_number, 'value'] = '99'
        else:
            #look the number up by boat name (the folder name)
            for boat, number in zip(telemetrystatus_df['Boat'],
                                    telemetrystatus_df['Vessel#']):
                if str(boat).lower() == vessel_lower:
                    df_head.at[loc_v_number, 'value'] = str(number)
                    break
        if str(df_head.at[loc_v_number, 'value']) == '99':
            print("test file:" + file)  #if the file is test file,print it
            continue

        #append every fixed header row that is missing; rows that already
        #exist are left untouched
        for fixed_key, fixed_value in zip(header_file_fixed_key,
                                          header_file_fixed_value):
            if _find_header_row(df_head, fixed_key) is None:
                df_head = pd.concat([
                    df_head,
                    pd.DataFrame(data=[[fixed_key, fixed_value]],
                                 columns=['key', 'value'])
                ],
                                    ignore_index=True)

        #count the files of every boat
        boat_mask = total_df['Boat'].astype(str).str.lower() == vessel_lower
        total_df.loc[boat_mask, 'file_total'] += 1

        #fix the vessel name and the serial number where they exist
        vessel_name_exist = False
        s_number_exist = False
        for k, key in enumerate(list(df_head['key'])):
            key_lower = str(key).lower()
            if key_lower == 'vessel name':
                vessel_name_exist = True
                df_head.at[k, 'value'] = vessel_name
            if key_lower == 'serial number':
                s_number_exist = True
                serial = df_head.at[k, 'value']
                if isinstance(serial, str):
                    df_head.at[k, 'value'] = serial.replace(':', '')
        #add the serial number (second row) and vessel name (third row) when
        #they are missing
        if not s_number_exist:
            df_head = pd.concat([
                df_head[:1],
                pd.DataFrame(data=[['Serial Number',
                                    new_fname.split('_')[1]]],
                             columns=['key', 'value']), df_head[1:]
            ],
                                ignore_index=True)
        if not vessel_name_exist:
            df_head = pd.concat([
                df_head[:2],
                pd.DataFrame(data=[['Vessel Name', vessel_name]],
                             columns=['key', 'value']), df_head[2:]
            ],
                                ignore_index=True)

        #find the VP_NUM of the most similar vessel name
        ratio_best = 0
        loc_vp_file = None
        for i, candidate in enumerate(raw_data_name_df['VESSEL_NAME']):
            ratio = zl.str_similarity_ratio(vessel_lower,
                                            str(candidate).lower())
            if ratio > similarity and ratio > ratio_best:
                ratio_best = ratio
                loc_vp_file = i
        if loc_vp_file is None:
            #keep the header layout, but leave the value blank
            print("no VP_NUM found for vessel:" + vessel_name)
            vp_num = ''
        else:
            vp_num = raw_data_name_df['VP_NUM'].iloc[loc_vp_file]

        #the rows inserted above may have moved 'Vessel Number', locate it
        #again and put VP_NUM right behind it
        loc_vp_header = _find_header_row(df_head, 'Vessel Number') + 1
        df_head = pd.concat([
            df_head[:loc_vp_header],
            pd.DataFrame(data=[['VP_NUM', vp_num]], columns=['key', 'value']),
            df_head[loc_vp_header:]
        ],
                            ignore_index=True)

        #creat the path of the new file; make it if it does not exist
        output_path = fpath.replace(input_dir, output_dir)
        os.makedirs(output_path, exist_ok=True)
        #write the header followed by the data (with its column names) in one
        #file
        with open(os.path.join(output_path, new_fname), 'w',
                  newline='') as out_file:
            df_head.to_csv(out_file, index=False, header=False)
            df.to_csv(out_file, index=False)

    #the last row ('Total') holds the sum of the files of all boats
    total_df.loc[len(total_df) - 1, 'file_total'] = int(
        total_df['file_total'].iloc[:-1].sum())
    try:
        total_df.to_csv(output_dir + '/items_number.txt', index=0)
    except OSError:
        #output_dir was never created, so no file was written
        print("no valuable file!")