def mergeBst_ch(self,fileName='bst_2018-2019.csv',file_ch = 'bst_ch_2018.txt'):
    '''
    Overwrite lat/lon values of the track CSV with positions read from a text file.

    The text file is consumed record by record: a header line whose third
    whitespace-separated field gives the number of data lines that follow,
    then that many data lines. For every data line whose first field matches
    a value of the CSV's ``time`` column, the first matching CSV row gets its
    ``lat``/``lon`` replaced by fields 2 and 3 of the data line (each
    multiplied by 0.1) when they differ. The number of changed rows is
    printed and the CSV is rewritten in place.

    :param fileName: track CSV, relative to getBasePath('typhoon').
    :param file_ch: text file holding the replacement positions.
    :return: None
    '''
    csv_path = '%s%s' % (getBasePath('typhoon'), fileName)
    data_df = pd.read_csv(csv_path)
    # The 'time' column is never modified below, so the lookup set is built
    # once instead of rebuilding a list for every record.
    known_times = set(data_df['time'].tolist())
    change_idx = 0
    with open('%s%s' % (getBasePath('typhoon'), file_ch), 'r') as f:
        while True:
            tmp = f.readline()
            if len(tmp) == 0:
                break
            aTyphoon = self.splitMultiBlank(str(tmp))
            linesNum = int(aTyphoon[2])
            for _ in range(linesNum):
                aLine = self.splitMultiBlank(str(f.readline()))
                rec_time = int(aLine[0])
                lat = int(aLine[2]) * 0.1
                lon = int(aLine[3]) * 0.1
                if rec_time not in known_times:
                    continue
                # Index *label* of the first row with this time; use .loc
                # (label-based) throughout -- DataFrame.ix no longer exists.
                row_label = data_df.index[data_df['time'] == rec_time][0]
                lat_data = data_df.loc[row_label, 'lat']
                lon_data = data_df.loc[row_label, 'lon']
                if lat_data != lat or lon_data != lon:
                    data_df.loc[row_label, 'lat'] = lat
                    data_df.loc[row_label, 'lon'] = lon
                    change_idx += 1
    print(change_idx)
    data_df.to_csv(csv_path, index=False, header=True)
def dataTime(self, pickList, dataFilename='dir_1KM.txt', res='1KM'):
    '''
    Extract data which are observed when typhoons occur.

    Reads a headerless one-column file of data-file paths, keeps the part of
    each path after the last backslash as ``names``, takes characters 44..57
    of that name as a ``%Y%m%d%H%M%S`` timestamp, and writes the rows whose
    timestamp falls inside any interval of ``pickList`` to
    ``data_pick_<res>.csv`` (columns ``names`` and ``time``).

    :param pickList: the time interval when typhoons occur, which are concluded
        from tracks data. Each item is indexed as (start, end) and used as a
        slice on the datetime index.
    :param dataFilename: raw file which storing all the data file names.
    :param res: resolution
    :return: None
    '''
    data_time_df = pd.read_csv('%s%s' % (getBasePath('typhoon'), dataFilename),
                               header=None)
    data_time_df.columns = ['path']
    # Series.map replaces the deprecated DataFrame.applymap, whose frame-shaped
    # result was being assigned to a single column.
    names = data_time_df['path'].map(lambda x: x[x.rfind('\\') + 1:])
    data_time_df = pd.DataFrame({'names': names})
    data_time_df['time'] = data_time_df['names'].map(lambda x: x[44:58])
    # Convert the timestamp text to datetime and use it as the index.
    data_time_df['date'] = pd.to_datetime(data_time_df['time'],
                                          format='%Y%m%d%H%M%S')
    data_time_df = data_time_df.set_index('date')

    # Collect every interval first and concatenate once.
    picked = [data_time_df[x[0]:x[1]] for x in pickList]
    if picked:
        data_time_df_pick = pd.concat(picked)
    else:
        # No interval given: keep the column layout so the selection below works.
        data_time_df_pick = data_time_df.iloc[0:0]
    print(data_time_df_pick.shape[0])
    data_time_df_pick[['names', 'time']].to_csv(
        '%sdata_pick_%s.csv' % (getBasePath('typhoon'), res),
        index=False, header=True, encoding='utf-8')
def trackTime(self, fileName='bst_2018-2019.csv', start_time='2018-07-01'):
    '''
    Extract the time of tracks data.

    Every unique value of the CSV's ``time`` column gets ``00`` appended and
    is parsed as ``%Y%m%d%H%M%S``. The values on or after ``start_time``,
    sorted ascending, are written without header to ``time_tracks.csv``; the
    complete (unfiltered, unsorted) list is written one per line to
    ``time_tracks.txt`` and returned.

    :param fileName: track CSV, relative to getBasePath('typhoon').
    :param start_time: earliest date kept in ``time_tracks.csv``.
    :return: list of all time strings (with the ``00`` suffix).
    '''
    tracks_df = pd.read_csv('%s%s' % (getBasePath('typhoon'), fileName))
    time_list = ['%s00' % x for x in tracks_df['time'].unique()]

    time_df = pd.DataFrame(data=time_list, columns=['time'])
    # Convert the text to datetime and use it as the index.
    time_df['date'] = pd.to_datetime(time_df['time'], format='%Y%m%d%H%M%S')
    # Sort before slicing: label slices on an unsorted DatetimeIndex can fail
    # when the bound is not an exact member. DataFrame.sort no longer exists;
    # sort_index is the replacement.
    time_df = time_df.set_index('date').sort_index(ascending=True)
    time_df = time_df.loc[start_time:]
    time_df.to_csv('%stime_tracks.csv' % getBasePath('typhoon'),
                   header=False, index=False, encoding='UTF-8')

    with open('%stime_tracks.txt' % getBasePath('typhoon'), 'w',
              encoding='UTF-8') as f:
        f.writelines(itm + '\n' for itm in time_list)
    return time_list
def extractImg(self):
    '''
    Append the pre-computed image features to ``self.df`` as extra columns.

    The train and test feature CSVs are read (first column as index), stacked
    row-wise with a fresh integer index, and concatenated column-wise onto
    ``self.df``.
    '''
    templates = ('%s/data/train-img-feas.csv', '%s/data/test-img-feas.csv')
    parts = [pd.read_csv(tpl % util.getBasePath(), index_col=0)
             for tpl in templates]
    stacked = pd.concat(parts, axis=0, ignore_index=True)
    self.df = pd.concat([self.df, stacked], axis=1)
    print('get images done')
def __init__(self, n_clusters=10, isRelativePath=True):
    '''
    Set up the SIFT extractor, the image path template and the KMeans model.

    :param n_clusters: number of clusters handed to KMeans.
    :param isRelativePath: when true the path template is rooted at
        ``<base>/data``, otherwise at ``<base>/../data``.
    '''
    self.sift_extractor = cv2.xfeatures2d_SIFT.create()
    template_tail = ('/data/profile_images_%s/%s' if isRelativePath
                     else '/../data/profile_images_%s/%s')
    self.path = getBasePath() + template_tail
    self.n_clusters = n_clusters
    self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=3724)
    # Indices (int) of entries whose profile image does not exist.
    self.nullImgInds = []
def __init__(self,resolution,filename,fillValue=999):
    '''
    Read a raw geo file, fill its blanks and save the resulting lat/lon grids.

    :param resolution: like '4KM','2KM'; key into ``config.IMG_SIZE``.
    :param filename: raw file name under getBasePath('data').
    :param fillValue: stored as ``self._fillValue``.
    '''
    self._resolution = resolution
    self._grid_size = config.IMG_SIZE[self._resolution]
    self._fillValue = fillValue

    out_name = self.saveFilename(filename)
    raw_path = '%s/%s' % (getBasePath('data'), filename)
    lat_grid, lon_grid = self.readRaw(raw_path)
    lat_grid = self.fillBlank(lat_grid, 'lat')
    lon_grid = self.fillBlank(lon_grid, 'lon')
    out_path = '%s/%s' % (getBasePath('data'), out_name)
    self.saveGeo(out_path, lat_grid, lon_grid)
def loadGeoData(self, resolution, category='raw'):
    '''
    Load the grid CSV for ``resolution`` as a GeoDataFrame of points.

    :param resolution: inserted into the CSV file name.
    :param category: accepted but not used by this method.
    :return: GeoDataFrame whose geometry column ``Coordinates`` holds one
        Point(lon, lat) per row.
    '''
    csv_path = "%s/FullMask_Grid_%s_999_NULL_%s.csv" % (
        getBasePath(''), resolution, 'lat')
    df = pd.read_csv(csv_path, sep=',')
    lon_lat_pairs = list(zip(df.lon, df.lat))
    df['Coordinates'] = pd.Series(lon_lat_pairs, index=df.index).apply(Point)
    return gpd.GeoDataFrame(df, geometry='Coordinates')
def saveGeo(GEO_LAT, GEO_LON, filename='%stransFormula_IMG_2_GEO' % util.getBasePath('data')):
    '''
    Write the latitude and longitude grids to two CSV files.

    :param GEO_LAT: data written to ``<filename>_<res>_lat.csv``.
    :param GEO_LON: data written to ``<filename>_<res>_lon.csv``.
    :param filename: path prefix of both output files.
    '''
    # NOTE(review): `res` is neither a parameter nor a local of this function;
    # it has to come from an enclosing or module scope -- confirm.
    outputs = (('%s_%s_lat.csv', GEO_LAT), ('%s_%s_lon.csv', GEO_LON))
    for name_fmt, grid in outputs:
        pd.DataFrame(grid).to_csv(name_fmt % (filename, res),
                                  header=True, index=True)
def readBST_Track_file(self,fileName='bst_2018-2019.txt',saveFileName='bst_2018-2019.csv'):
    '''
    Convert a best-track text file into a CSV with columns ``self._columns``.

    The file is consumed record by record: a header line (field 2 = number of
    data lines, field 5 = typhoon id, field 7 = typhoon name) followed by that
    many data lines. Each data line yields one row:
    ``'20' + field0``, field3 * 0.1, field4 * 0.1, id, name, the digits of
    field5, the digits of field6, and six more values. When the digits of
    field6 equal 0 the six values are the placeholders '8',0,0,'8',0,0;
    otherwise they are taken from fields 7-10 (first character / remaining
    digits of fields 7 and 9, and fields 8 and 10 as ints).

    NOTE(review): comments in an earlier revision suggest field5 is the centre
    pressure, field6 the maximum sustained wind speed and fields 7-10 the
    50kt/30kt radii -- confirm against the file format specification.

    :param fileName: input text file, relative to getBasePath('typhoon').
    :param saveFileName: output CSV, relative to getBasePath('typhoon').
    :return: None
    '''
    # Raw string: "\D" in a normal literal is an invalid escape sequence.
    non_digit = re.compile(r"\D")
    itm_list = []
    with open('%s%s' % (getBasePath('typhoon'), fileName), 'rb') as f:
        while True:
            tmp = f.readline()
            if len(tmp) == 0:
                break
            aTyphoon = self.splitMultiBlank(str(tmp))
            linesNum = int(aTyphoon[2])
            aTyphoonID = aTyphoon[5]
            aTyphoonName = aTyphoon[7]
            for _ in range(linesNum):
                aLine = self.splitMultiBlank(str(f.readline()))
                aItm = ['20%s' % aLine[0],
                        int(aLine[3]) * 0.1,
                        int(aLine[4]) * 0.1,
                        aTyphoonID,
                        aTyphoonName,
                        int(non_digit.sub("", aLine[5])),
                        int(non_digit.sub("", aLine[6]))]
                # The last value appended above is the parsed field 6.
                if aItm[-1] == 0:
                    aItm += ['8', 0, 0, '8', 0, 0]
                else:
                    aItm += [aLine[7][0], int(aLine[7][1:]), int(aLine[8]),
                             aLine[9][0], int(aLine[9][1:]), int(aLine[10])]
                itm_list.append(aItm)
                # Diagnostic: a complete row has 13 values.
                if len(aItm) != 13:
                    print(aItm)
    track_df = pd.DataFrame(itm_list, columns=self._columns)
    track_df.to_csv('%s%s' % (getBasePath('typhoon'), saveFileName),
                    index=False, header=True)
def drawMap(self, filename, dn_ch_arr, ch_name):
    '''
    Save extracted image file.

    :param filename: raw file name; its tenth ``_``-separated field becomes
        the stem of the output file name.
    :param dn_ch_arr: the dn data of one specific channel
    :param ch_name: channel label inserted into the output file name.
    '''
    stem = filename.split('_')[9]
    out_path = '%s%s/%s_Channel%s.jpg' % (
        getBasePath('img'), self._resolution, stem, ch_name)
    converted = cv2.cvtColor(dn_ch_arr, cv2.COLOR_BGR2RGB)
    jpeg_params = [int(cv2.IMWRITE_JPEG_QUALITY), self._saveIMGQuality]
    cv2.imwrite(out_path, converted, jpeg_params)
def __init__(self, imgNameList, tasktype='train', path="%s/../data/%s_profile_images/profile_images_%s"):
    '''
    Store the image names and derive the image directory for the task type.

    :param imgNameList: stored as ``self.imgNameList``.
    :param tasktype: 'train' or 'test'; any other value only triggers a
        printed message, construction continues.
    :param path: template filled with (base path, tasktype, tasktype).
    '''
    if tasktype not in ('train', 'test'):
        print("please set tasktype as train or test")
    self.imgNameList = imgNameList
    self.tasktype = tasktype
    self.imgList = []
    self.imgExisIndex = []
    self.imgBasePath = path % (getBasePath(), self.tasktype, self.tasktype)
def getAllFilesList(self, basepath='F:\\风云数据\\1KM\\1Km全圆盘数据', res='1KM'):
    '''
    Write the full path of every file below ``basepath`` to ``dir_<res>.txt``.

    The output file lives under getBasePath('typhoon'), is UTF-8 encoded and
    holds one path per line, in ``os.walk`` order.

    :param basepath: directory to walk recursively.
    :param res: resolution tag used in the output file name.
    :return: None
    '''
    out_path = "%sdir_%s.txt" % (getBasePath('typhoon'), res)
    # `with` closes the file even if the walk raises.
    with open(out_path, "w", encoding='UTF-8') as f:
        for root, _dirs, files in os.walk(basepath):
            for name in files:
                f.write(os.path.join(root, name) + "\n")
def getIMGCoord(self,filename='bst_2018-2019.csv'):
    '''
    Add image-coordinate columns for each resolution to the track CSV.

    For '4KM', '2KM' and '1KM' a ``CoordTrans`` is created and every row's
    (lon, lat) is converted with ``geo2ImgCoord``; element 0 of the result is
    stored as int in ``l_img_<res>`` and element 1 in ``c_img_<res>``. The
    head of the frame is printed and the CSV is rewritten in place.

    :param filename: track CSV, relative to getBasePath('typhoon').
    '''
    df = pd.read_csv('%s%s' % (getBasePath('typhoon'), filename))
    for res in ['4KM', '2KM', '1KM']:
        transformer = CoordTrans(res)
        # (column-name template, position in the geo2ImgCoord result)
        for col_fmt, pos in (('l_img_%s', 0), ('c_img_%s', 1)):
            df[col_fmt % res] = df.apply(
                lambda row, pos=pos: int(
                    transformer.geo2ImgCoord(row['lon'], row['lat'])[pos]),
                axis=1)
    print(df.head())
    df.to_csv('%s%s' % (getBasePath('typhoon'), filename),
              index=False, header=True)
def pickDataByTrackTime(self, trackFilename='time_tracks.csv', dataFilename='data_pick_1KM.csv', res='1KM'):
    '''
    Keep the data records whose time exactly matches a track time.

    Track times are read from a headerless one-column file (the layout
    ``trackTime`` writes), data records from a CSV with a ``time`` column;
    both are parsed as ``%Y%m%d%H%M%S``. For every track time, in file order,
    the data rows with the same time are collected and written without header
    or index to ``data_match_<res>.csv`` -- one line per matched record. The
    number of matched records is printed.

    :param trackFilename: headerless track-time file under getBasePath('typhoon').
    :param dataFilename: data CSV (must contain a ``time`` column).
    :param res: resolution tag used in the output file name.
    :return: None
    '''
    # header=None: the file has no header row, so inferring one would silently
    # drop the first track time.
    track_time_df = pd.read_csv('%s%s' % (getBasePath('typhoon'), trackFilename),
                                header=None)
    track_time_df.columns = ['time']
    # Convert the values to datetime.
    track_times = pd.to_datetime(track_time_df['time'], format='%Y%m%d%H%M%S')

    data_time_df = pd.read_csv('%s%s' % (getBasePath('typhoon'), dataFilename))
    # Use the parsed time as index.
    data_time_df.set_index('time', inplace=True)
    data_time_df.index = pd.to_datetime(data_time_df.index,
                                        format='%Y%m%d%H%M%S')

    # .loc[[ts]] always yields a frame (also for duplicated times), so every
    # match stays one row; concatenate once at the end.
    matched = [data_time_df.loc[[ts]] for ts in track_times
               if ts in data_time_df.index]
    if matched:
        data_match_df = pd.concat(matched)
    else:
        data_match_df = data_time_df.iloc[0:0]
    print(data_match_df.shape[0])
    data_match_df.to_csv('%sdata_match_%s.csv' % (getBasePath('typhoon'), res),
                         index=False, header=False, encoding='utf-8')
def loadData(self, dataset):
    '''
    Load ``<base>/data/<dataset>.csv`` and assign the fixed column names.

    :param dataset: 'train' or 'test'. The test set gets every name except
        the last one ('numPLikes').
    :return: the loaded DataFrame, or "" (after printing a message) when
        ``dataset`` is neither 'train' nor 'test'.
    '''
    indexlist_ = [
        'id', 'uname', 'url', 'covImgStatus', 'verifStatus', 'textColor',
        'pageColor', 'themeColor', 'isViewSizeCustom', 'utcOffset',
        'location', 'isLocVisible', 'uLanguage', 'creatTimestamp',
        'uTimeZone', 'numFollowers', 'numPeopleFollowing', 'numStatUpdate',
        'numDMessage', 'category', 'avgvisitPerSecond', 'avgClick',
        'profileImg', 'numPLikes',
    ]
    basepath = getBasePath()
    if dataset not in ('train', 'test'):
        print("Invalid dataset type, only train and test are supported")
        return ""
    df = pd.read_csv("%s/data/%s.csv" % (basepath, dataset))
    df.columns = indexlist_[:-1] if dataset == 'test' else indexlist_
    return df
def loadOneRadTempData(self, filename):
    '''
    Load the radiation temperature data of one specific HDF file.

    For every channel in ``self._chan_num_list`` the DN dataset and the
    calibration dataset are read. Within each row's valid column interval
    (taken from the ``NOMObsColumn`` dataset, both bounds inclusive) a DN
    value >= 65535 is stored as 65535 and any other DN value is replaced by
    the calibration entry it indexes. Rows whose interval has a bound <= -1
    are filled with 65535 entirely.

    :param filename: A file name of HDF file, appended to getBasePath('data').
    :return: list with one converted array per channel.
    '''
    lat_size = config.IMG_SIZE[self._resolution][0]
    dn_ch_list = []
    rad_temp_list = []
    # Context manager: the handle is released even if a dataset lookup fails.
    # Slicing with [:] copies the datasets into memory, so nothing below needs
    # the file to stay open.
    with h5py.File(getBasePath('data') + filename, 'r') as H5f:
        self._IMG_VALID_REG = H5f['NOMObsColumn'][:]
        for chan in self._chan_num_list:
            dn_ch_list.append(
                H5f[config.NOM_KEY_VALUE[self._resolution][chan]][:])
            rad_temp_list.append(
                H5f[config.CALIB_KEY_VALUE[self._resolution][chan]][:])

    for idx, (dn, rad) in enumerate(zip(dn_ch_list, rad_temp_list)):
        for lat_ind in range(lat_size):
            valid_reg_interval = self._IMG_VALID_REG[lat_ind][:]
            # NOTE(review): the else branch is paired with this validity check
            # (rows without a valid interval are blanked) -- confirm.
            if valid_reg_interval[0] > -1 and valid_reg_interval[1] > -1:
                for lon_ind in range(valid_reg_interval[0],
                                     valid_reg_interval[1] + 1):
                    tmp = int(dn[lat_ind][lon_ind])
                    if tmp >= 65535:
                        dn[lat_ind][lon_ind] = 65535
                    else:
                        # DN value is the index into the calibration table.
                        dn[lat_ind][lon_ind] = rad[tmp]
            else:
                dn[lat_ind][:] = 65535
        dn_ch_list[idx] = dn
    return dn_ch_list
def loadModel(self):
    '''
    Load the Keras model stored at ``self.modelpath``.

    ``self.modelpath`` is a ``%``-template that receives the base path.

    :return: whatever ``tf.keras.models.load_model`` returns for that path.
    '''
    return tf.keras.models.load_model(self.modelpath % getBasePath())
def loadModel(self):
    '''
    Load the saved MLP model from ``<base>/savedModel/model-mlp.h5``.

    :return: the object returned by ``keras.models.load_model``.
    '''
    model_file = '%s/savedModel/model-mlp.h5' % getBasePath()
    return keras.models.load_model(model_file)
def savemodel(self):
    '''Save ``self.model`` to ``<base>/savedModel/model-mlp.h5``.'''
    model_file = '%s/savedModel/model-mlp.h5' % getBasePath()
    self.model.save(model_file)
def loadModel(self):
    '''
    Load the saved model from ``<base>/savedModel/model-rf.joblib``.

    :return: the object returned by ``joblib.load``.
    '''
    model_file = '%s/savedModel/model-rf.joblib' % getBasePath()
    return joblib.load(model_file)
def savemodel(self):
    '''Dump ``self.model`` uncompressed to ``<base>/savedModel/model-rf.joblib``.'''
    model_file = '%s/savedModel/model-rf.joblib' % getBasePath()
    joblib.dump(self.model, model_file, compress=0)
def loadModel(model_name):
    '''
    Load ``<base>/savedModel/<model_name>.joblib``.

    :param model_name: file stem of the stored model.
    :return: the object returned by ``joblib.load``.
    '''
    model_file = '%s/savedModel/%s.joblib' % (getBasePath(), model_name)
    return joblib.load(model_file)
def savemodel(model, modelname):
    '''
    Dump ``model`` uncompressed to ``<base>/savedModel/<modelname>.joblib``.

    :param model: object to persist.
    :param modelname: file stem of the target file.
    '''
    model_file = '%s/savedModel/%s.joblib' % (getBasePath(), modelname)
    joblib.dump(model, model_file, compress=0)
def savemodel(self):
    '''Save ``self.model`` to ``self.modelpath`` formatted with the base path.'''
    target = self.modelpath % getBasePath()
    self.model.save(target)
features = self._get_layer_output(X, -3) # the last third layer return features def savemodel(self): self.model.save(self.modelpath % getBasePath()) def loadModel(self): bs = getBasePath() return tf.keras.models.load_model(self.modelpath % getBasePath()) if __name__ == '__main__': import pandas as pd import numpy as np df_train = pd.read_csv("%s/../data/train.csv" % getBasePath()) df_test = pd.read_csv("%s/../data/test.csv" % getBasePath()) indexlist_ = [ 'id', 'uname', 'url', 'covImgStatus', 'verifStatus', 'textColor', 'pageColor', 'themeColor', 'isViewSizeCustom', 'utcOffset', 'location', 'isLocVisible', 'uLanguage', 'creatTimestamp', 'uTimeZone', 'numFollowers', 'numPeopleFollowing', 'numStatUpdate', 'numDMessage', 'category', 'avgvisitPerSecond', 'avgClick', 'profileImg', 'numPLikes' ] df_train.columns = indexlist_ df_test.columns = indexlist_[:-1] imgNamelist_train = df_train['profileImg'].values imgNamelist_test = df_test['profileImg'].values yclassNum = 10