def run_retrieval_names(self):
    """Retrieve samples for the configured parameter names and save FITS files.

    For each name in ``self.param_names``, samples between
    ``self.timestamp_start`` and ``self.timestamp_end`` are fetched from the
    data provider and converted into one binary-table FITS file per parameter.

    Returns:
        tuple: (retrieval time [s], conversion time [s], total time [s],
                dict mapping str(pid) -> name for parameters that could not
                be converted, list of generated file names)
    """
    # Get start time
    start_time = time.time()
    end_time = start_time

    # Initialize the needed datasources.
    # Right now this is hardcoded into the initializer, so for config
    # you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()
    data_provider.set_system_element_as_any()

    retr_time_total, conv_time_total = (0, 0)
    param_names_invalid = {}
    gen_files = []
    var_name = ''
    var_type = ''

    # Set preparation time stamp
    prep_time = time.time()

    # Get parameter ids / system elements for the names, and retrieve the
    # samples (one column of sample objects per parameter)
    (param_pids, param_syselem) = \
        data_provider.get_parameter_pid_sysel_from_names(self.param_names)
    samples = data_provider.get_parameter_sysel_data_objs(
        self.param_names, param_syselem,
        self.timestamp_start, self.timestamp_end)

    # Set retrieval time stamp
    retr_time = time.time()

    # Convert sample columns to binary tables.
    # BUGFIX: the previous manual counter (`i = i + 1` at the end of the
    # loop body) was skipped by the `continue` taken for empty columns,
    # desynchronizing pname/pid for every later column; enumerate() cannot
    # be skipped.
    for i, column in enumerate(samples):
        pname = self.param_names[i]
        pid = param_pids[i]

        # Currently only FITS files are generated.
        # Build initial primary HDU for FITS file
        hdul = self.fits_build_hdr(pid, pid)

        # Loop on samples to add values to the resulting vectors
        time_stamps = []
        values = []
        start = True
        for s in column:
            if start:
                # First sample: capture the column's name/type metadata
                var_name = s.get_name()
                var_type = s.get_type()
                if var_name != pname:
                    logging.warning("ERROR: Param. name does not match with expectation!")
                logging.info('Generating table for PID {} - {} (type={})'
                             .format(pid, var_name, var_type))
                start = False
            time_stamps.append(s.get_time())
            value = s.get_value()
            if var_type == DateTimeType:
                # Date/time samples are stored as readable strings
                value = unix_ms_to_datestr(value)
            values.append(value)
        if var_type == DateTimeType:
            var_type = StringType

        if start:
            # No samples received for this parameter: record it as invalid.
            # BUGFIX: the key used to be str(pid - 1), but here `pid` is the
            # current parameter's id taken from param_pids (not a counter
            # already advanced past it), so it is used directly.
            param_names_invalid[str(pid)] = pname
            continue

        type_conv = Ares2FitsConversion[str(var_type)]
        if var_type == StringType:
            # String columns need an explicit field width (at least 1)
            size_fld = len(max(values, key=len))
            type_conv = type_conv.format(size_fld if size_fld > 0 else 1)
        t = fits.BinTableHDU.from_columns(
            [fits.Column(name='TIMESTAMP',
                         array=np.array(time_stamps), format='K'),
             fits.Column(name=var_name,
                         array=np.array(values), format=type_conv)])
        hdul.append(t)

        # Remove FITS file if exists, and (re)create it
        self.from_pid, self.to_pid = (pid, pid)
        self.from_pid_blk, self.to_pid_blk = (pid, pid)
        self.name = pname
        file_name = '{}/{}.fits'.format(
            self.outdir, self.generate_filename(self.file_tpl))
        self.save_to_fits(hdul, file_name)
        gen_files.append(file_name)
        logging.info('Saved file {}'.format(file_name))

    end_time = time.time()
    # BUGFIX: measure retrieval from prep_time (it was computed but unused,
    # and start_time was used instead, counting data-source init as
    # retrieval) — consistent with the windowed retrieval routine.
    retr_time_total = retr_time_total + (retr_time - prep_time)
    conv_time_total = conv_time_total + (end_time - retr_time)
    full_time_total = end_time - start_time

    logging.info("Data retrieval: {:10.3f} s".format(retr_time_total))
    logging.info("Data conversion: {:10.3f} s".format(conv_time_total))
    logging.info("Total exec. time: {:10.3f} s".format(full_time_total))

    if len(param_names_invalid) > 0:
        logging.info("The following parameters could not be converted:")
        for p in param_names_invalid.keys():
            logging.info('{}: "{}"'.format(p, param_names_invalid[p]))

    return (retr_time_total, conv_time_total, full_time_total,
            param_names_invalid, gen_files)
def main():
    """Retrieve samples for a PID range into a DataFrame and store them
    as CSV (plus gzipped CSV), FITS and HDF5 files, printing timing stats.
    """
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed datasources.  Right now this is hardcoded into
    # the initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Get some params that you want to look for and of course some times as well
    from_pid, to_pid = (args.from_pid, args.to_pid)
    year1, doy1, h1, m1, s1 = args.from_date
    timestamp_start = unix_ydoy_to_ms(year1, doy1, h1, m1, s1)
    year2, doy2, h2, m2, s2 = args.to_date
    timestamp_end = unix_ydoy_to_ms(year2, doy2, h2, m2, s2)

    base_name = ("test5_{0}-{1}__" +
                 "{2}-{3}_{4:02d}{5:02d}{6:02d}_" +
                 "{7}-{8}_{9:02d}{10:02d}{11:02d}").format(
                     from_pid, to_pid,
                     year1, doy1, h1, m1, s1,
                     year2, doy2, h2, m2, s2)

    print((
        "-----------------------------------------------------------------------------------\n" +
        "Retrieving samples for parameters with parameter ids in the range {0}:{1}\n" +
        "from the date {2}.{3}.{4:02d}:{5:02d}:{6:02d} " +
        "to the date {7}.{8}.{9:02d}:{10:02d}:{11:02d}\n").format(
            from_pid, to_pid,
            year1, doy1, h1, m1, s1,
            year2, doy2, h2, m2, s2))

    # Get parameter names for the range of parameter ids
    param_names = data_provider.get_parameter_names_from_pids(from_pid, to_pid)

    prep_time = time.time()

    # Retrieve one parameter at a time and concatenate ONCE at the end.
    # BUGFIX/PERF: growing the DataFrame with pd.concat inside the loop is
    # quadratic in the number of parameters, and the old first/df dance
    # raised NameError when param_names was empty.
    frames = [data_provider.get_parameter_data_df([pname],
                                                  timestamp_start,
                                                  timestamp_end)
              for pname in param_names]
    df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    exec_time = time.time()

    print("#---- df -----------------------")
    df.info()
    print(df.shape)

    # Save to CSV
    df.to_csv(base_name + '.csv')

    # Save to FITS
    t = Table.from_pandas(df)
    print(t)
    t.write(base_name + '.fits', format='fits')

    # Save to HDF
    df.to_hdf(base_name + '.h5', key='df', mode='w')

    # BUGFIX: take the compression time stamp right before gzipping, so
    # "Data storing" covers the CSV/FITS/HDF writes (previously the stamp
    # was taken before any file was written, so storing reported ~0 s).
    gzip_time = time.time()

    gzip_file(base_name + '.csv', base_name + '.csv.gz')

    end_time = time.time()

    print('''
Data retrieval:   {:10.3f} s
Data storing:     {:10.3f} s
Compressing:      {:10.3f} s
Total exec. time: {:10.3f} s
'''.format(exec_time - prep_time, gzip_time - exec_time,
           end_time - gzip_time, end_time - start_time))
def main():
    """Retrieve samples for a PID range in windows of ``pids_step`` PIDs
    and save each window as a multi-extension FITS file.
    """
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed datasources.  Right now this is hardcoded into
    # the initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Define process inputs
    from_pid, to_pid, timestamp_start, timestamp_end, base_name, pids_step = \
        define_inputs(args)

    retr_time_total, conv_time_total = (0, 0)
    keep_retrieving = True
    # Current PID window is [i_pid, j_pid], inclusive at both ends
    i_pid = from_pid
    j_pid = i_pid + pids_step - 1
    param_names_invalid = {}

    while keep_retrieving:
        # Set preparation time stamp
        prep_time = time.time()

        # Get parameter names for the window, and retrieve their samples
        param_names = data_provider.get_parameter_names_from_pids(i_pid, j_pid)
        samples = data_provider.get_parameter_data_objs(
            param_names, timestamp_start, timestamp_end)

        # Set retrieval time stamp
        retr_time = time.time()

        # Build initial primary HDU for FITS file
        hdul = fits_build_hdr(i_pid, j_pid)

        # Convert sample columns to binary tables
        i = 0
        pid = i_pid
        for column in samples:
            # Loop on samples to add values to the resulting vectors
            time_stamps = []
            values = []
            start = True
            for s in column:
                if start:
                    # First sample: capture the column's name/type metadata
                    var_name = s.get_name()
                    var_type = s.get_type()
                    if var_name != param_names[i]:
                        print(
                            "ERROR: Param. name does not match with expectation!"
                        )
                    print(
                        'Generating table {} of {} for PID {} - {} (type={})'.
                        format(i + 1, pids_step, pid, var_name, var_type))
                    start = False
                time_stamps.append(s.get_time())
                value = s.get_value()
                if var_type == DateTimeType:
                    # Date/time samples are stored as readable strings
                    value = unix_ms_to_datestr(value)
                values.append(value)
            if var_type == DateTimeType:
                var_type = StringType
            # Counters are advanced before the empty-column check, hence the
            # `- 1` offsets below
            i = i + 1
            pid = pid + 1
            if start:
                # No samples received for this parameter: record it and skip
                param_names_invalid[str(pid - 1)] = param_names[i - 1]
                continue
            type_conv = Ares2FitsConversion[str(var_type)]
            if var_type == StringType:
                # String columns need an explicit field width (at least 1)
                size_fld = len(max(values, key=len))
                type_conv = type_conv.format(size_fld if size_fld > 0 else 1)
            t = fits.BinTableHDU.from_columns([
                fits.Column(name='TIMESTAMP',
                            array=np.array(time_stamps),
                            format='K'),
                fits.Column(name=var_name,
                            array=np.array(values),
                            format=type_conv)
            ])
            hdul.append(t)

        # Remove FITS file if exists, and (re)create it
        file_name = base_name + '.{}-{}.fits'.format(i_pid, j_pid)
        save_to_fits(hdul, file_name)
        print('Saved file {}'.format(file_name))

        end_time = time.time()
        retr_time_total = retr_time_total + (retr_time - prep_time)
        conv_time_total = conv_time_total + (end_time - retr_time)

        # Advance the window, clamping its end to to_pid
        i_pid = j_pid + 1
        j_pid = i_pid + pids_step - 1
        if j_pid > to_pid:
            j_pid = to_pid
        # BUGFIX: use <= so the final PID is retrieved when the last window
        # shrinks to a single PID (i_pid == to_pid); the old strict `<`
        # silently skipped it.  Termination is still guaranteed because
        # i_pid strictly increases past to_pid.
        keep_retrieving = (i_pid <= to_pid)

    full_time_total = end_time - start_time
    print(
        ("Data retrieval: {:10.3f} s\n" +
         "Data conversion: {:10.3f} s\n" +
         "Total exec. time: {:10.3f} s").format(retr_time_total,
                                                conv_time_total,
                                                full_time_total))

    if len(param_names_invalid) > 0:
        print("The following parameters could not be converted:")
        print('\n'.join([
            '{}: "{}"'.format(i, param_names_invalid[i])
            for i in param_names_invalid.keys()
        ]))
# NOTE(review): the three statements below are the tail of a truncated
# helper whose `def` line lies before this chunk — presumably a
# year/month/day (plus h/mi/s/ms) to Unix-epoch-milliseconds converter,
# mirroring unix_ydoy_to_ms below.  Confirm the signature in the full file.
epoch = datetime.datetime.utcfromtimestamp(0)
dt = datetime.datetime(y, m, d, h, mi, s)
return int((dt - epoch).total_seconds() * 1000.0 + ms)


def unix_ydoy_to_ms(y, doy, h=0, mi=0, s=0, ms=0):
    # Convert year + day-of-year (1-based: doy=1 is Jan 1st) plus optional
    # hour/minute/second/millisecond into Unix epoch milliseconds.
    epoch = datetime.datetime.utcfromtimestamp(0)
    dt = datetime.datetime(y, 1, 1, h, mi, s) + datetime.timedelta(doy - 1)
    return int((dt - epoch).total_seconds() * 1000.0 + ms)


# Use a default config file from the current directory when none is set
if not 'PYAREX_INI_FILE' in os.environ:
    os.environ['PYAREX_INI_FILE'] = os.getcwd() + '/jc.ini'

# Initialize the needed datasources. Right now this is hardcoded into the
# initializer, so for config you need to manage this yourself.
data_provider = pa.init_param_sampleprovider()

# Get some params that you want to look for and of course some times as well
from_pid = 1
to_pid = 32424
timestamp_start = unix_ydoy_to_ms(2013, 354)
timestamp_end = unix_ydoy_to_ms(2013, 355)

# Get the data from HBase and transform into Pandas DF
#df = data_provider.get_parameter_data_df(params, timestamp_start, timestamp_end)
#print(df.shape)

# Or, if you prefer, get them into sample objects and get information from the objects.
# The method returns a nested generator.
# NOTE(review): the call below is truncated in this chunk — its remaining
# arguments (and everything after) continue outside the visible source.
(param_names, samples) = data_provider.get_parameter_pids_data_objs(from_pid, to_pid,
def main():
    """Retrieve samples for a PID range into a single DataFrame and store
    them as pickle, CSV and gzipped CSV, printing timing stats.
    """
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed datasources.  Right now this is hardcoded into
    # the initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Get some params that you want to look for and of course some times as well
    from_pid, to_pid = (args.from_pid, args.to_pid)
    year1, doy1, h1, m1, s1 = args.from_date
    timestamp_start = unix_ydoy_to_ms(year1, doy1, h1, m1, s1)
    year2, doy2, h2, m2, s2 = args.to_date
    timestamp_end = unix_ydoy_to_ms(year2, doy2, h2, m2, s2)

    base_name = ("test3_{0}-{1}__" +
                 "{2}-{3}_{4:02d}{5:02d}{6:02d}_" +
                 "{7}-{8}_{9:02d}{10:02d}{11:02d}").format(
                     from_pid, to_pid,
                     year1, doy1, h1, m1, s1,
                     year2, doy2, h2, m2, s2)

    prep_time = time.time()

    print((
        "Retrieving samples for parameters with parameter ids in the range {0}:{1}\n" +
        "from the date {2}.{3}.{4:02d}:{5:02d}:{6:02d} " +
        "to the date {7}.{8}.{9:02d}:{10:02d}:{11:02d}\n").format(
            from_pid, to_pid,
            year1, doy1, h1, m1, s1,
            year2, doy2, h2, m2, s2))

    # Get parameter names for the range of parameter ids
    param_names = data_provider.get_parameter_names_from_pids(from_pid, to_pid)

    # Getting as data frame
    df = data_provider.get_parameter_data_df(param_names,
                                             timestamp_start, timestamp_end)

    exec_time = time.time()

    df.info()
    print(df.shape)

    # Save to pickle
    # BUGFIX: derive the pickle name from base_name; it was hard-coded to
    # 'test3-32424params-1min.pkl', so every run overwrote the same file
    # regardless of the requested PID range and dates.
    df.to_pickle(base_name + '.pkl')

    # Save to CSV
    df.to_csv(base_name + '.csv')

    gzip_time = time.time()

    gzip_file(base_name + '.csv', base_name + '.csv.gz')

    end_time = time.time()

    print('''
Data retrieval:   {:10.3f} s
Data storing:     {:10.3f} s
Compressing:      {:10.3f} s
Total exec. time: {:10.3f} s
'''.format(exec_time - prep_time, gzip_time - exec_time,
           end_time - gzip_time, end_time - start_time))