Example #1
    def run_retrieval_names(self):
        '''
        Perform the retrieval of a range of PIDs
        '''

        # Get start time
        start_time = time.time()
        end_time = start_time

        # Initialize the needed data sources.
        # Right now this is hardcoded into the initializer, so for config
        # you need to manage this yourself.
        data_provider = pa.init_param_sampleprovider()
        data_provider.set_system_element_as_any()

        retr_time_total, conv_time_total = (0, 0)

        param_names_invalid = {}
        gen_files = []
        var_name = ''
        var_type = ''

        # Set preparation time stamp
        prep_time = time.time()

        # Get parameter names for the range of parameter ids, and retrieve samples as DataFrame
        (param_pids, param_syselem) = data_provider.get_parameter_pid_sysel_from_names(self.param_names)
        samples = data_provider.get_parameter_sysel_data_objs(self.param_names,
                                                              param_syselem,
                                                              self.timestamp_start,
                                                              self.timestamp_end)

        # Set retrieval time stamp
        retr_time = time.time()

        # Convert sample columns to binary tables
        i = 0
        for column in samples:

            pname = self.param_names[i]
            pid = param_pids[i]

            # Currently only FITS files are generated

            # Build initial primary HDU for FITS file
            hdul = self.fits_build_hdr(pid, pid)

            # Loop on samples to add values to the resulting vectors
            time_stamps = []
            values = []
            start = True

            for s in column:
                if start:
                    var_name = s.get_name()
                    var_type = s.get_type()
                    if var_name != pname:
                        logging.warning("ERROR: Param. name does not match with expectation!")
                    logging.info('Generating table for PID {} - {} (type={})'
                                 .format(pid, var_name, var_type))
                    start = False

                time_stamps.append(s.get_time())

                value = s.get_value()
                if var_type == DateTimeType:
                    value = unix_ms_to_datestr(value)

                values.append(value)

                if var_type == DateTimeType:
                    var_type = StringType

            # Advance the index here so it stays in sync even when a column
            # has no samples and the rest of the loop body is skipped
            i = i + 1

            if start:
                param_names_invalid[str(pid)] = pname
                continue

            type_conv = Ares2FitsConversion[str(var_type)]
            if var_type == StringType:
                size_fld = len(max(values, key=len))
                type_conv = type_conv.format(size_fld if size_fld > 0 else 1)

            t = fits.BinTableHDU.from_columns([fits.Column(name='TIMESTAMP',
                                                           array=np.array(time_stamps),
                                                           format='K'),
                                               fits.Column(name=var_name,
                                                           array=np.array(values),
                                                           format=type_conv)])
            hdul.append(t)

            # Remove FITS file if exists, and (re)create it
            self.from_pid, self.to_pid = (pid, pid)
            self.from_pid_blk, self.to_pid_blk = (pid, pid)
            self.name = pname
            file_name = '{}/{}.fits'.format(self.outdir, self.generate_filename(self.file_tpl))
            self.save_to_fits(hdul, file_name)
            gen_files.append(file_name)
            logging.info('Saved file {}'.format(file_name))

        # Compute the totals once the loop is done: retrieval happened once
        # before the loop, and conversion covers the per-column work above
        end_time = time.time()
        retr_time_total = retr_time - prep_time
        conv_time_total = end_time - retr_time

        full_time_total = end_time - start_time

        logging.info("Data retrieval:   {:10.3f} s".format(retr_time_total))
        logging.info("Data conversion:  {:10.3f} s".format(conv_time_total))
        logging.info("Total exec. time: {:10.3f} s".format(full_time_total))
        if len(param_names_invalid) > 0:
            logging.info("The following parameters could not be converted:")
            for p in param_names_invalid.keys():
                logging.info('{}: "{}"'.format(p, param_names_invalid[p]))

        return (retr_time_total, conv_time_total, full_time_total, param_names_invalid, gen_files)
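The unix_ms_to_datestr helper used in the loop above is not defined in this excerpt. A minimal sketch of what it might look like, assuming it simply renders a Unix timestamp given in milliseconds as a UTC date string (the exact output format here is an assumption):

import datetime

def unix_ms_to_datestr(ms):
    # Hypothetical sketch: convert milliseconds since the Unix epoch to a
    # UTC date/time string with millisecond precision
    dt = datetime.datetime.utcfromtimestamp(ms / 1000.0)
    return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]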
Example #2
def main():
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()
    #print(args)

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed data sources. Right now this is hardcoded into the
    # initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Get some params that you want to look for and of course some times as well
    from_pid, to_pid = (args.from_pid, args.to_pid)

    year1, doy1, h1, m1, s1 = args.from_date
    timestamp_start = unix_ydoy_to_ms(year1, doy1, h1, m1, s1)

    year2, doy2, h2, m2, s2 = args.to_date
    timestamp_end = unix_ydoy_to_ms(year2, doy2, h2, m2, s2)

    base_name = ("test5_{0}-{1}__" + "{2}-{3}_{4:02d}{5:02d}{6:02d}_" +
                 "{7}-{8}_{9:02d}{10:02d}{11:02d}").format(
                     from_pid, to_pid, year1, doy1, h1, m1, s1, year2, doy2,
                     h2, m2, s2)

    print((
        "-----------------------------------------------------------------------------------\n"
        +
        "Retrieving samples for parameters with parameter ids in the range {0}:{1}\n"
        + "from the date {2}.{3}.{4:02d}:{5:02d}:{6:02d} " +
        "to the date {7}.{8}.{9:02d}:{10:02d}:{11:02d}\n").format(
            from_pid, to_pid, year1, doy1, h1, m1, s1, year2, doy2, h2, m2,
            s2))

    # Get parameter names for the range of parameter ids, and retrieve samples as DataFrame
    param_names = data_provider.get_parameter_names_from_pids(from_pid, to_pid)
    #df = data_provider.get_parameter_data_df(param_names, timestamp_start, timestamp_end)
    #samples = data_provider.get_parameter_data_objs(param_names, timestamp_start, timestamp_end)

    prep_time = time.time()

    first = True
    for pname in param_names:
        dfnew = data_provider.get_parameter_data_df([pname], timestamp_start,
                                                    timestamp_end)
        if first:
            df = dfnew
            first = False
        else:
            df = pd.concat([df, dfnew], axis=1)

    exec_time = time.time()

    print("#---- df -----------------------")
    df.info()
    print(df.describe())
    print(df.shape)

    #for row in samples:
    #    print(row)
    #    for sample in row:
    #        print("{} {} {} {}".format(sample.get_time(), sample.get_name(), sample.get_type(), sample.get_value()))
    #        break

    # Save to CSV
    #df1.to_csv(base_name + '.1.csv')
    #df2.to_csv(base_name + '.2.csv')

    #df = pd.concat([df1, df2], axis=1)
    #df.info()
    #print(df.shape)

    df.to_csv(base_name + '.csv')

    # Save to pickle
    #df.to_pickle(base_name + '.pkl')

    # Save to FITS
    t = Table.from_pandas(df)
    print(t)
    t.write(base_name + '.fits', format='fits')
    #fits_file = fitsio.FITS(base_name + '.fits', 'rw')
    #fits_file.write(df.to_records(index=False))
    #fits_file.close()

    # Save to HDF
    df.to_hdf(base_name + '.h5', key='df', mode='w')

    #ax = plt.gca()
    #df.plot(kind='line',x='timestamp',y=1,ax=ax)
    #df.plot(kind='line',x='timestamp',y=100, color='red', ax=ax)
    #plt.show()

    # Take the compression time stamp only now, after all of the saves above,
    # so the "Data storing" and "Compressing" figures below are meaningful
    gzip_time = time.time()

    gzip_file(base_name + '.csv', base_name + '.csv.gz')

    end_time = time.time()

    print('''
         Data retrieval:   {:10.3f} s
         Data storing:     {:10.3f} s
         Compressing:      {:10.3f} s
         Total exec. time: {:10.3f} s
         '''.format(exec_time - prep_time, gzip_time - exec_time,
                    end_time - gzip_time, end_time - start_time))
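The gzip_file helper called at the end is not shown in this excerpt. A minimal sketch of what it might do, assuming it just compresses the source file with gzip and leaves the original in place:

import gzip
import shutil

def gzip_file(src, dst):
    # Stream-copy the uncompressed file into a gzip-compressed target
    with open(src, 'rb') as f_in, gzip.open(dst, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)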
Example #3
def main():
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed data sources. Right now this is hardcoded into the
    # initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Define process inputs
    from_pid, to_pid, timestamp_start, timestamp_end, base_name, pids_step = define_inputs(
        args)

    retr_time_total, conv_time_total = (0, 0)

    keep_retrieving = True
    i_pid = from_pid
    j_pid = i_pid + pids_step - 1
    param_names_invalid = {}

    while keep_retrieving:

        # Set preparation time stamp
        prep_time = time.time()

        # Get parameter names for the range of parameter ids, and retrieve samples as DataFrame
        param_names = data_provider.get_parameter_names_from_pids(i_pid, j_pid)
        samples = data_provider.get_parameter_data_objs(
            param_names, timestamp_start, timestamp_end)

        # Set retrieval time stamp
        retr_time = time.time()

        # Build initial primary HDU for FITS file
        hdul = fits_build_hdr(i_pid, j_pid)

        # Convert sample columns to binary tables
        i = 0
        pid = i_pid

        for column in samples:

            # Loop on samples to add values to the resulting vectors
            time_stamps = []
            values = []
            start = True

            for s in column:
                if start:
                    var_name = s.get_name()
                    var_type = s.get_type()
                    if var_name != param_names[i]:
                        print("ERROR: Parameter name does not match the expected name!")
                    print(
                        'Generating table {} of {} for PID {} - {} (type={})'.
                        format(i + 1, pids_step, pid, var_name, var_type))
                    start = False

                time_stamps.append(s.get_time())

                value = s.get_value()
                if var_type == DateTimeType:
                    value = unix_ms_to_datestr(value)

                values.append(value)

                if var_type == DateTimeType:
                    var_type = StringType

            i = i + 1
            pid = pid + 1

            if start:
                param_names_invalid[str(pid - 1)] = param_names[i - 1]
                continue

            type_conv = Ares2FitsConversion[str(var_type)]
            if var_type == StringType:
                size_fld = len(max(values, key=len))
                type_conv = type_conv.format(size_fld if size_fld > 0 else 1)

            t = fits.BinTableHDU.from_columns([
                fits.Column(name='TIMESTAMP',
                            array=np.array(time_stamps),
                            format='K'),
                fits.Column(name=var_name,
                            array=np.array(values),
                            format=type_conv)
            ])
            hdul.append(t)

        # Remove FITS file if exists, and (re)create it
        file_name = base_name + '.{}-{}.fits'.format(i_pid, j_pid)
        save_to_fits(hdul, file_name)
        print('Saved file {}'.format(file_name))

        end_time = time.time()

        retr_time_total = retr_time_total + (retr_time - prep_time)
        conv_time_total = conv_time_total + (end_time - retr_time)

        i_pid = j_pid + 1
        j_pid = i_pid + pids_step - 1
        if j_pid > to_pid:
            j_pid = to_pid

        # Use <= so the last PID in the range is not skipped when the final
        # batch starts exactly at to_pid
        keep_retrieving = (i_pid <= to_pid)

    full_time_total = end_time - start_time

    print(
        ("Data retrieval:   {:10.3f} s\n" + "Data conversion:  {:10.3f} s\n" +
         "Total exec. time: {:10.3f} s").format(retr_time_total,
                                                conv_time_total,
                                                full_time_total))
    if len(param_names_invalid) > 0:
        print("The following parameters could not be converted:")
        print('\n'.join([
            '{}: "{}"'.format(i, param_names_invalid[i])
            for i in param_names_invalid.keys()
        ]))
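The module-level helpers fits_build_hdr and save_to_fits used above are not part of this excerpt. A minimal sketch of what they might look like with astropy.io.fits, assuming the primary header only needs to record the PID range and that existing files are simply overwritten:

from astropy.io import fits

def fits_build_hdr(from_pid, to_pid):
    # Primary HDU whose header records the PID range (assumed keywords);
    # the binary tables built above are appended to this list
    hdr = fits.Header()
    hdr['FROMPID'] = from_pid
    hdr['TOPID'] = to_pid
    return fits.HDUList([fits.PrimaryHDU(header=hdr)])

def save_to_fits(hdul, file_name):
    # Overwrite any existing file and write out the assembled HDU list
    hdul.writeto(file_name, overwrite=True)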
Example #4
    epoch = datetime.datetime.utcfromtimestamp(0)
    dt = datetime.datetime(y, m, d, h, mi, s)
    return int((dt - epoch).total_seconds() * 1000.0 + ms)


def unix_ydoy_to_ms(y, doy, h=0, mi=0, s=0, ms=0):
    epoch = datetime.datetime.utcfromtimestamp(0)
    dt = datetime.datetime(y, 1, 1, h, mi, s) + datetime.timedelta(doy - 1)
    return int((dt - epoch).total_seconds() * 1000.0 + ms)


if 'PYAREX_INI_FILE' not in os.environ:
    os.environ['PYAREX_INI_FILE'] = os.getcwd() + '/jc.ini'

# Initialize the needed data sources. Right now this is hardcoded into the
# initializer, so for config you need to manage this yourself.
data_provider = pa.init_param_sampleprovider()

# Get some params that you want to look for and of course some times as well
from_pid = 1
to_pid = 32424
timestamp_start = unix_ydoy_to_ms(2013, 354)
timestamp_end = unix_ydoy_to_ms(2013, 355)

# Get the data from HBase and transform into Pandas DF
#df = data_provider.get_parameter_data_df(params, timestamp_start, timestamp_end)
#print(df.shape)

# Or, if you prefer, get them into sample objects and get information from the objects.
# The method returns a nested generator.
(param_names,
 samples) = data_provider.get_parameter_pids_data_objs(from_pid, to_pid,
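The day-of-year conversion above can be sanity-checked quickly. A small check, assuming the unix_ydoy_to_ms definition shown in this example (2013 is not a leap year, so day 354 is 2013-12-20 00:00:00 UTC):

# 1387497600000 ms corresponds to 2013-12-20 00:00:00 UTC
assert unix_ydoy_to_ms(2013, 354) == 1387497600000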
Example #5
def main():
    # Get start time
    start_time = time.time()

    # Parse command line arguments
    args = get_args()
    #print(args)

    # Define config. file if not set in the local environment
    if 'PYAREX_INI_FILE' not in os.environ:
        os.environ['PYAREX_INI_FILE'] = args.config_file

    # Initialize the needed data sources. Right now this is hardcoded into the
    # initializer, so for config you need to manage this yourself.
    data_provider = pa.init_param_sampleprovider()

    # Get some params that you want to look for and of course some times as well
    from_pid, to_pid = (args.from_pid, args.to_pid)

    year1, doy1, h1, m1, s1 = args.from_date
    timestamp_start = unix_ydoy_to_ms(year1, doy1, h1, m1, s1)

    year2, doy2, h2, m2, s2 = args.to_date
    timestamp_end = unix_ydoy_to_ms(year2, doy2, h2, m2, s2)

    base_name = ("test3_{0}-{1}__" + "{2}-{3}_{4:02d}{5:02d}{6:02d}_" +
                 "{7}-{8}_{9:02d}{10:02d}{11:02d}").format(
                     from_pid, to_pid, year1, doy1, h1, m1, s1, year2, doy2,
                     h2, m2, s2)

    prep_time = time.time()

    print((
        "Retrieving samples for parameters with parameter ids in the range {0}:{1}\n"
        + "from the date {2}.{3}.{4:02d}:{5:02d}:{6:02d} " +
        "to the date {7}.{8}.{9:02d}:{10:02d}:{11:02d}\n").format(
            from_pid, to_pid, year1, doy1, h1, m1, s1, year2, doy2, h2, m2,
            s2))

    # Get parameter names for the range of parameter ids
    param_names = data_provider.get_parameter_names_from_pids(from_pid, to_pid)

    # Getting as data frame
    df = data_provider.get_parameter_data_df(param_names, timestamp_start,
                                             timestamp_end)

    exec_time = time.time()

    df.info()
    #df.describe()
    print(df.shape)

    # Save to pickle
    df.to_pickle('test3-32424params-1min.pkl')

    # Save to FITS
    #t = Table.from_pandas(df)
    #t.write('test3-1000params-1h.fits', format='fits')

    # Save to HDF
    #df.to_hdf(base_name + '.h5', key='df', mode='w')

    # Save to CSV
    df.to_csv(base_name + '.csv')

    gzip_time = time.time()

    gzip_file(base_name + '.csv', base_name + '.csv.gz')

    end_time = time.time()

    print('''
         Data retrieval:   {:10.3f} s
         Data storing:     {:10.3f} s
         Compressing:      {:10.3f} s
         Total exec. time: {:10.3f} s
         '''.format(exec_time - prep_time, gzip_time - exec_time,
                    end_time - gzip_time, end_time - start_time))
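To confirm the round trip, the pickled DataFrame written above can be loaded back and inspected; a short sketch, assuming the hardcoded pickle file name used in this run:

import pandas as pd

# Reload the stored DataFrame and check that the shape matches what was saved
df_back = pd.read_pickle('test3-32424params-1min.pkl')
df_back.info()
print(df_back.shape)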