Example #1
def check_exists(for_run):
    """!
    Check if data for the given generated time already exists
    @param for_run datetime to check the database for run data from
    @return Whether or not data exists in the database for the given run
    """
    cnxn = None
    try:
        cnxn = db.open_local_db()
        df = pandas.read_sql('SELECT * FROM HINDCAST.DAT_Historic', cnxn)
    finally:
        # guard so a failed open_local_db() doesn't raise UnboundLocalError here
        if cnxn:
            cnxn.close()
    return 1 == len(df.query('generated == \'{}\''.format(for_run)))
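This check reads the entire HINDCAST.DAT_Historic table just to test for one row, and splices for_run into the query string. A leaner, injection-safe sketch, assuming db.open_local_db() returns a DB-API connection whose driver accepts qmark-style (?) placeholders, as pyodbc does:

import pandas

def check_exists(for_run):
    """Return True if a row already exists for this generated time."""
    cnxn = None
    try:
        cnxn = db.open_local_db()
        # COUNT(*) with a bound parameter avoids pulling the whole table
        # and keeps for_run out of the SQL text entirely
        df = pandas.read_sql(
            'SELECT COUNT(*) AS n FROM HINDCAST.DAT_Historic'
            ' WHERE generated = ?',
            cnxn, params=[for_run])
    finally:
        if cnxn:
            cnxn.close()
    return 1 == int(df['n'].iloc[0])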
Example #2
def check_exists(self, for_run):
    """!
    Check if data for the given generated time already exists
    @param self Pointer to self
    @param for_run Time of run
    @return Whether or not the record is already in the database
    """
    cnxn = None
    try:
        cnxn = db.open_local_db()
        df = pandas.read_sql(
            'SELECT * FROM INPUTS.DAT_Model WHERE model=\'{}\''.format(
                self.name), cnxn)
    finally:
        if cnxn:
            cnxn.close()
    return 1 == len(df.query('generated == \'{}\''.format(for_run)))
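The cnxn = None sentinel plus if cnxn: guard recurs in every snippet here; contextlib.closing expresses the same cleanup more compactly. A sketch of the same method using it, assuming db.open_local_db() returns any object with a close() method:

import contextlib

import pandas

def check_exists(self, for_run):
    """Return True if a record for this run is already in the database."""
    # closing() guarantees cnxn.close() runs; no sentinel is needed because
    # the block is only entered after open_local_db() has succeeded
    with contextlib.closing(db.open_local_db()) as cnxn:
        df = pandas.read_sql(
            'SELECT * FROM INPUTS.DAT_Model WHERE model=\'{}\''.format(
                self.name), cnxn)
    return 1 == len(df.query('generated == \'{}\''.format(for_run)))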
Example #3
            # HACK: for some reason Latitude's type is object, so fix that
            df['latitude'] = df['latitude'].astype(float)
            df = df.set_index(index)
            stations.append(df)
    # avoid shadowing the builtin all()
    all_stations = pandas.concat(stations)
    # all_stations['lon'] = all_stations['lon'].apply(lambda x: x if x <= 180 else x - 360)
    all_stations = all_stations.reset_index().set_index(index).sort_index()
    return all_stations


if __name__ == '__main__':
    # get data for each year and save it
    #~ for year in [2019]:
    cnxn = None
    try:
        cnxn = db.open_local_db()
        hindcasts = pandas.read_sql(
            'SELECT * FROM HINDCAST.DAT_Model WHERE model=\'{}\''.format(
                MODEL_NAME), cnxn)
        for year in range(1948, datetime.datetime.now().year):
            #for year in [1948]:
            ## data for current year
            if 1 != len(hindcasts.query('year == {}'.format(year))):
                df = get_year(year)
                retry = True
                # HACK: keeps failing on insert timing out so try until it doesn't
                while retry:
                    try:
                        db.insert_weather('HINDCAST', 'DAT_Hindcast', df, 'year')
                        retry = False
                    except Exception:
                        print('Insert timed out - retrying')
            else:
                print('Already have {} data for {}'.format(MODEL_NAME, year))
    finally:
        if cnxn:
            cnxn.close()
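The bare "retry until it works" hack above can spin forever on a permanent failure. A bounded-retry sketch with a fixed delay between attempts; insert_with_retry, max_attempts, and delay are hypothetical names, not part of the original code:

import time

def insert_with_retry(schema, table, df, key, max_attempts=5, delay=30):
    """Hypothetical helper: retry a timing-out insert a bounded number of times."""
    for attempt in range(1, max_attempts + 1):
        try:
            db.insert_weather(schema, table, df, key)
            return
        except Exception as e:
            if attempt == max_attempts:
                raise
            print('Insert attempt {} failed ({}) - retrying in {}s'.format(
                attempt, e, delay))
            time.sleep(delay)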
Example #4
def load_specific_records(self, for_run, force=False):
    """!
    Load records for a specific period
    @param self Pointer to self
    @param for_run Which run of the model to use
    @param force Whether or not to force loading if records already exist
    @return Timestamp of the run that records were loaded for
    """
    # save into the database that corresponds to the start of this run
    # if at any point for_run is already in the database then we're done
    # want to put the data into the database for the start date but check
    # if it exists based on for_run
    actual_dates = list(
        map(lambda hour: for_run + datetime.timedelta(hours=hour),
            self.hours))
    if not force:
        logging.debug(
            'Checking if data is already present for {} model generated at {}'
            .format(self.name, for_run))
        # check that we have all the timesteps
        cnxn = None
        try:
            cnxn = db.open_local_db()
            have_dates = pd.read_sql(
                """SELECT DISTINCT(fortime)
                   FROM INPUTS.DAT_Forecast f
                   LEFT JOIN INPUTS.DAT_LocationModel loc ON f.locationmodelid=loc.locationmodelid
                   LEFT JOIN INPUTS.DAT_Model m ON m.modelgeneratedid=loc.modelgeneratedid
                   WHERE model='{}'
                   AND generated='{}'""".format(
                    self.name, for_run), cnxn).sort_values(['fortime'])
        finally:
            if cnxn:
                cnxn.close()
        need_dates = [
            x for x in actual_dates
            if not have_dates['fortime'].eq(x).any()
        ]
        if len(need_dates) == 0:
            logging.debug('Data already loaded - aborting')
            return pd.Timestamp(for_run)
        actual_dates = need_dates
    date = for_run.strftime(r'%Y%m%d')
    time = int(for_run.strftime(r'%H'))
    save_dir = common.ensure_dir(
        os.path.join(self.DIR_DATA, '{}{:02d}'.format(date, time)))
    pool = Pool(5)
    for hour in self.hours:
        logging.info(
            "Downloading {} records from {} run for hour {}".format(
                self.name, for_run, hour))
        for_date = for_run + datetime.timedelta(hours=hour)
        diff = for_date - for_run
        real_hour = int((diff.days * 24) + (diff.seconds / 60 / 60))
        save_as = '{}_{}{:02d}_{}_{:03d}'.format(self.name, date, time,
                                                 "{}", real_hour)
        for_what = ['TMP', 'UGRD', 'VGRD', 'RH']
        if 0 != real_hour:
            for_what = for_what + ['APCP']
        for_what = list(map(lambda x: self.indices[x], for_what))
        n = len(for_what)
        cur_args = list(
            zip([self.host] * n, [self.dir] * n, [self.mask] * n,
                [save_dir] * n, [date] * n, [time] * n, [real_hour] * n,
                [save_as] * n, for_what))
        # NOTE: not as fast as pooling everything, but if the hour fails
        # we don't have a bunch of extra requests
        pool.map(do_save, cur_args)
    # release the download workers before starting the save pool
    pool.close()
    pool.join()
    n = len(actual_dates)
    # more than the number of cpus doesn't seem to help
    pool = Pool(min(n, os.cpu_count()))
    results = list(
        pool.map(
            save_wx,
            zip([save_dir] * n, [self.name] * n, [for_run] * n,
                actual_dates)))
    pool.close()
    pool.join()
    # for d in actual_dates:
    #     save_wx([save_dir, self.name, for_run, d])
    # return the run that we ended up loading data for
    # HACK: Timestamp format is nicer than datetime's
    return pd.Timestamp(for_run)
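Explicit close()/join() pairs work, but multiprocessing.Pool is also a context manager, which guarantees teardown even if map() raises. A sketch of just the save step in that form, reusing the names from the method above:

from multiprocessing import Pool
import os

n = len(actual_dates)
# the with-block calls pool.terminate() on exit; map() blocks until every
# result is back, so no work is cut short
with Pool(min(n, os.cpu_count())) as pool:
    results = list(
        pool.map(
            save_wx,
            zip([save_dir] * n, [self.name] * n, [for_run] * n,
                actual_dates)))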
Example #5
def load_file(for_run, force=False):
    """!
    Load data for a given run into the database
    @param for_run datetime to load data for
    @param force Whether or not to load even if the data already exists
    @return Timestamp for run data that was loaded
    """
    if not force:
        logging.debug(
            'Checking if data is already present for long range matches generated at {}'
            .format(for_run))
        exists = check_exists(for_run)
        if exists:
            logging.debug('Data already loaded - aborting')
            return pandas.Timestamp(for_run)
    filename = get_file_name(for_run)
    df = pandas.read_csv(filename)
    # throw out everything except years and months
    df = df[df.columns[:13]]
    # change months into numbers
    df.columns = ['year'] + list(range(1, 13))
    df = pandas.melt(df,
                     id_vars=['year'],
                     var_name='month',
                     value_name='value')
    # remove NaN rows (NaN != NaN, so this keeps only real values)
    df = df.query('value == value')
    # take an explicit copy so later assignments don't trigger
    # SettingWithCopyWarning (df.is_copy was removed in pandas 1.0)
    df = df.copy()

    # convert to ratio while checking for % and other possible inputs
    def fix_ratio(x):
        if isinstance(x, str):
            # strip a trailing '%' if present, then parse the number
            x = int(x.rstrip('%'))
        return x if x <= 1 else x / 100.0

    #~ fix_ratio(df.values[0][2])
    df['value'] = df['value'].apply(fix_ratio)
    df['generated'] = pandas.to_datetime(for_run, utc=True)
    #~ df['generated'] = df['generated'].astype('datetime64[s]')
    df = df.set_index(['generated', 'year', 'month'])
    schema = 'HINDCAST'
    final_table = 'DAT_HistoricMatch'

    def do_insert_only(cnxn, table, data):
        """Insert and assume success because no duplicate keys should exist"""
        # rely on deleting from FK table to remove everything from this table, so just insert
        stmt_insert = db.make_insert_statement(table,
                                               data.reset_index().columns)
        db.trans_insert_data(cnxn, data, stmt_insert)

    cnxn = None
    try:
        cnxn = db.open_local_db()
        cur_df = df
        cur_df = db.write_foreign(cnxn, schema, 'DAT_Historic', ['generated'],
                                  db.trans_save_data, cur_df)
        logging.debug('Writing data to {}'.format(final_table))
        do_insert_only(cnxn, '{}.{}'.format(schema, final_table), cur_df)
        cnxn.commit()
    finally:
        # guard so a failed open_local_db() doesn't raise here
        if cnxn:
            cnxn.close()
    return pandas.Timestamp(for_run)
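For reference, here is a self-contained run of the same melt-and-normalize steps on toy data (two month columns instead of twelve, for brevity); the year-plus-month-columns layout is the assumption load_file makes about the CSV:

import pandas

# toy frame mimicking the expected CSV layout: a year column then month columns
df = pandas.DataFrame({'year': [1950], 'Jan': ['75%'], 'Feb': [0.5]})
df.columns = ['year'] + list(range(1, len(df.columns)))
df = pandas.melt(df, id_vars=['year'], var_name='month', value_name='value')

def fix_ratio(x):
    if isinstance(x, str):
        x = int(x.rstrip('%'))
    return x if x <= 1 else x / 100.0

df['value'] = df['value'].apply(fix_ratio)
print(df)
#    year  month  value
# 0  1950      1   0.75
# 1  1950      2   0.50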