def combine_files_setup(self, outformat='pkl.gz'):
        """Combine multiple files (for a given site) into a single file data set.  Will also use PVLib get_clearsky
        method and fill in Clearsky GHI pvlib column.

        Returns
        -------
        None
        """
        time_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute']
        for id, file_set in self.files_df.groupby(self.files_df['id']):
            files = [os.path.join(self.path_to_read_dir, f) for f in file_set.index]
            header = pd.read_csv(files[0], nrows=2)  # read header to get time zone, latitude, longitude, elevation
            # 'Etc/GMT' zones use an inverted sign convention (UTC-7 is 'Etc/GMT+7'), so flip the header's negative sign
            tz = 'Etc/GMT' + header['Time Zone'][0].replace('-', '+')
            df = pd.concat([pd.read_csv(f, skiprows=2) for f in files])
            df.index = pd.to_datetime(df[time_cols])
            df.index = df.index.tz_localize(tz)
            df = df.drop(time_cols, axis=1)
            latitude = float(header['Latitude'][0])
            longitude = float(header['Longitude'][0])
            elevation = float(header['Elevation'][0])
            # add 'Is clear NSRDB' status column and 'Clearsky GHI pvlib' column,
            # then scale 'Clearsky GHI pvlib' to match measured GHI during periods NSRDB labels clear
            detection = cs_detection.ClearskyDetection(df, copy=False, set_ghi_status=True)
            detection.set_nsrdb_sky_status(label='Is clear NSRDB')
            detection.generate_pvlib_clearsky(latitude, longitude, elevation, tz=tz)
            detection.scale_model('GHI', 'Clearsky GHI pvlib', 'Is clear NSRDB')
            df = detection.df
            out_base = os.path.join(self.path_to_write_dir, str(int(id)))
            if outformat in ('pkl', 'pkl.gz'):
                pd.to_pickle(df, out_base + '.' + outformat)
            elif outformat == 'csv':
                df.to_csv(out_base + '.csv')
        print('Files successfully written to {}'.format(self.path_to_write_dir))
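
# Illustrative sketch (separate from the method above) of the pvlib clear-sky step the
# docstring refers to: build a pvlib Location and request clear-sky GHI on the data's own
# index. The coordinates, altitude, time zone, and model choice below are placeholder
# assumptions, not values from this repository.
import pandas as pd
from pvlib.location import Location

times = pd.date_range('2015-07-01', periods=24 * 60, freq='1min', tz='Etc/GMT+7')
site = Location(35.05, -106.54, tz='Etc/GMT+7', altitude=1600)
clearsky = site.get_clearsky(times, model='ineichen')  # DataFrame with 'ghi', 'dni', 'dhi'
ghi_cs = clearsky['ghi']  # the kind of series stored as 'Clearsky GHI pvlib'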
Example #2
len(nsrdb.df)

# # Investigate input data

# ## ABQ

# In[43]:

nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

# In[44]:

train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates('01-01-2013', '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)

# In[45]:

clf = ensemble.RandomForestClassifier(random_state=42)

# In[46]:

feature_cols = [
    'tfn', 'abs_ideal_ratio_diff grad', 'abs_ideal_ratio_diff grad mean',
    'abs_ideal_ratio_diff grad std', 'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
]

pred = test.iter_predict_daily(feature_cols,
                               'GHI',
                               'Clearsky GHI pvlib',
                               clf,
                               3,
                               by_day=True,
                               multiproc=True)
pred = pred.astype(bool)

# In[127]:

vis = visualize.Visualizer()

# In[128]:

srrl_tmp = cs_detection.ClearskyDetection(nsrdb_srrl.df)
srrl_tmp.intersection(ground.df.index)
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 0) & (pred)]['GHI'],
                   'ML clear only')
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 1) & (~pred)]['GHI'],
                   'NSRDB clear only')
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 1) & (pred)]['GHI'],
                   'ML+NSRDB clear')
# vis.add_line_ser(test.df['abs_ideal_ratio_diff'] * 100)

# In[129]:

vis.show()

feature_cols = [
    'GHI Clearsky GHI pvlib gradient second ratio min',
    'GHI Clearsky GHI pvlib gradient second ratio max',
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib line length ratio gradient',
    'GHI Clearsky GHI pvlib line length ratio gradient second',
]

target_cols = ['sky_status']


# # Train/test on NSRDB data to find optimal parameters

# In[66]:


train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates('01-01-2010', '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)


# In[67]:


train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')


# In[68]:


utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
# In[5]:


ground.df.index[0], ground.df.index[-1]


# In[6]:


nsrdb.df.index[0], nsrdb.df.index[-1]


# In[7]:


ground2 = cs_detection.ClearskyDetection(ground.df, 'GHI', 'Clearsky GHI pvlib', solar_noon_col='abs(t-tnoon)')


# In[8]:


ground2.trim_dates('01-01-2002', '01-01-2015')
ground2.df = ground2.df[ground2.df.index.minute % 30 == 0]  # keep only on-the-hour and half-hour samples (30-minute cadence)


# In[9]:


nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status', solar_noon_col='abs(t-tnoon)')

Example #6
def split_df_by_date(obj, start, mid, end):
    """Split a ClearskyDetection object into train (start to mid) and test (mid to end) sets."""
    train = cs_detection.ClearskyDetection(obj.df)
    train.trim_dates(start, mid)
    test = cs_detection.ClearskyDetection(obj.df)
    test.trim_dates(mid, end)
    return train, test
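
# Usage sketch (illustrative; assumes the `nsrdb` object loaded earlier in this notebook):
# reproduce the pre-2015 / 2015-onward split that the cells below build by hand.
train, test = split_df_by_date(nsrdb, None, '01-01-2015', None)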

# In[3]:


len(nsrdb.df)


# # Train/test on NSRDB data to find optimal parameters

# ## Default classifier

# In[4]:


train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)


# In[5]:


train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')


# In[6]:


clf = ensemble.RandomForestClassifier(random_state=42)
import pygal


# # Train on default data

# In[2]:


detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')


# In[3]:


train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)


# In[4]:


clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)


# In[5]:


clf = train_obj.fit_model(clf)
Example #9

# In[8]:


clf = nsrdb.fit_model(feature_cols, target_cols, clf)


# Training against the clear-sky model included in NSRDB is quite accurate.  I don't want to rely on that clear-sky curve, though, since it isn't available for ground-based measurements.
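
# A sketch of the alternative this points toward (assuming the `utils` module imported in the
# other examples): compute the window features against the pvlib clear-sky curve, which can be
# generated for ground-based timestamps, instead of NSRDB's own clear-sky column.
utils.calc_all_window_metrics(nsrdb.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)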

# ### Visualize

# In[9]:


train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)


# In[10]:


clf.fit(train.df[feature_cols], train.df[target_cols].values.ravel())


# In[11]:


pred = clf.predict(test.df[feature_cols]).flatten()

# In[3]:


len(nsrdb.df)


# # Train/test on NSRDB data to find optimal parameters

# ## Default classifier

# In[4]:


train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('11-01-2015', '01-07-2016')


# In[5]:


train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')


# In[6]:


clf = ensemble.RandomForestClassifier(random_state=42)
Example #11
target_cols = ['sky_status']

# # Align date ranges

# In[6]:

ground.df.index[0], ground.df.index[-1]

# In[7]:

nsrdb.df.index[0], nsrdb.df.index[-1]

# In[8]:

ground2 = cs_detection.ClearskyDetection(ground.df)

# In[9]:

ground2.trim_dates('01-01-2008', '01-01-2012')
ground2.df = ground2.df[ground2.df.index.minute % 30 == 0]

# In[10]:

nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df)

# In[11]:

nsrdb2.trim_dates('01-01-2008', '01-01-2012')
nsrdb2.df = nsrdb2.df[nsrdb2.df.index.minute % 30 == 0]
Example #12
import matplotlib
import pv_clf
import numpy as np

get_ipython().magic('matplotlib notebook')

get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')


# In[507]:


nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)


# In[508]:


# stack timestamps, measured GHI, and pvlib clear-sky GHI as the columns of X
X = np.asarray([train.df.index.values, train.df['GHI'].values, train.df['Clearsky GHI pvlib'].values]).T


# In[509]:


X.shape
nsrdb = cs_detection.ClearskyDetection.read_pickle('ornl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('EST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

# In[3]:

len(nsrdb.df)

# # Train/test on NSRDB data to find optimal parameters

# ## Default classifier

# In[4]:

train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates('01-01-2010', '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)

# In[5]:

train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

# In[6]:

clf = ensemble.RandomForestClassifier(random_state=42)

# In[7]:

utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
Example #14
nsrdb.to_pickle('srrl_nsrdb_cloudy.pkl', overwrite=True)


# In[14]:


ground.to_pickle('srrl_ground_cloudy.pkl', overwrite=True)


# # Science

# In[16]:


ground_small = cs_detection.ClearskyDetection(ground.df)


# In[17]:


ground_small.trim_dates('07-01-2006', '07-08-2006')


# In[18]:


vis = Visualizer()
vis.add_line_ser(ground_small.df['GHI'], 'GHI')
vis.add_line_ser(ground_small.df['Clearsky GHI pvlib'], 'GHIcs')
vis.add_line_ser(ground_small.df['Total Cloud Cover [%]'], 'TCC')
np.set_printoptions(precision=4)
get_ipython().magic('matplotlib inline')

get_ipython().magic("config InlineBackend.figure_format = 'retina'")

matplotlib.rcParams.update({'font.size': 16})

import warnings
warnings.filterwarnings(action='ignore')


plt.close('all')

# # Train on default data

# nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')

train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)

clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)

clf = train_obj.fit_model(clf)

pred = test_obj.predict(clf)

print(metrics.accuracy_score(test_obj.df['sky_status'], pred))
print(metrics.recall_score(test_obj.df['sky_status'], pred))

cm = metrics.confusion_matrix(test_obj.df['sky_status'], pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))

fig, ax = plt.subplots(figsize=(12, 8))

_ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
_ = ax.set_xticks(range(len(clf.feature_importances_)))
_ = ax.set_xticklabels(test_obj.features_, rotation=45)

_ = ax.set_ylabel('Importance')
_ = ax.set_xlabel('Feature')

_ = fig.tight_layout()

fig, ax = plt.subplots(figsize=(12, 8))

nsrdb_mask = test_obj.df['sky_status'].values