def setUp(self):
    """Instantiate every UK-constituent calendar in turn (smoke test).

    Each constructor is exercised once; ``self.holidays`` ends up bound to
    the combined ``holidays.UK()`` calendar used by the tests.
    """
    for calendar_cls in (holidays.England, holidays.Wales, holidays.Scotland,
                         holidays.IsleOfMan, holidays.NorthernIreland,
                         holidays.UK):
        self.holidays = calendar_cls()
def date_range(start_date, end_date, weekdays=None, exclude_holidays=True):
    """Generate a list of all dates within the given period.

    Parameters
    ----------
    start_date : datetime.date
        Starting date of the period (inclusive).
    end_date : datetime.date
        Ending date of the period (inclusive).
    weekdays : list, optional
        If specified, constrain to these days of the week only,
        e.g., ['Tuesday', 'Friday'].
    exclude_holidays : bool, optional
        If True (default), drop dates that are UK public holidays.

    Returns
    -------
    rng : list
        List of dates in the format of datetime.date.
    """
    # Fix: the holiday calendar used to be rebuilt by holidays.UK() on every
    # loop iteration; build it once, and only when it is actually needed.
    uk_holidays = holidays.UK() if exclude_holidays else None
    day_names = list(calendar.day_name)  # hoisted loop invariant
    rng = []
    d = start_date
    while d <= end_date:
        if weekdays is None or day_names[d.weekday()] in weekdays:
            if not exclude_holidays or d not in uk_holidays:
                rng.append(d)
        d += timedelta(days=1)
    return rng
def get_dates(df, date_column):
    """Converts a given date to various formats and returns an updated DataFrame.

    Args:
        df: Pandas DataFrame.
        date_column: Name of the datetime column to derive the new columns from.

    Returns:
        Original DataFrame with additional date columns.
    """
    dates = df[date_column]
    df['day'] = dates.dt.strftime("%d").astype(int)            # day of month
    df['month'] = dates.dt.strftime("%m").astype(int)          # month of year
    df['year'] = dates.dt.strftime("%Y").astype(int)           # four-digit year
    df['year_month'] = dates.dt.strftime("%Y%m").astype(int)   # year plus month
    df['week_number'] = dates.dt.strftime("%U").astype(int)    # week number (Sunday-first)
    df['day_number'] = dates.dt.strftime("%j").astype(int)     # day of year
    df['day_name'] = dates.dt.strftime("%A")                   # day name, i.e. Sunday
    df['month_name'] = dates.dt.strftime("%B")                 # month name, i.e. January
    # NOTE(review): pattern is year-DAY-month, matching the original example
    # "2020-30-01" -- confirm this unusual order is really what downstream expects.
    df['mysql_date'] = dates.dt.strftime("%Y-%d-%m")
    df['quarter'] = dates.dt.quarter.astype(int)               # calendar quarter 1-4
    df['week_day_number'] = dates.dt.strftime("%w").astype(int)  # 0 = Sunday, 1 = Monday
    # Bug fix: these two columns previously read the hard-coded 'date' column
    # instead of the date_column argument.
    df['is_weekend'] = ((pd.DatetimeIndex(dates).dayofweek) // 5 == 1).astype(int)
    # Bug fix: holidays calendars are populated lazily per year; without an
    # explicit years argument the mapping is empty and isin() never matches.
    uk_holidays = holidays.UK(years=dates.dt.year.unique())
    df['is_uk_holiday'] = np.where(dates.isin(uk_holidays), 1, 0).astype(int)
    return df
def contract_expiry(contract_date):
    """Return the expiry date: two working days before contract_date,
    using the 2010-2019 UK public-holiday calendar."""
    uk_calendar = holidays.UK(
        state=None,
        years=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])
    holiday_dates = [day for day, _name in sorted(uk_calendar.items())]
    return workday(contract_date, -2, holiday_dates)
def make_uk_holidays(start=1991, end=2040) -> pd.DataFrame:
    """Return a DataFrame of all UK holidays (including Easter Sunday).

    The holidays package has no Easter Sunday entry, so it is derived as
    the day before each Easter Monday. Columns are 'ds' (datetime) and
    'holiday' (name), matching the Prophet holiday-frame convention.
    """
    calendar = holidays.UK(years=list(range(start, end)))
    rows = list(calendar.items())
    rows.extend(
        (day - timedelta(days=1), 'Easter Sunday')
        for day, name in calendar.items()
        if 'Easter Monday' in name
    )
    frame = pd.DataFrame(rows, columns=['ds', 'holiday'])
    frame['ds'] = pd.to_datetime(frame['ds'])
    return frame
def add_holiday(x):
    """Left-merge an 'h0' holiday indicator onto x by (site, timestamp).

    For each of the 16 sites, every hourly timestamp in the fixed range
    2015-12-31..2019-01-01 is flagged 1 if it falls on a public holiday in
    the site's country (looked up via the module-level `locate` mapping).
    """
    time_range = pd.date_range(start='2015-12-31', end='2019-01-01', freq='h')
    country_holidays = {'UK': holidays.UK(),
                        'US': holidays.US(),
                        'IRL': holidays.Ireland(),
                        'CAN': holidays.Canada()}
    holiday_mapping = pd.DataFrame()
    for site in range(16):
        holiday_mapping_i = pd.DataFrame({'site': site, 'timestamp': time_range})
        cal = country_holidays[locate[site]['country']]
        # Renamed the lambda parameter: it used to shadow the outer `x`.
        holiday_mapping_i['h0'] = holiday_mapping_i['timestamp'].apply(
            lambda ts: ts in cal).astype(int)
        holiday_mapping = pd.concat([holiday_mapping, holiday_mapping_i], axis=0)
    # Bug fix: pd.merge takes the two frames as separate positional arguments;
    # passing them as a single list raises a TypeError at runtime.
    x = pd.merge(x, holiday_mapping, on=['site', 'timestamp'], how='left')
    return x
def holiday_adjust(trade_date, delta):
    """Roll trade_date + delta forward past holidays (AU/US/UK/JP) and weekends.

    Returns the first date at or after trade_date + delta that is neither a
    public holiday in any of the four calendars nor a Saturday/Sunday.
    """
    forward_date = trade_date + delta
    # shifted date lands on a holiday: step one day ahead and re-check
    if (forward_date in hol.Australia() or forward_date in hol.US()
            or forward_date in hol.UK() or forward_date in hol.Japan()):
        # Bug fix: the recursive result was previously discarded, so a holiday
        # followed by another holiday or a weekend was not skipped.
        return holiday_adjust(forward_date + dt.timedelta(days=1), dt.timedelta())
    # shifted date lands on a weekend (Mon=0 .. Sun=6)
    elif forward_date.weekday() >= 5:
        return holiday_adjust(forward_date + dt.timedelta(days=1), dt.timedelta())
    return forward_date
def test_all_holidays_present(self):
    """Every expected 2015 UK holiday name appears in the calendar."""
    uk_2015 = holidays.UK(years=[2015])
    expected_names = (
        "New Year's Day",
        "Good Friday",
        "Easter Monday [England/Wales/Northern Ireland]",
        "May Day",
        "Spring Bank Holiday",
        "Late Summer Bank Holiday [England/Wales/Northern Ireland]",
        "Christmas Day",
        "Boxing Day",
        "St. Patrick's Day [Northern Ireland]",
    )
    for name in expected_names:
        self.assertIn(name, uk_2015.values())
def send_daily_notifications(self): """ Send daily summary mail notification. The opposite of notify_user() :param self: :return: """ # test whether today is a working day (a "business day" or "bday"), and if not then bail out; # we don't want to bother people with emails at the weekend or on statutory holidays today = date.today() holiday_calendar = holidays.UK() # the test is in two parts: first we check for a holiday, then for a conventional working day # (in future perhaps allow individual users to choose their own working-day pattern). # Annoyingly, numpy.is_busday() won't accept objects generated by the holidays module # as a holiday calendar (it wants an array-like of datetime) # is today a UK holiday? if today in holiday_calendar: return # is today a working day? if not is_busday(today, holidays=None): return # search through all active users and dispatch notifications # we treat students and faculty slightly differently so we have different dispatchers for them # find all students students = db.session.query(User).filter( User.active == True, User.roles.any(Role.name == 'student')).all() student_tasks = group( dispatch_student_notifications.si(r.id) for r in students if r is not None) # find all faculty faculty = db.session.query(User).filter( User.active == True, User.roles.any(Role.name == 'faculty')).all() faculty_tasks = group( dispatch_faculty_notifications.si(r.id) for r in faculty if r is not None) task = group(student_tasks, faculty_tasks) raise self.replace(task)
def find_good_epics():
    """Return the epic ids whose bid/offer spread is acceptable (>= -1.51).

    Epics quoted in exotic currencies (MXN/SEK/NOK/CNH) are skipped outright.
    Per-country holiday flags are computed for logging parity with the
    market-hours variant of this routine, although this version accepts any
    epic regardless of market hours (the original set b_TRADE_OK
    unconditionally, making its "not tradeable" branch dead code -- removed).

    Relies on module globals: YEAR_var, main_epic_ids, REAL_OR_NO_REAL,
    authenticated_headers.
    """
    def _today_tag(tz_name, prefix):
        # Today's date in the market's local timezone, tagged with its prefix.
        today = datetime.datetime.now(tz=pytz.timezone(tz_name)).strftime('%Y-%m-%d')
        print(tz_name + " :- Today's Date is ..." + today)
        return prefix + "_" + today

    ger_today = _today_tag('Europe/Berlin', 'GER')
    gb_today = _today_tag('Europe/London', 'GB')
    us_today = _today_tag('America/New_York', 'US')
    aus_today = _today_tag('Australia/Sydney', 'AUS')
    jp_today = _today_tag('Asia/Tokyo', 'JP')

    # Set of "PREFIX_YYYY-MM-DD" tags for every public holiday this year;
    # membership tests replace the original five copy-pasted scan loops.
    hol_tags = set()
    for country_cls, prefix in ((holidays.DE, 'GER'), (holidays.UK, 'GB'),
                                (holidays.US, 'US'), (holidays.AU, 'AUS'),
                                (holidays.JP, 'JP')):
        for day in country_cls(years=YEAR_var):
            hol_tags.add(prefix + '_' + str(day))
    # Retained for logging/extension parity; not used by the spread filter below.
    b_ger_hol = ger_today in hol_tags
    b_uk_hol = gb_today in hol_tags
    b_us_hol = us_today in hol_tags
    b_aus_hol = aus_today in hol_tags
    b_jp_hol = jp_today in hol_tags

    pick_from_epics = []
    i_count = 0
    for epic_id in main_epic_ids:
        base_url = REAL_OR_NO_REAL + '/markets/' + epic_id
        auth_r = requests.get(base_url, headers=authenticated_headers)
        d = json.loads(auth_r.text)
        try:
            i_count += 1
            if any(cur in epic_id for cur in ('MXN', 'SEK', 'NOK', 'CNH')):
                time.sleep(1)  # rate-limit pause, then skip exotic-currency epics
                continue
            current_bid = d['snapshot']['bid']
            ask_price = d['snapshot']['offer']
            spread = float(current_bid) - float(ask_price)
            if spread >= -1.51:
                pick_from_epics.append(epic_id)
                print(
                    "!!DEBUG!!...FOUND GOOD EPIC {} spread {}...{}/{}".
                    format(epic_id, spread, i_count, len(main_epic_ids)))
                time.sleep(1)
            else:
                print(
                    "!!DEBUG!!...skipping, NO GOOD EPIC {} spread {} ....Checking next epic spreads...{}/{}"
                    .format(epic_id, spread, i_count, len(main_epic_ids)))
                time.sleep(1)
        except Exception as e:
            # Best-effort: log and continue with the next epic (e.g. missing
            # 'snapshot' keys in the API response).
            print(e)
    return pick_from_epics
def featureCreation(feed, window, h, grouper, dataDir, apiDic, r_id=None, longestfeed=False):
    """Build a MIMO feature matrix from a readings feed plus local weather.

    Combines calendar features (slot-of-day counter, hour, weekday, month,
    UK-holiday-aware working-day flag), PCA-reduced weather factors and the
    lagged feed into a single feature array via mlf.ts_to_mimo /
    mlf.weather_to_mimo.

    Args:
        feed: time-indexed readings (assumes a DatetimeIndex -- .hour/.minute
            are read from it); converted to a DataFrame.
        window: look-back window length passed to ts_to_mimo.
        h: forecast horizon passed to ts_to_mimo.
        grouper: pandas frequency string (e.g. '15T'); also used as the name
            of the slot-counter column.
        dataDir: directory containing the WEATHER_DATA csv files.
        apiDic: DataFrame with 'id' and 'lat_long' columns, used to locate
            the weather file for this feed.
        r_id: feed id to look up in apiDic.
        longestfeed: unused in this function.

    Returns:
        (features, response) numpy arrays for model training.
    """
    import pandas as pd
    from bdateutil import isbday
    import holidays
    from sklearn.decomposition import PCA
    feed = pd.DataFrame(feed)
    # lat/long string keys the weather csv filename; [1] picks a fixed row label
    r_lat_long = apiDic.loc[(apiDic['id'] == int(r_id)), 'lat_long'][1]
    # Quarter of hour: number each intra-day time slot 0..N-1 at `grouper` freq
    counter = 0
    array = []
    for i in pd.date_range('00:00', '23:45', freq=grouper):
        feed.loc[(feed.index.hour == i.hour) & (feed.index.minute == i.minute), grouper] = counter
        array.append(feed.loc[feed[grouper] == counter].values)
        counter += 1
    # Hour of day
    feed['hourofday'] = feed.index.hour
    # Day of week
    feed['dayofweek'] = feed.index.dayofweek
    # Month
    feed['month'] = feed.index.month
    # Working day: business day w.r.t. the 2013-2017 UK holiday calendar
    f = np.vectorize(lambda x: isbday(
        x, holidays=holidays.UK(years=[2013, 2014, 2015, 2016, 2017])))
    feed['isworkingday'] = f(feed.index.date)
    # Weather data (NOTE(review): DataFrame.from_csv is an older pandas API)
    weather = pd.DataFrame.from_csv(
        os.path.join(dataDir, 'WEATHER_DATA', '%s.csv' % r_lat_long.replace(" ", "")))
    # Converting text date into datetime
    weather['cleandate'] = weather['utcdate'].apply(lambda x: evalDate(x))
    weather.index = weather['cleandate']
    # Deleting irrelevant columns
    if 'date' in weather.columns:
        del weather['date']
    if 'date.1' in weather.columns:
        del weather['date.1']
    if 'utcdate' in weather.columns:
        del weather['utcdate']
    if 'Unnamed: 0' in weather.columns:
        del weather['Unnamed: 0']
    # Dropping duplicates, then aligning to a regular grid at `grouper` freq
    weather = weather.drop_duplicates(subset='cleandate')
    weather = weather.reindex(
        pd.date_range(weather['cleandate'].min(), weather['cleandate'].max(),
                      freq=grouper))  #, method='backfill')
    weather = weather.loc[:, ('conds', 'dewptm', 'fog', 'hail', 'hum',
                              'precipm', 'pressurem', 'rain', 'snow', 'tempm',
                              'thunder', 'wdire', 'wgustm', 'windchillm',
                              'wspdm')]
    weather.loc[:, 'conds'] = weather.loc[:, 'conds'].fillna('Unknown')
    weather.loc[:, 'wdire'] = weather.loc[:, 'wdire'].fillna('Variable')
    # NOTE(review): le and le2 are the SAME LabelEncoder object (chained
    # assignment), so le2.fit overwrites le's state. It happens to work only
    # because each transform runs before the next fit -- confirm two separate
    # encoders were not intended.
    le = le2 = preprocessing.LabelEncoder()
    le.fit(weather['conds'])
    weather['conds'] = le.transform(weather['conds'])
    le2.fit(weather['wdire'])
    weather['wdire'] = le2.transform(weather['wdire'])
    # -9999/-999 are sensor sentinels; convert to NaN before interpolation
    weather.replace([-9999.0, -999.0], [np.nan, np.nan], inplace=True)
    weather.loc[:, ('precipm', 'wgustm')] = weather.loc[:, ('precipm', 'wgustm')].fillna(0)
    weather.windchillm = weather.windchillm.fillna(weather.tempm)
    weather = weather.interpolate()
    # Clip feed and weather to their overlapping time span
    # (NOTE(review): .ix is a deprecated pandas indexer)
    if (weather.index.min() < feed.index.min()):
        if (weather.index.max() < feed.index.max()):
            weather = weather.ix[feed.index.min():, :]
            feed = feed.ix[:weather.index.max(), :]
        else:
            weather = weather.ix[feed.index.min():feed.index.max(), :]
    else:
        if (weather.index.max() < feed.index.max()):
            feed = feed.ix[weather.index.min():weather.index.max(), :]
        else:
            feed = feed.ix[weather.index.min():, :]
            weather = weather.ix[:feed.index.max(), :]
    # Lagged feed -> (features, response) supervised pairs
    features, response = mlf.ts_to_mimo(feed.ix[:, 0], window, h)
    # Grow the PCA factor count until 99% of weather variance is explained
    n_factors = 1
    pca = PCA(n_components=n_factors)
    pca.fit(weather)
    while (pca.explained_variance_ratio_.sum() < 0.99):
        n_factors += 1
        pca = PCA(n_components=n_factors)
        pca.fit(weather)
    reduced = pd.DataFrame(pca.transform(weather))
    # One h-wide block of lagged values per retained weather factor
    c = np.zeros((features.shape[0], (h * len(reduced.columns))))
    for column in range(len(reduced.columns)):
        c[:, (column * h):((1 + column) * h)] = mlf.weather_to_mimo(
            reduced.ix[:, column], window, h)
    # Final matrix: calendar columns (offset by window+h-1 to align with the
    # supervised pairs) + weather factors + lagged feed
    features = np.concatenate((feed.ix[(window + h - 1):,
                                       ('isworkingday', grouper, 'hourofday',
                                        'dayofweek', 'month')], c, features),
                              axis=1)
    print('Features created')
    return (features, response)
settings["country_last"] = country_last settings.flush() return country_last country_holidays = { "CA": holidays.CA(), "CO": holidays.CO(), "MX": holidays.MX(), "US": holidays.US(), "NZ": holidays.NZ(), "AU": holidays.AU(), "DE": holidays.DE(), "AT": holidays.AT(), "DK": holidays.DK(), "UK": holidays.UK(), "IE": holidays.IE(), "ES": holidays.ES(), "CZ": holidays.CZ(), "SK": holidays.SK(), "PL": holidays.PL(), "PT": holidays.PT(), "NL": holidays.NL(), "NO": holidays.NO(), "IT": holidays.IT(), "SE": holidays.SE(), "JP": holidays.JP(), "BE": holidays.BE(), "ZA": holidays.ZA(), "SI": holidays.SI(), "FI": holidays.FI(),
def find_good_epics():
    """Return the epic ids that are currently tradeable with a good spread.

    An epic qualifies when its home market (matched by currency code in the
    epic id) is within 08:00-16:00 local trading hours, today is not a public
    holiday in that country, and the bid/offer spread is >= -1. Epics quoted
    in MXN/SEK/NOK/CNH are skipped outright. The London check first waits out
    the 22:00-22:59 "tally up" hour.

    Refactor: the five copy-pasted market-hours blocks (one per timezone,
    each wrapped in a while True/break lattice) are collapsed into a single
    data-driven table plus the _is_tradeable helper; unused locals removed.

    Relies on module globals: YEAR_var, main_epic_ids, REAL_OR_NO_REAL,
    authenticated_headers, is_between.
    """
    def _today_tag(tz_name, prefix):
        # Today's date in the market's local timezone, tagged with its prefix.
        today = datetime.datetime.now(tz=pytz.timezone(tz_name)).strftime('%Y-%m-%d')
        print(tz_name + " :- Today's Date is ..." + today)
        return prefix + "_" + today

    ger_today = _today_tag('Europe/Berlin', 'GER')
    gb_today = _today_tag('Europe/London', 'GB')
    us_today = _today_tag('America/New_York', 'US')
    aus_today = _today_tag('Australia/Sydney', 'AUS')
    jp_today = _today_tag('Asia/Tokyo', 'JP')

    # Set of "PREFIX_YYYY-MM-DD" tags for every public holiday this year.
    hol_tags = set()
    for country_cls, prefix in ((holidays.DE, 'GER'), (holidays.UK, 'GB'),
                                (holidays.US, 'US'), (holidays.AU, 'AUS'),
                                (holidays.JP, 'JP')):
        for day in country_cls(years=YEAR_var):
            hol_tags.add(prefix + '_' + str(day))

    # (market timezone, currency code, "is today a holiday there"), in the
    # same order the original checked: Frankfurt, London, NY, Sydney, Tokyo.
    markets = (
        ('Europe/Berlin', 'EUR', ger_today in hol_tags),
        ('Europe/London', 'GBP', gb_today in hol_tags),
        ('America/New_York', 'USD', us_today in hol_tags),
        ('Australia/Sydney', 'AUD', aus_today in hol_tags),
        ('Asia/Tokyo', 'JPY', jp_today in hol_tags),
    )

    def _is_tradeable(epic_id):
        # True when the epic's home market is open (08:00-16:00 local) and
        # today is not a holiday in that country.
        for tz_name, currency, is_holiday in markets:
            tz = pytz.timezone(tz_name)
            now_time = datetime.datetime.now(tz=tz).strftime('%H:%M')
            if tz_name == 'Europe/London':
                # Sleep through the 22:00-22:59 tally-up hour before checking.
                while is_between(str(now_time), ("22:00", "22:59")):
                    time.sleep(1)
                    print("!!DEBUG!! Tally Up hour:" + str(now_time))
                    now_time = datetime.datetime.now(tz=tz).strftime('%H:%M')
            if is_between(str(now_time), ("08:00", "16:00")):
                time.sleep(1)  # pause per open market, as the original did
                if currency in epic_id and not is_holiday:
                    return True
        return False

    pick_from_epics = []
    i_count = 0
    for epic_id in main_epic_ids:
        base_url = REAL_OR_NO_REAL + '/markets/' + epic_id
        auth_r = requests.get(base_url, headers=authenticated_headers)
        d = json.loads(auth_r.text)
        try:
            i_count += 1
            if any(cur in epic_id for cur in ('MXN', 'SEK', 'NOK', 'CNH')):
                time.sleep(1)  # rate-limit pause, then skip exotic currencies
                continue
            if _is_tradeable(epic_id):
                current_bid = d['snapshot']['bid']
                ask_price = d['snapshot']['offer']
                spread = float(current_bid) - float(ask_price)
                if spread >= -1:
                    pick_from_epics.append(epic_id)
                    print("!!DEBUG!!...FOUND GOOD EPIC..." + str(i_count) +
                          "/" + str(len(main_epic_ids)))
                    time.sleep(1)
                else:
                    print(
                        "!!DEBUG!!...skipping, NO GOOD EPIC....Checking next epic spreads..."
                        + str(i_count) + "/" + str(len(main_epic_ids)))
                    time.sleep(1)
        except Exception as e:
            # Best-effort: log and continue with the next epic.
            print(e)
    return pick_from_epics
def get_data():
    """Handle the classify form POST: derive date features and score with four
    models (glm, forest, tree, nn), then render the results page."""
    data = request.form
    data_type = data['select']
    date = data['datetime']
    time = int(data['time'])
    temp = data['temp']
    humidity = data['humidity']
    dewpoint = data['dewpoint']

    dt = datetime.strptime(date, '%Y-%m-%d')
    month_value = dt.month

    # Weekday: Monday is 0 and Sunday is 6; Sat/Sun count as non-working (0)
    week_day = 0 if dt.weekday() in (5, 6) else 1

    # Base-hour flag: "true" for the off-peak band outside 05:00-21:00
    Base_hour_Flag = "false" if 4 < time < 22 else "true"

    # Holiday flag (renamed from the misleading `us_holidays`: this is the
    # UK calendar, populated for the request's year)
    uk_holidays = holidays.UK(years=dt.year)
    Holiday = 1 if dt in uk_holidays else 0

    # All four models receive identical inputs; only the algorithm differs,
    # so build the shared keyword set once instead of repeating it four times.
    common = dict(Base_hour_Flag=Base_hour_Flag, Holiday=Holiday,
                  week_day=week_day, temp=temp, humidity=humidity,
                  data_type=data_type, dewpoint=dewpoint,
                  month_value=month_value)
    resultGlm = json.loads(process(algo="glm", **common))
    resultForest = json.loads(process(algo="forest", **common))
    resultTree = json.loads(process(algo="tree", **common))
    resultNN = json.loads(process(algo="nn", **common))

    return render_template(
        '/classify.html',
        humidity=humidity,
        date=date,
        hour=time,
        temp=temp,
        dewPoint=dewpoint,
        labelGlm=resultGlm['Results']['output1'][0]['Scored Labels'],
        labelForest=resultForest['Results']['output1'][0]['Scored Labels'],
        labelNN=resultNN['Results']['output1'][0]['Scored Labels'],
        labelTree=resultTree['Results']['output1'][0]['Scored Labels'])
from pycm import *
import seaborn as sns
from imblearn.pipeline import make_pipeline
from imblearn import under_sampling
from imblearn import over_sampling
from imblearn import combine
from imblearn.over_sampling import SMOTE
import matplotlib.cm as cm
from matplotlib.colors import Normalize
import matplotlib as mpl

# Silence pandas SettingWithCopyWarning and all Python warnings globally.
# (pd, warnings, holidays, np, os, sys and the sklearn scaler classes are
# presumably imported earlier in this file -- not visible in this chunk.)
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

# Holiday calendars kept as module-level singletons.
us_holidays = holidays.UnitedStates()
ca_holidays = holidays.CA()
uk_holidays = holidays.UK()

# Fixed seed for reproducible numpy randomness.
random_state = 123456
np.random.seed(random_state)

# Make the current working directory importable (prepended and appended).
cwd = str(os.getcwd())
sys.path.append(cwd)
sys.path.insert(0, cwd)

# Candidate sklearn scaler classes, exposed as a name -> class mapping.
scaler_fs = [
    MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler,
    QuantileTransformer, PowerTransformer
]
scalers = dict(zip([scaler.__name__ for scaler in scaler_fs], scaler_fs))
def validate(cls, args, data, rules=[], **kwargs):
    """
    This function performs data validation and saves invalid data records to
    a table. It works as a sort of filter for the calling script, by not
    allowing invalid data to be inserted into the database. It receives a set
    of data, performs validations on each record and on the whole set, and
    returns a tuple with the valid and invalid data to the calling script.
    First, the function will perform some generic data validation rules and
    then it will perform the data validation rules supplied as an argument.

    NOTE(review): `rules=[]` is a mutable default argument; it is only
    iterated here, never mutated, so it is safe in practice -- but consider
    `rules=None`.

    Args:
        - data (string): JSON string containing the data records to validate.
          It should be of the form:
          '{"data":[
             {"column1":"value1", "column2":"value2", "column3":"value3" ...},
             {"column1":"value1", "column2":"value2", "column3":"value3" ...},
           ]}'
        - rules (list): list of functions, where each function takes a JSON
          string containing the data records to validate, performs a
          validation rule and returns a tuple, where the first element is the
          collection of valid records, and the second element is the
          collection of invalid records. These records should be returned as
          lists of dict with the data; and in the case of invalid records,
          the data will be a JSON string, and the dict will be augmented with
          the reason, the rule, the script, and the time of validation.
          For example, the collection of invalid records might look like:
          [{"data":'{"column1":"value1"...}', "rule":rule_value,
            "reason":reason_value, "script":script_value,
            "date_of_validation":date_value},
           ...]
          The collection of valid records will be simpler, just a list of
          dict, e.g. [{"column1":value1, "column2":value2}, ...]
        - kwargs:
            - 'google_key_path': the path to the service account key.
            - 'script': name of the calling script (read below; required).

    Returns:
        tuple: (valid_list, invalid_list) where valid_list is a list of dict
        with the valid records, and each element of invalid_list is a tuple
        (record_dict, additional_info) with additional_info carrying rule,
        reason, script, date_of_validation and last_update_date.
    """
    def rule_1(row, tc, ntc, uk, de, script):
        # Invalid when every non-timestamp column is 0/None, unless the row's
        # date is a UK/DE holiday (or the check below fires) and the caller
        # is a "price" script.
        for c in ntc:
            if row[c] is not None and row[c] != 0:
                return "valid"
        # All values were 0
        # Check if they are holiday or weekend
        for c in tc:
            try:
                parsed = datetime.strptime(row[c], '%Y-%m-%d %H:%M:%S')
            except Exception as e:
                continue
            parsed_date = str(parsed.date())
            # NOTE(review): `not parsed.weekday()` is True only for MONDAY
            # (weekday()==0); a weekend test would be parsed.weekday() >= 5.
            # Confirm the intended condition -- the comment above says
            # "holiday or weekend".
            if parsed_date in uk or parsed_date in de or not parsed.weekday():
                if "price" in script:
                    return "valid"
                else:
                    rule_list.add("rule_1")
                    return "invalid"
        # Not weekend or holiday
        rule_list.add("rule_1")
        return "invalid"

    def rule_4(row, nsc):
        # Invalid when all non-standard columns are 0/None.
        for c in nsc:
            if row[c] != 0 and row[c] is not None:
                return "valid"
        rule_list.add("rule_4")
        return "invalid"

    data_decoded = jsonpickle.decode(data)
    valid_data = []
    invalid_data = []
    # Shared mutable set: the rule_* closures record which rules fired.
    rule_list = set()
    reasons = {
        "rule_1": "Rule 1: all columns (except timestamp) are null or 0.",
        "rule_2": "Rule 2: the 4000 previously-validated records are identical.",
        "rule_3": "Rule 3: the same column had value of 0 or NULL for the last 5 rows.",
        "rule_4": "Rule 4: standard columns like Constituent name, id, date have values but all others are 0."
    }
    # First, perform generic data validation rules
    # Rule 1: check if all columns (except timestamp) are null or 0
    # Get timestamp and non-timestamp columns
    script = kwargs["script"]
    uk_holidays = holidays.UK()
    de_holidays = holidays.Germany()
    df = pd.DataFrame(data_decoded["data"])
    standard_columns = ['constituent_name', "constituent_id", "date", "last_update_date"]
    original_columns = df.columns
    timestamp_columns = [c for c in df.columns if ("date" in c or "time" in c)]
    non_timestamp_columns = [c for c in df.columns if ("date" not in c and "time" not in c)]
    non_standard_columns = [c for c in df.columns if c not in standard_columns + timestamp_columns]
    df["rule_1"] = df.apply(lambda x: rule_1(x, timestamp_columns, non_timestamp_columns, uk_holidays, de_holidays, script), axis=1)
    # Rule 2: Are the 4000 previously-validated records identical?
    invalid_indices = set()
    if df.shape[0] >= 4000:
        # We can apply rule: slide a 4000-row window over the frame and flag
        # full-row duplicates (ignoring timestamp columns).
        start = 0
        end = 3999
        while end < df.shape[0]:
            df_temp = df.loc[start:end]
            df_duplicates = df_temp[df_temp.duplicated(subset=non_timestamp_columns, keep=False)]
            invalid_indices.update(list(df_duplicates.index))
            start += 1
            end += 1
    df["rule_2"] = "valid"
    df.loc[list(invalid_indices), "rule_2"] = "invalid"
    if len(invalid_indices) > 0:
        rule_list.add("rule_2")
    # Rule 3: Has the same column had value of 0 or NULL for the last 5 rows?
    invalid_indices_2 = set()
    start = 0
    end = 4
    while end < df.shape[0]:
        for c in non_timestamp_columns:
            series = df.loc[start:end][c]
            # series.any() == False means the whole 5-row window is 0/NaN
            if series.any() == False:
                invalid_indices_2.update(list(series.index))
        start += 1
        end += 1
    df["rule_3"] = "valid"
    df.loc[list(invalid_indices_2), "rule_3"] = "invalid"
    if len(invalid_indices_2) > 0:
        rule_list.add("rule_3")
    # Rule 4: If standard columns like Constituent name, id, date have values
    # but all others are 0, reject?
    df["rule_4"] = df.apply(lambda x: rule_4(x, non_standard_columns), axis=1)
    # NOTE(review): this stores the `object` type itself as a placeholder
    # value in every row; it is overwritten per-row below. If the intent was
    # to declare the column dtype, astype(object) would be the usual way.
    df["rule"] = object
    # Get invalid records: a row is invalid if any rule marked it invalid;
    # collect the list of failed rules per row.
    invalid_indices = set()
    for i in range(0, df.shape[0]):
        row_rules = []
        row = df.iloc[i]
        for r in ["rule_1", "rule_2", "rule_3", "rule_4"]:
            if row[r] == "invalid":
                invalid_indices.add(i)
                row_rules.append(r)
        df.at[i, 'rule'] = row_rules
    # Get valid indices
    valid_indices = set(df.index.tolist())
    valid_indices = valid_indices.difference(invalid_indices)
    valid_data += df.loc[list(valid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')
    invalid_data += df.loc[list(invalid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')
    # Custom rules: each caller-supplied rule filters the surviving valid set.
    custom_invalid = []
    for func in rules:
        valid, invalid = func(jsonpickle.encode({"data": valid_data}))
        # print(invalid)
        # Check if valid
        valid_data = valid
        custom_invalid += invalid
    # Format invalid data
    invalid_data_store = []
    # Add rule and reason (rule_1/rule_4 failures are dropped, not stored)
    for item in invalid_data:
        if "rule_1" in item["rule"] or "rule_4" in item["rule"]:
            continue
        additional_info = {}
        additional_info["rule"] = item["rule"]
        # NOTE(review): the isinstance guard is evaluated per element of the
        # comprehension; it was probably meant as an outer type check.
        additional_info["reason"] = [reasons[r] for r in item["rule"] if isinstance(item["rule"], list)]
        additional_info["script"] = script
        additional_info["date_of_validation"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        additional_info["last_update_date"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        invalid_data_store.append((item, additional_info))
    invalid_data_store += custom_invalid
    return list(valid_data), invalid_data_store