def __init__(self, database, user, password, host, port, table, tsid_list,
             period=None, verbose=False):
    """
    Store the run settings, load the vendor weights and the ids of the
    vendors to ignore, then start the cross validation by calling
    self.main().

    :param database: String of the database name
    :param user: String of the username used to login to the database
    :param password: String of the password used to login to the database
    :param host: String of the database address (localhost, url, ip, etc.)
    :param port: Integer of the database port number (5432)
    :param table: String of the database table that should be worked on
    :param tsid_list: List of strings, with each string being a tsid
    :param period: Optional integer indicating the number of days whose
        values should be cross validated. If None is provided, then the
        entire set of values will be validated.
    :param verbose: Boolean of whether to print debugging statements or not
    """

    self.database, self.user, self.password = database, user, password
    self.host, self.port = host, port
    self.table, self.tsid_list = table, tsid_list
    self.period, self.verbose = period, verbose

    # Connection settings shared by every query helper called below
    db_login = dict(database=self.database, user=self.user,
                    password=self.password, host=self.host, port=self.port)

    # DataFrame holding each source id together with its weight
    self.source_weights_df = query_source_weights(**db_login)

    # Names of the data vendors whose values must be left out of the
    # cross validation, and the vendor id looked up for each of them.
    self.source_exclude_list = ['pySecMaster_Consensus']
    self.source_id_exclude_list = [
        query_data_vendor_id(name=vendor_name, **db_login)
        for vendor_name in self.source_exclude_list
    ]

    if self.verbose:
        tsid_count = len(tsid_list)
        if not self.period:
            print("Running cross validator for %s tsids for the entire "
                  "data history." % (tsid_count,))
        else:
            print("Running cross validator for %s tsids only for the prior "
                  "%i day's history." % (tsid_count, self.period))

    # The constructor itself kicks off the run
    self.main()
def validator(self, tsid):
    """
    Calculate the weighted consensus prices for one tsid and save them to
    the price table named by self.table.

    All stored prices for the tsid are loaded, and for every date and every
    price field the consensus weights of the sources reporting the same
    value are summed. The value with the largest weight total becomes the
    consensus value; -1 is used when no source supplied a usable value.
    The consensus rows are tagged with the data vendor id of
    'pySecMaster_Consensus', source 'tsid', the tsid as source_id and the
    current timestamp as updated_date.

    If consensus rows already exist for this tsid they are deleted first
    (up to five attempts); the new rows are only inserted after a
    successful delete.

    NOTE(review): another definition of validator appears later in this
    file. If both live in the same scope the later one takes effect;
    confirm which of the two is intended to be kept.

    :param tsid: String of the tsid whose prices should be cross validated
    :raises NotImplementedError: If self.table is neither 'daily_prices'
        nor 'minute_prices'
    :return: None
    """

    tsid_start = time.time()

    # DataFrame of all stored prices for this ticker and interval. This is
    # a multi-index DataFrame, with date and data_vendor_id in the index.
    tsid_prices_df = query_all_tsid_prices(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, table=self.table, tsid=tsid)

    unique_sources = tsid_prices_df.index.\
        get_level_values('data_vendor_id').unique()
    unique_dates = tsid_prices_df.index.get_level_values('date').unique()

    # If a period is provided, limit the unique_dates list to only those
    # within the past n period days.
    beg_date = None
    if self.period:
        beg_date = datetime.today() - timedelta(days=self.period)
        unique_dates = unique_dates[unique_dates > beg_date]

    # The consensus_price_df contains the prices from weighted consensus
    if self.table == 'daily_prices':
        consensus_price_df = pd.DataFrame(
            columns=['date', 'open', 'high', 'low', 'close', 'volume',
                     'ex_dividend', 'split_ratio'])
    elif self.table == 'minute_prices':
        consensus_price_df = pd.DataFrame(
            columns=['date', 'open', 'high', 'low', 'close', 'volume'])
    else:
        raise NotImplementedError('Table %s is not implemented within '
                                  'CrossValidate.validator' % self.table)

    # Set the date as the index
    consensus_price_df.set_index(['date'], inplace=True)

    # A vendor's weight does not change while this method runs, so each
    # vendor is looked up in the weights DataFrame only once. None marks a
    # vendor for which no weight row was found.
    weights_df = self.source_weights_df
    weight_by_vendor = {}

    # Cycle through each period, comparing each data source's prices
    for date in unique_dates:
        try:
            # DataFrame for the current period, with the source_ids as
            # the index and the data columns as the column headers
            period_df = tsid_prices_df.xs(date, level='date')
        except KeyError:
            # Should never happen. Skip the date: there is no period_df
            # to work on (previously the code carried on regardless and
            # either crashed or reused the prior date's prices).
            print('Unable to extract the %s period\'s prices from '
                  'the tsid_prices_df for %s' % (date, tsid))
            continue

        # Transpose the period_df DataFrame so the source_ids are
        # columns and the price fields are the rows
        period_df = period_df.transpose()

        # Cycle through each price field for this period's values
        for field_index, field_data in period_df.iterrows():
            # Maps each distinct reported value to the summed weight of
            # the sources reporting it; reset for every field.
            field_consensus = {}

            for vendor_id, value in field_data.items():
                # Excluded vendors never take part in the consensus
                if vendor_id in self.source_id_exclude_list:
                    continue
                # Only process the source value if it is not None
                if value is None:
                    continue

                if vendor_id not in weight_by_vendor:
                    matches = weights_df.loc[
                        weights_df['data_vendor_id'] == vendor_id,
                        'consensus_weight']
                    weight_by_vendor[vendor_id] = (
                        matches.iloc[0] if len(matches) else None)

                weight = weight_by_vendor[vendor_id]
                if weight is None:
                    # No source weight was found for this vendor
                    continue

                field_consensus[value] = \
                    field_consensus.get(value, 0) + weight

            # Use the value with the largest weight sum for this field
            if field_consensus:
                consensus_value = max(field_consensus.items(),
                                      key=operator.itemgetter(1))[0]
            else:
                # None of the sources had any values, thus use -1
                consensus_value = -1

            # .loc replaces the removed .ix indexer; the row for this
            # date is created on first assignment.
            consensus_price_df.loc[date, field_index] = consensus_value

    # Make the date index into a normal column
    consensus_price_df.reset_index(inplace=True)
    # Convert the datetime object to an ISO date
    consensus_price_df['date'] = consensus_price_df['date'].\
        apply(lambda x: x.isoformat())

    # Add the vendor id of the pySecMaster_Consensus as a column
    validator_id = query_data_vendor_id(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, name='pySecMaster_Consensus')

    consensus_price_df.insert(0, 'data_vendor_id', validator_id)
    consensus_price_df.insert(1, 'source', 'tsid')
    consensus_price_df.insert(2, 'source_id', tsid)

    # Add the current date to the last column
    consensus_price_df.insert(len(consensus_price_df.columns),
                              'updated_date', datetime.now().isoformat())

    ready_to_insert = True
    if validator_id in unique_sources:
        # Data from the cross validation process has already been saved
        # to the database before, thus it must be removed before adding
        # the new calculated values. The source='tsid' filter limits the
        # delete to the kind of rows this method inserts.
        # NOTE: the query is assembled as a string because the delete
        # helper is handed a finished query.
        delete_query = ("DELETE FROM %s WHERE source_id='%s' "
                        "AND source='tsid' AND data_vendor_id='%s'"
                        % (self.table, tsid, validator_id))
        if self.period:
            # Only delete prior consensus values for this tsid that are
            # newer than the beg_date (current date - replace period).
            delete_query += " AND date>'%s'" % beg_date.isoformat()

        # Insert only AFTER the old rows were deleted successfully
        ready_to_insert = False
        for _ in range(5):
            delete_status = delete_sql_table_rows(
                database=self.database, user=self.user,
                password=self.password, host=self.host, port=self.port,
                query=delete_query, table=self.table, item=tsid)
            if delete_status == 'success':
                ready_to_insert = True
                break

    if ready_to_insert:
        # Add the validated values to the relevant price table
        df_to_sql(database=self.database, user=self.user,
                  password=self.password, host=self.host, port=self.port,
                  df=consensus_price_df, sql_table=self.table,
                  exists='append', item=tsid)
    else:
        print('Unable to delete the prior consensus values for %s from '
              '%s; the new consensus values were not saved.'
              % (tsid, self.table))

    # For period updates, slow down the process to allow postgre to catch up
    if self.period:
        time.sleep(1.5)

    if self.verbose:
        print('%s data cross-validation took %0.2f seconds to complete.' %
              (tsid, time.time() - tsid_start))
def validator(self, tsid):
    """
    Calculate the weighted consensus prices for one tsid and save them to
    the price table named by self.table.

    All stored prices for the tsid are loaded, and for every date and every
    price field the consensus weights of the sources reporting the same
    value are summed. The value with the largest weight total becomes the
    consensus value; -1 is used when no source supplied a usable value.
    The consensus rows are tagged with the data vendor id of
    'pySecMaster_Consensus', source 'tsid', the tsid as source_id and the
    current timestamp as updated_date.

    If consensus rows already exist for this tsid they are deleted first
    (up to five attempts); the new rows are only inserted after a
    successful delete.

    :param tsid: String of the tsid whose prices should be cross validated
    :raises NotImplementedError: If self.table is neither 'daily_prices'
        nor 'minute_prices'
    :return: None
    """

    tsid_start = time.time()

    # DataFrame of all stored prices for this ticker and interval. This is
    # a multi-index DataFrame, with date and data_vendor_id in the index.
    tsid_prices_df = query_all_tsid_prices(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, table=self.table, tsid=tsid)

    unique_sources = tsid_prices_df.index.\
        get_level_values('data_vendor_id').unique()
    unique_dates = tsid_prices_df.index.get_level_values('date').unique()

    # If a period is provided, limit the unique_dates list to only those
    # within the past n period days.
    beg_date = None
    if self.period:
        beg_date = datetime.today() - timedelta(days=self.period)
        unique_dates = unique_dates[unique_dates > beg_date]

    # The consensus_price_df contains the prices from weighted consensus
    if self.table == 'daily_prices':
        consensus_price_df = pd.DataFrame(columns=[
            'date', 'open', 'high', 'low', 'close', 'volume', 'dividend',
            'split'
        ])
    elif self.table == 'minute_prices':
        consensus_price_df = pd.DataFrame(
            columns=['date', 'open', 'high', 'low', 'close', 'volume'])
    else:
        raise NotImplementedError('Table %s is not implemented within '
                                  'CrossValidate.validator' % self.table)

    # Set the date as the index
    consensus_price_df.set_index(['date'], inplace=True)

    # A vendor's weight does not change while this method runs, so each
    # vendor is looked up in the weights DataFrame only once. None marks a
    # vendor for which no weight row was found.
    weights_df = self.source_weights_df
    weight_by_vendor = {}

    # Cycle through each period, comparing each data source's prices
    for date in unique_dates:
        try:
            # DataFrame for the current period, with the source_ids as
            # the index and the data columns as the column headers
            period_df = tsid_prices_df.xs(date, level='date')
        except KeyError:
            # Should never happen. Skip the date: there is no period_df
            # to work on (previously the code carried on regardless and
            # either crashed or reused the prior date's prices).
            print('Unable to extract the %s period\'s prices from '
                  'the tsid_prices_df for %s' % (date, tsid))
            continue

        # Transpose the period_df DataFrame so the source_ids are
        # columns and the price fields are the rows
        period_df = period_df.transpose()

        # Cycle through each price field for this period's values
        for field_index, field_data in period_df.iterrows():
            # Maps each distinct reported value to the summed weight of
            # the sources reporting it; reset for every field.
            field_consensus = {}

            for vendor_id, value in field_data.items():
                # Excluded vendors never take part in the consensus
                if vendor_id in self.source_id_exclude_list:
                    continue
                # Only process the source value if it is not None
                if value is None:
                    continue

                if vendor_id not in weight_by_vendor:
                    matches = weights_df.loc[
                        weights_df['data_vendor_id'] == vendor_id,
                        'consensus_weight']
                    weight_by_vendor[vendor_id] = (
                        matches.iloc[0] if len(matches) else None)

                weight = weight_by_vendor[vendor_id]
                if weight is None:
                    # No source weight was found for this vendor
                    continue

                field_consensus[value] = \
                    field_consensus.get(value, 0) + weight

            # Use the value with the largest weight sum for this field
            if field_consensus:
                consensus_value = max(field_consensus.items(),
                                      key=operator.itemgetter(1))[0]
            else:
                # None of the sources had any values, thus use -1
                consensus_value = -1

            # .loc replaces the removed .ix indexer; the row for this
            # date is created on first assignment.
            consensus_price_df.loc[date, field_index] = consensus_value

    # Make the date index into a normal column
    consensus_price_df.reset_index(inplace=True)
    # Convert the datetime object to an ISO date
    consensus_price_df['date'] = consensus_price_df['date'].\
        apply(lambda x: x.isoformat())

    # Add the vendor id of the pySecMaster_Consensus as a column
    validator_id = query_data_vendor_id(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, name='pySecMaster_Consensus')

    consensus_price_df.insert(0, 'data_vendor_id', validator_id)
    consensus_price_df.insert(1, 'source', 'tsid')
    consensus_price_df.insert(2, 'source_id', tsid)

    # Add the current date to the last column
    consensus_price_df.insert(len(consensus_price_df.columns),
                              'updated_date', datetime.now().isoformat())

    ready_to_insert = True
    if validator_id in unique_sources:
        # Data from the cross validation process has already been saved
        # to the database before, thus it must be removed before adding
        # the new calculated values.
        # NOTE: the query is assembled as a string because the delete
        # helper is handed a finished query.
        delete_query = ("DELETE FROM %s WHERE source_id='%s' "
                        "AND source='tsid' AND data_vendor_id='%s'"
                        % (self.table, tsid, validator_id))
        if self.period:
            # Only delete prior consensus values for this tsid that are
            # newer than the beg_date (current date - replace period).
            delete_query += " AND date>'%s'" % beg_date.isoformat()

        # Insert only AFTER the old rows were deleted successfully
        ready_to_insert = False
        for _ in range(5):
            delete_status = delete_sql_table_rows(
                database=self.database, user=self.user,
                password=self.password, host=self.host, port=self.port,
                query=delete_query, table=self.table, item=tsid)
            if delete_status == 'success':
                ready_to_insert = True
                break

    if ready_to_insert:
        # Add the validated values to the relevant price table
        df_to_sql(database=self.database, user=self.user,
                  password=self.password, host=self.host, port=self.port,
                  df=consensus_price_df, sql_table=self.table,
                  exists='append', item=tsid)
    else:
        print('Unable to delete the prior consensus values for %s from '
              '%s; the new consensus values were not saved.'
              % (tsid, self.table))

    # For period updates, slow down the process to allow postgre to catch up
    if self.period:
        time.sleep(1.5)

    if self.verbose:
        print('%s data cross-validation took %0.2f seconds to complete.' %
              (tsid, time.time() - tsid_start))