def detect_and_parse_new_disk_files_async(self):
    Log.i('asynchronously detecting and parsing new disk files')
    event_handler = DirWatcher(self.handle_file_created)
    self.observer = Observer()
    self.observer.schedule(event_handler, self.dir_path, recursive=False)
    self.observer.start()
    return self.observer
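# A minimal sketch of the DirWatcher handler assumed by the method above. It is
# not part of the source; it only illustrates how a watchdog FileSystemEventHandler
# could forward newly created file paths to the supplied callback.
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer


class DirWatcher(FileSystemEventHandler):
    def __init__(self, file_created_handler):
        # callback that receives the path of each newly created file
        self.file_created_handler = file_created_handler

    def on_created(self, event):
        # ignore directory events, forward plain file creations
        if not event.is_directory:
            self.file_created_handler(event.src_path)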
def retrieve(db, url, datasource_id, exchange_id, currency_id):
    temp_dirpath = AppConfig.setting('TEMP_DIRPATH')
    filepath = os.path.join(temp_dirpath, url.split('/')[-1])
    downloadFile(url, filepath)
    duplicateCount = 0
    insertCount = 0
    with gzip.open(filepath, 'rt') as f:
        Log.d('Processing csv file..')
        reader = csv.reader(f, delimiter=',', quotechar='|')
        for row in reader:
            epochTime = int(row[0])
            price = float(row[1])
            amount = float(row[2])
            transaction = {
                'datasource_id': datasource_id,
                'exchange_id': exchange_id,
                'amount': amount,
                'price': price,
                'currency_id': currency_id,
                'epoch_time': epochTime,
            }
            try:
                db.create_transaction(transaction)
                insertCount += 1
            except DuplicateInsertException:
                duplicateCount += 1
    os.remove(filepath)
    Log.i('Done processing, insert count: {}, duplicate count: {}', insertCount, duplicateCount)
async def __process_subscriber(self, index, subscriber):
    fail_count = 0
    response_file_prefix = subscriber.handler_filename
    while True:
        try:
            Log.i('invoking subscriber {}', subscriber.handler_filename)
            async for response_text in subscriber.subscribe():
                response_text_md5hash = StringExpert.md5hash(response_text)
                try:
                    epoch = int(time.time())
                    filepath = os.path.join(
                        self.data_response_dirpath,
                        '{}.{}.{}'.format(response_file_prefix, epoch, FetchApp.RESPONSE_EXTENSION))
                    with open(filepath, 'w') as file:
                        file.write(response_text)
                except Exception as e:
                    Log.e('Failed to save response to file, message: {}', e)
                Log.d('stored api response for subscriber {} (hash {})',
                      subscriber.handler_filename, response_text_md5hash)
        except Exception as e:
            fail_count += 1
            Log.e('failed to invoke subscriber {} ({} failures so far)',
                  subscriber.handler_filename, fail_count)
            stacktrace = traceback.format_exc()
            Log.d('exception stack:\n{}', stacktrace)
        Log.i('retrying in {} seconds..', self.retry_delay_seconds)
        await asyncio.sleep(self.retry_delay_seconds)
def activateSubscribers(self):
    subscriber_count = len(self.subscribers)
    Log.i('activating {} subscriber(s)', subscriber_count)
    loop = asyncio.get_event_loop()
    futures = [self.__process_subscriber(i, s) for i, s in enumerate(self.subscribers)]
    tasks = asyncio.gather(*futures)
    loop.run_until_complete(tasks)
    loop.close()
    Log.i('done processing subscribers')
def feed_jobs_forever(self, job_changed_handler):
    assert job_changed_handler is not None
    sleep_seconds = self.sleep_seconds
    transaction_min_timestamp = self.transaction_min_timestamp
    start_transaction_min_timestamp = transaction_min_timestamp
    data_dirpath = self.data_dirpath
    start_time = time.time()
    Log.i('processing transactions, sleep interval {}s, starting from epoch {} ({})',
          sleep_seconds, transaction_min_timestamp,
          StringExpert.format_timestamp(transaction_min_timestamp))
    to_fetch_count = self.db.transaction_count(transaction_min_timestamp)
    Log.d('transaction count since {} ({}): {}',
          transaction_min_timestamp,
          StringExpert.format_timestamp(transaction_min_timestamp),
          to_fetch_count)
    pd.set_option('io.hdf.default_format', 'table')
    hdf5_filename = '{}_{}_{}.h5'.format(
        self.version.major,
        self.version.minor,
        datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S'))
    hdf5_filepath = path.join(data_dirpath, hdf5_filename)
    Log.i('hdf5 output filepath is: \n{}', hdf5_filepath)
    set_size = 1000
    fetch_count = 0
    is_realtime = False
    while True:
        try:
            next_transaction_min_timestamp = self.process_transaction_subset(
                transaction_min_timestamp, set_size, hdf5_filepath,
                job_changed_handler, is_realtime)
            if next_transaction_min_timestamp is None:
                Log.d('nothing to process, waiting..')
                is_realtime = True  # TODO: empty polling perhaps not the best indicator of switch to realtime
                time.sleep(sleep_seconds)
            else:
                assert next_transaction_min_timestamp > transaction_min_timestamp, \
                    'next minimum timestamp was not greater than the current timestamp'
                transaction_min_timestamp = next_transaction_min_timestamp
                fetch_count += set_size
                percentage = 100 * fetch_count / to_fetch_count
                current_time = time.time()
                Log.d('processed {}/{}, {}%, spent {} on the period {} ({}) to {} ({})',
                      fetch_count, to_fetch_count, int(percentage),
                      Timespan.from_seconds(int(current_time - start_time)).as_string(),
                      StringExpert.format_timestamp(start_transaction_min_timestamp),
                      start_transaction_min_timestamp,
                      StringExpert.format_timestamp(transaction_min_timestamp),
                      transaction_min_timestamp)
        except Exception as e:
            raise Exception('Failed to process transaction subset') from e
def watch_continuously(self, watch_interval_seconds):
    Log.i('continuous watching activated with interval of {} seconds', watch_interval_seconds)
    consecutive_error_count = 0
    while True:
        try:
            self.__verify_datafetch_apis_write_frequency()
            consecutive_error_count = 0
        except Exception as e:
            consecutive_error_count += 1
            Log.e('fail during watcher check ({} consecutive errors)', consecutive_error_count)
            stacktrace = OsExpert.stacktrace()
            Log.d('stacktrace:\n{}', stacktrace)
        time.sleep(watch_interval_seconds)
def create_predictor_from_csv(self):
    Log.i('initiating sagemaker model creation')
    role = AppConfig.setting('AWS_PREDICTOR_ROLE')
    bucket = 'cryptrade-sagemaker'
    custom_code_upload_location = 's3://{}/customcode/tensorflow_iris'.format(bucket)
    model_artifacts_location = 's3://{}/artifacts'.format(bucket)
    Log.d('training data will be uploaded to: {}', custom_code_upload_location)
    Log.d('training artifacts will be uploaded to: {}', model_artifacts_location)
    sess = sagemaker.Session()

    def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
        """From the SageMaker examples, e.g.:
        https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb"""
        file = Path(filepath)
        s3 = boto3.resource('s3')
        key = channel + '/' + file.name
        bucket_ref = s3.Bucket(bucket)
        objs = list(bucket_ref.objects.filter(Prefix=key))
        is_file_already_existing = len(objs) > 0 and objs[0].key == key
        if is_file_already_existing:
            if skip_if_name_and_size_matches:
                s3_client = boto3.client('s3')
                response = s3_client.head_object(Bucket=bucket, Key=key)
                local_size = file.stat().st_size
                remote_size = response['ContentLength']
                if remote_size == local_size:
                    Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}',
                          local_size / 1000, key)
                    return
            Log.w('overwriting existing s3 key: {}', key)
        with open(filepath, 'rb') as data:
            s3.Bucket(bucket).put_object(Key=key, Body=data)

    s3_data_folder = 'data'
    upload_to_s3(s3_data_folder, self.train_filepath, True)
    upload_to_s3(s3_data_folder, self.test_filepath, True)
    upload_to_s3(s3_data_folder, self.meta_filepath)
    estimator = TensorFlow(
        entry_point='aws_dnn_predictor_entry.py',
        role=role,
        output_path=model_artifacts_location,
        code_location=custom_code_upload_location,
        train_instance_count=1,
        train_instance_type='ml.c5.xlarge',
        training_steps=1000,
        evaluation_steps=100)
    train_data_location = 's3://{}/{}'.format(bucket, s3_data_folder)
    Log.i('fitting train data: {}', train_data_location)
    estimator.fit(train_data_location)
    Log.i('deploying model')
    deploy_start = datetime.now()
    predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
    deploy_end = datetime.now()
    Log.i('deployed predictor in {}s, endpoint is:\n{}', deploy_end - deploy_start, predictor.endpoint)
    self.predictor = predictor
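# Hedged usage sketch (assumed, not from the source): once the endpoint is
# deployed, a single feature row could be scored like this. The payload shape
# and response format depend on the serving signature defined in
# aws_dnn_predictor_entry.py, so predict_single and feature_row are
# illustrative names only.
def predict_single(self, feature_row):
    assert self.predictor is not None, 'create_predictor_from_csv must run first'
    result = self.predictor.predict([feature_row])  # invokes the deployed SageMaker endpoint
    Log.d('prediction result: {}', result)
    return result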
def process_nonparsed_api_responses_full(self, sleep_seconds=0):
    Log.i('initiating continuous parsing of api responses with subset sleep interval: {} seconds',
          sleep_seconds)
    try:
        min_id = -1
        next_min_id = 0
        while next_min_id > min_id:
            min_id = next_min_id
            next_min_id = self.process_nonparsed_api_responses_subset(next_min_id=min_id)
            time.sleep(sleep_seconds)
    except Exception as e:
        raise Exception('Failed to process nonparsed api responses') from e
    transaction_count = self.store.transaction_count()
    Log.d('no more api responses to parse, transaction count is now {}', transaction_count)
def __init__(self, version):
    super().__init__(__file__)
    self.window_size = 15
    self.interval_seconds = [15 * 60]  # 15 minutes
    self.contruct_time = time.time()
    self.version = version
    self.sleep_seconds = 1  # must be low enough to eventually produce an empty result set > realtime
    self.transaction_min_timestamp = int(AppConfig.setting('GENERATOR_TRANSACTION_MIN_TIMESTAMP'))
    self.data_dirpath = AppConfig.setting('GENERATOR_DATA_DIRPATH')
    Log.d('construct: {}', self.__dict__)
    self.db = DatabaseGateway()
    max_history_minutes = 10 * 24 * 60  # max(self.minute_intervals)
    self.from_currency_ids = []
    self.to_currency_ids = []
    self.run_config = self.read_run_config()
    self.jobs = list(self.__jobs_iterate(max_history_minutes, self.run_config))
    Log.i('count of generator jobs: {}', len(self.jobs))
def run(self):
    emailHeader = 'LogWatchPipeApp input trigger match'
    while True:
        sys.stdout.flush()
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break
        sys.stdout.write(line)
        for triggerLine in self.triggerLines:
            if triggerLine in line:
                self.matchCountSinceLastEmail += 1
                Log.i('Log watch triggered, will send email')
                msg = ('The following line matched a trigger:\n\n{}\n\n'
                       'Matches since last email attempt: {}\n\n'
                       'No more matches will be reported for {} minutes').format(
                    line, self.matchCountSinceLastEmail, self.maxEmailReccurenceMinutes)
                Thread(target=self.email_maybe, args=(emailHeader, msg)).start()
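# Hedged sketch (assumed, not from the source) of the rate-limited email helper
# referenced above: send only when the recurrence window has elapsed, otherwise
# stay silent until maxEmailReccurenceMinutes have passed. last_email_time and
# send_email are illustrative names.
def email_maybe(self, subject, message):
    now = time.time()
    window_seconds = self.maxEmailReccurenceMinutes * 60
    if self.last_email_time is not None and now - self.last_email_time < window_seconds:
        return  # still inside the mute window; the match counter keeps accumulating
    self.last_email_time = now
    self.matchCountSinceLastEmail = 0
    self.send_email(subject, message)  # hypothetical transport method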
def process(self, epoch, df):
    if df.empty:
        Log.d('skipping processing of empty dataset')
        return
    r_index = df.index.get_loc(epoch)
    if self.predictor is not None:
        row_frame = df[r_index:r_index + 1]
        return self.__predict(row_frame)
    not_enough_predictor_data = r_index + 1 < self.min_predict_generator_size
    if not_enough_predictor_data:
        return
    Log.d('initiating predictor construction at index {}, frame length {}', r_index, len(df))
    predictor = self.predictor_from_config_maybe()
    if predictor is not None:
        self.predictor = predictor
        Log.i('existing predictor endpoint loaded: {}', predictor.endpoint)
        return
    train_df = df[:r_index + 1]
    Log.i('at index {}, detected data of adequate length {}, writing csv: {}',
          r_index, len(train_df), self.csv_filepath)
    self.write_csv(train_df)
    return None
def __init__(self, h5_filepath, version):
    warnings.simplefilter('ignore', NaturalNameWarning)
    h5_inputfile = Path(h5_filepath)
    output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
    self.h5_out_filepath = os.path.join(output_dirpath, h5_inputfile.name)
    h5_out_file = Path(self.h5_out_filepath)
    if h5_out_file.exists():
        Log.i('overwrite file?: {}', h5_out_file)
        if not OsExpert.prompt_confirm('File already exists, overwrite? {}'.format(h5_out_file)):
            Log.d('user aborted, exiting')
            exit()
        Log.w('removing file: {}', h5_out_file)
        os.remove(self.h5_out_filepath)
    self.predictors_map = {}
    base_filepath = output_dirpath
    with pd.HDFStore(h5_filepath, mode='r') as h5:
        keys = h5.keys()
        Log.i('h5 input keys: {}', keys)
        assert len(keys) == 1, 'hardcoded restriction on single key was violated'
        for key in keys:
            Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
            self.predictors_map[key] = [
                EnsemblePredictor(min_predict_generator_size=2000, max_train_size=5000)
            ]
    self.h5_watcher = H5FileWatcher(h5_filepath, self.handle_job_epoch, {'is_simulated': 0})
def process_nonparsed_api_responses_subset(self, next_min_id=0):
    limit = 1000
    Log.i('processing nonparsed api responses, starting from id {} with limit {}', next_min_id, limit)
    total_row_count = 0
    parse_count = 0
    is_to_keep_fetching = True
    while is_to_keep_fetching:
        datasources_frame = self.store.datasources_frame()
        frame = self.store.unparsed_datafetch_api_responses_frame(min_id=next_min_id, limit=limit)
        row_count = frame.shape[0]
        if row_count == 0:
            is_to_keep_fetching = False
        else:
            total_row_count += row_count
            for i, row in frame.iterrows():
                try:
                    row_id = row['id']
                    datasource_id = row['datasource_id']
                    parser = self.find_parser(datasource_id, datasources_frame)
                    if ParseUtil.parse_and_persist_as_transaction_maybe(row, parser, self.store):
                        parse_count += 1
                except Exception as e:
                    raise Exception('Failed to parse row index {} with id {}'.format(i, row_id)) from e
            ids = frame['id']
            max_id = ids.max()
            Log.t('sweep of ids {}..{} returned {} entries', next_min_id, max_id, row_count)
            next_min_id = max_id + 1  # start from the next row
    Log.i('search for nonparsed responses done, parse count: {}/{}', parse_count, total_row_count)
    return next_min_id
def write_csv(self, df):
    if self.write_count > 0:
        Log.w('ignoring csv write because it has already been performed')
        return
    X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True)
    assert len(X_all) == len(y_all)
    if X_all.empty:
        Log.w('no rows to write!')
        return
    y_null_count = y_all.isnull().sum()
    assert y_null_count == 0, 'null count: {}'.format(y_null_count)
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=123)
    Log.d('X train shape: {}, X test shape: {}', X_train.shape, X_test.shape)
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)
    is_first_write = (self.write_count == 0)
    for frame, filepath in ((train, self.train_filepath), (test, self.test_filepath)):
        Log.d('writing csv: {}', filepath)
        frame.to_csv(filepath, sep=',', na_rep='', index=False,
                     header=is_first_write, decimal='.', mode='a')
    with open(self.meta_filepath, 'w') as f:
        f.write(json.dumps({
            'train_filename': Path(self.train_filepath).name,
            'test_filename': Path(self.test_filepath).name,
            'train_observation_count': len(X_train),
            'test_observation_count': len(X_test),
            'feature_count': X_all.shape[1]
        }, indent=4))
    self.write_count += 1
    Log.i('done writing csv file, write count is now: {}', self.write_count)
    if self.is_train_async:
        Log.d('propagating notification that csv has been written')
        self.csv_changed_event.set()
    else:
        self.create_predictor_from_csv()
db = DatabaseGateway()
currencies = db.currencies_frame()
datasources = db.datasources_frame()
for i, job in enumerate(jobs):
    url = job['url']
    Log.i('Processing job {}/{}'.format(i + 1, len(jobs)))
    start_time = datetime.datetime.now()
    datasource_id = db.datasource_id_by_name(job['datasource_name'])
    exchange_id = db.exchange_id_by_name(job['provider_name'])
    currency_id = db.currency_id_by_code(job['currency_code'])
    retrieve(db, url, datasource_id, exchange_id, currency_id)
    time_spent = datetime.datetime.now() - start_time
    Log.i('Done with job, time spent: {}', time_spent)
def run(self, alert_interval_seconds):
    Log.i('Check interval is: {} seconds', alert_interval_seconds)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        self.alert_continuously(alert_interval_seconds))
def initialize(filepath):
    AppConfig.__ensure_config_filepath_valid(filepath)
    AppConfig.Filepath = filepath
    startup_message = 'Configuration filepath: {}'.format(AppConfig.Filepath)
    Log.i(startup_message)
def handle_file_created(self, filepath):
    filename = os.path.basename(filepath)
    subscriber = self.parse_util.subscriber_by_filename(filename)
    is_parsed = self.parse_util.process_api_response_file(filepath, subscriber)
    Log.i('file {} was parsed: {}', filepath, is_parsed)
def __create_predictor(self, df):
    Log.i('creating predictor on {} rows', len(df))
    assert not df.empty
    kfold = StratifiedKFold(n_splits=10)
    random_state = 2
    classifiers = [
        SVC(random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                           random_state=random_state, learning_rate=0.1),
        RandomForestClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
        MLPClassifier(random_state=random_state),
        KNeighborsClassifier(),
        LogisticRegression(random_state=random_state),
        LinearDiscriminantAnalysis(),
    ]
    X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True, max_train_size=self.max_train_size)
    if X_all.empty:
        Log.w('could not create predictor as the preprocessing resulted in an empty dataframe')
        return
    X_train, X_test, Y_train, Y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=random_state)
    Log.d('train shape: X: {}, y: {}', X_train.shape, Y_train.shape)
    # cross-validate each baseline classifier
    cv_results = []
    for classifier in classifiers:
        Log.d('performing cross val score for predictor {}', classifier)
        start_time = datetime.now()
        cv_results.append(
            cross_val_score(classifier, X_train, y=Y_train, scoring='accuracy', cv=kfold, n_jobs=core_count))
        Log.d('..done, time spent: {}', datetime.now() - start_time)
    cv_means = [cv_result.mean() for cv_result in cv_results]
    cv_std = [cv_result.std() for cv_result in cv_results]
    cv_res = pd.DataFrame({
        'CrossValMeans': cv_means,
        'CrossValerrors': cv_std,
        'Algorithm': [
            'SVC', 'DecisionTree', 'AdaBoost', 'RandomForest', 'ExtraTrees',
            'GradientBoosting', 'MultipleLayerPerceptron', 'KNeighbors',
            'LogisticRegression', 'LinearDiscriminantAnalysis'
        ]})
    Log.d('cross val results:\n{}', cv_res)
    g = sns.barplot('CrossValMeans', 'Algorithm', data=cv_res, palette='Set3', orient='h', **{'xerr': cv_std})
    g.set_xlabel('Mean Accuracy')
    g = g.set_title('Cross validation scores')
    Log.i('saving plot..')
    plt.savefig('!eb1_cross_val_score.png', edgecolor='none', format='png')
    # grid search: AdaBoost with a decision tree base estimator
    DTC = DecisionTreeClassifier()
    adaDTC = AdaBoostClassifier(DTC, random_state=7)
    ada_param_grid = {
        'base_estimator__criterion': ['gini', 'entropy'],
        'base_estimator__splitter': ['best', 'random'],
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [1, 2],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}
    gsadaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold,
                            scoring='accuracy', n_jobs=core_count, verbose=1)
    gsadaDTC.fit(X_train, Y_train)
    ada_best = gsadaDTC.best_estimator_
    Log.d('gsadaDTC.best_score_: {}', gsadaDTC.best_score_)
    # grid search: extra trees
    ExtC = ExtraTreesClassifier()
    ex_param_grid = {
        'max_depth': [None],
        'max_features': [1, 3, 10],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [False],
        'n_estimators': [100, 300],
        'criterion': ['gini']}
    gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold,
                          scoring='accuracy', n_jobs=core_count, verbose=1)
    gsExtC.fit(X_train, Y_train)
    ExtC_best = gsExtC.best_estimator_
    Log.d('gsExtC.best_score_: {}', gsExtC.best_score_)
    # grid search: random forest
    RFC = RandomForestClassifier()
    rf_param_grid = {
        'max_depth': [None],
        'max_features': [1, 3, 10],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [False],
        'n_estimators': [100, 300],
        'criterion': ['gini']}
    gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold,
                         scoring='accuracy', n_jobs=core_count, verbose=1)
    gsRFC.fit(X_train, Y_train)
    RFC_best = gsRFC.best_estimator_
    Log.d('gsRFC.best_score_: {}', gsRFC.best_score_)
    # grid search: gradient boosting
    GBC = GradientBoostingClassifier()
    gb_param_grid = {
        'loss': ['deviance'],
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [4, 8],
        'min_samples_leaf': [100, 150],
        'max_features': [0.3, 0.1]}
    gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold,
                         scoring='accuracy', n_jobs=core_count, verbose=1)
    gsGBC.fit(X_train, Y_train)
    GBC_best = gsGBC.best_estimator_
    Log.d('gsGBC.best_score_: {}', gsGBC.best_score_)
    # grid search: SVC with probability outputs
    SVMC = SVC(probability=True)
    svc_param_grid = {
        'kernel': ['rbf'],
        'gamma': [0.001, 0.01, 0.1, 1],
        'C': [1, 10, 50, 100, 200, 300, 1000]}
    gsSVMC = GridSearchCV(SVMC, param_grid=svc_param_grid, cv=kfold,
                          scoring='accuracy', n_jobs=core_count, verbose=1)
    gsSVMC.fit(X_train, Y_train)
    SVMC_best = gsSVMC.best_estimator_
    Log.d('gsSVMC.best_score_: {}', gsSVMC.best_score_)
    Log.w('quitting')
    exit()
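# The method above exits before combining the tuned models. A minimal sketch
# (an assumption, not the author's implementation) of how the grid-searched
# estimators could be blended with scikit-learn's soft-voting ensemble:
from sklearn.ensemble import VotingClassifier

def __build_voting_ensemble(self, X_train, Y_train, ada_best, ExtC_best, RFC_best, GBC_best, SVMC_best):
    voting = VotingClassifier(
        estimators=[('ada', ada_best), ('ext', ExtC_best), ('rfc', RFC_best),
                    ('gbc', GBC_best), ('svc', SVMC_best)],
        voting='soft',  # requires probability=True on the SVC, as configured above
        n_jobs=core_count)
    voting.fit(X_train, Y_train)
    Log.d('voting ensemble train accuracy: {}', voting.score(X_train, Y_train))
    return voting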
def process_transaction_subset(self, transaction_min_timestamp, set_size, hdf5_filepath,
                               job_changed_handler, is_realtime):
    assert job_changed_handler is not None, 'no job_changed_handler provided'
    window_size = 10
    subset_process_start_time = time.time()
    frame = self.db.transaction_by_timestamp_frame(
        transaction_min_timestamp, set_size, self.from_currency_ids, self.to_currency_ids)
    frame.set_index('epoch_time', inplace=True)
    row_count = frame.shape[0]
    Log.d('...time spent fetching subset ({} rows) from db: {:.2f}s',
          row_count, time.time() - subset_process_start_time)
    if row_count == 0:
        return None
    row_process_count = 0
    last_epoch_time = None
    Log.d('...processing rows...')
    row_process_start_time = time.time()
    gap_resolver = self.run_config['gap_resolver']
    for epoch_time, row in frame.iterrows():
        is_row_processed = False
        try:
            transaction_id = row['id']
            datasource_id = row['datasource_id']
            exchange_id = row['exchange_id']
            from_currency_id = row['from_currency_id']
            to_currency_id = row['to_currency_id']
            price = np.float64(row['price'])
            volume = np.float64(row['volume'])
            transaction_min_timestamp = epoch_time  # transaction_id + 1
            seconds_since_previous = 0 if last_epoch_time is None else epoch_time - last_epoch_time
            Log.t('seconds since previous epoch time: {}', seconds_since_previous)
            if last_epoch_time is not None:
                assert epoch_time >= last_epoch_time, \
                    'epoch time ({}) was less than the previous epoch time ({})'.format(
                        epoch_time, last_epoch_time)
            assert seconds_since_previous >= 0, 'seconds_since_previous cannot be a negative value'
            last_epoch_time = epoch_time
            for job in self.jobs:
                if (job.datasource.id == datasource_id
                        and job.exchange.id == exchange_id
                        and job.from_currency.id == from_currency_id
                        and job.to_currency.id == to_currency_id):
                    is_row_processed = True
                    try:
                        h5frame = job.frame
                        if h5frame is not None:
                            # perform integrity check on the existing (non-empty) dataframe
                            assert not h5frame.empty  # should not be possible if the frame has previously been created
                            last_epoch = h5frame.index.values[-1]
                            seconds_since_previous = epoch_time - last_epoch
                            assert seconds_since_previous >= 0
                            max_gap_seconds = 120  # TODO: make config setting
                            if seconds_since_previous > max_gap_seconds:
                                warn_message = ('excessive time (+{}s) passed since previous observation: '
                                                '{}s ({}) between {} ({}) and {} ({})').format(
                                    max_gap_seconds, seconds_since_previous,
                                    Timespan.from_seconds(int(seconds_since_previous)).as_string(),
                                    last_epoch, StringExpert.format_timestamp(last_epoch),
                                    epoch_time, StringExpert.format_timestamp(epoch_time))
                                if gap_resolver is None:
                                    raise Exception(warn_message)
                                Log.w(warn_message)
                                prev_observation = h5frame.iloc[-1]
                                df_intermediates = gap_resolver.intermediates_frame(
                                    max_gap_seconds,
                                    from_epoch=last_epoch, to_epoch=epoch_time,
                                    from_price=prev_observation['latest'], to_price=price,
                                    from_volume=prev_observation['volume'], to_volume=volume)
                                Log.d('simulating intermediate observations:\n{}', df_intermediates)
                                simulated_count = 0
                                for intermediate_epoch, intermediate in df_intermediates.iterrows():
                                    job_observation = job.job_observe(
                                        value=intermediate['price'],
                                        epoch_time=intermediate_epoch,
                                        volume=intermediate['volume'],
                                        is_simulated=True,
                                        is_realtime=False)
                                    assert job_observation is not None
                                    simulated_count += 1
                                    if simulated_count % 1000 == 0:
                                        Log.d('..simulated {}/{}..', simulated_count, len(df_intermediates))
                                Log.i('done simulating {} observations up until epoch {} ({})',
                                      len(df_intermediates), epoch_time,
                                      StringExpert.format_timestamp(epoch_time))
                        try:
                            job_observation = job.job_observe(
                                value=price, epoch_time=epoch_time, volume=volume,
                                is_simulated=False, is_realtime=is_realtime)
                            row = job_observation  # job_observation_to_frame_row(volume, job_observation)
                            assert row is not None
                            job_changed_handler(job)
                        except DoubleObservationError as doe:
                            Log.w('epoch already in frame, will be ignored ({})', epoch_time)
                    except Exception as job_e:
                        raise Exception('Failed to feed row to job') from job_e
        except Exception as e:
            raise Exception('Failed to process row index {}'.format(epoch_time)) from e
        if is_row_processed:
            row_process_count += 1
    Log.d('...time spent processing {}/{} rows in time: {:.2f}s',
          row_process_count, frame.shape[0], time.time() - row_process_start_time)
    with pd.HDFStore(hdf5_filepath, mode='a') as h5:
        h5_process_start_time = time.time()
        start_observation_epoch = frame.index.values[0]
        for job in self.jobs:
            df_to_append = job.frame[job.frame.index >= start_observation_epoch]
            try:
                h5.append(job.uid, df_to_append, format='table', data_columns=True)
                row_count = h5.get_storer(job.uid).nrows
                Log.d('...h5 key {}, row count is {}', job.uid, row_count)
            except Exception as append_error:
                raise append_error
        Log.d('...time spent adding to h5: {:.2f}s', time.time() - h5_process_start_time)
    row_processing_time = time.time() - subset_process_start_time
    Log.d('...total time spent on subset: {:.2f}s ({:.2f}s per row)',
          row_processing_time, row_processing_time / row_process_count)
    return transaction_min_timestamp
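# A hedged sketch of the gap_resolver interface assumed above (the real
# implementation comes from run_config and is not shown in the source). It
# fills an oversized gap with linearly interpolated observations; using
# max_gap_seconds as the interpolation step is an assumption.
import numpy as np
import pandas as pd

class LinearGapResolver:
    def intermediates_frame(self, max_gap_seconds, from_epoch, to_epoch,
                            from_price, to_price, from_volume, to_volume):
        epochs = np.arange(from_epoch + max_gap_seconds, to_epoch, max_gap_seconds)
        fraction = (epochs - from_epoch) / (to_epoch - from_epoch)
        # index is the simulated epoch_time, matching how the caller iterates the frame
        return pd.DataFrame({
            'price': from_price + fraction * (to_price - from_price),
            'volume': from_volume + fraction * (to_volume - from_volume),
        }, index=epochs)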