Пример #1
0
 def detect_and_parse_new_disk_files_async(self):
     """Start observing ``self.dir_path`` for newly created files.

     A ``DirWatcher`` wired to ``self.handle_file_created`` is scheduled
     (non-recursively) on a fresh ``Observer``. The observer is kept on
     ``self.observer``, started, and returned.
     """
     Log.i('asynchronously detecting and parsing new disk files')
     watcher = DirWatcher(self.handle_file_created)
     self.observer = observer = Observer()
     observer.schedule(watcher, self.dir_path, recursive=False)
     observer.start()
     return observer
Пример #2
0
	def __init__(self, h5_filepath, version):
		"""Prepare the predictor output location and start watching the input file.

		Args:
			h5_filepath: path of the input HDF5 file; it must contain exactly one key.
			version: accepted but not used by this constructor.

		If a file with the same name as the input already exists in the
		PREDICTOR_DATA_DIRPATH directory, the user is asked to confirm the
		overwrite: declining exits the process, confirming deletes the file.
		"""
		warnings.simplefilter('ignore', NaturalNameWarning)
		h5_inputfile = Path(h5_filepath)
		output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
		self.h5_out_filepath = os.path.join(output_dirpath, h5_inputfile.name)
		h5_out_file = Path(self.h5_out_filepath)
		if h5_out_file.exists():
			Log.i('overwrite file?: {}', h5_out_file)
			if not OsExpert.prompt_confirm('File already exists, overwrite? {}'.format(h5_out_file)):
				Log.d('user aborted, exiting')
				exit()
			Log.w('removing file: {}', h5_out_file)
			os.remove(self.h5_out_filepath)
		# one list of predictors per key found in the input file
		self.predictors_map = {}
		with pd.HDFStore(h5_filepath, mode='r') as h5:
			keys = h5.keys()
			Log.i('h5 input keys: {}', keys)
			assert len(keys) == 1, 'harcoded restriction on single key was violated'
			for key in keys:
				Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
				self.predictors_map[key] = [
					EnsemblePredictor(min_predict_generator_size=2000, max_train_size=5000),
				]
		# the watcher is restricted to rows where is_simulated equals 0
		self.h5_watcher = H5FileWatcher(h5_filepath, self.handle_job_epoch, {'is_simulated': 0})
Пример #3
0
 def email(sender,
           receiver,
           title,
           text,
           smtp_host=None,
           smtp_user=None,
           smtp_password=None,
           smtp_port=587):
     """Send a plain-text email over SMTP with STARTTLS.

     Args:
         sender: value for the "From" header.
         receiver: value for the "To" header.
         title: value for the "Subject" header.
         text: message body.
         smtp_host: SMTP server host; falls back to the SMTP_HOST setting.
         smtp_user: login user; falls back to the SMTP_USER setting.
         smtp_password: login password; falls back to the SMTP_PASSWORD setting.
         smtp_port: SMTP server port.

     Raises:
         Exception: wraps any failure, with the original error as its cause.
     """
     try:
         if smtp_host is None:
             smtp_host = AppConfig.setting('SMTP_HOST')
         if smtp_user is None:
             smtp_user = AppConfig.setting('SMTP_USER')
         if smtp_password is None:
             smtp_password = AppConfig.setting('SMTP_PASSWORD')
         msg = EmailMessage()
         msg.set_content(text)
         msg['Subject'] = title
         msg['From'] = sender
         msg['To'] = receiver
         Log.t('sending email')
         # The context manager issues QUIT and closes the connection on exit,
         # so no explicit quit() call is needed inside the block.
         with smtplib.SMTP(host=smtp_host, port=smtp_port) as smtp_server:
             # NOTE(review): a bare SSLContext does not turn on certificate or
             # hostname verification by default; consider
             # ssl.create_default_context() once the target server's
             # certificate is confirmed to validate.
             smtp_server.starttls(context=SSLContext(PROTOCOL_TLSv1_2))
             smtp_server.login(user=smtp_user, password=smtp_password)
             smtp_server.send_message(msg)
         Log.t('sent email')
     except Exception as e:
         raise Exception('Failed to send email') from e
Пример #4
0
	async def check_for_alert_match(self):
		"""Fetch each watched page and report whether any alert pattern occurs.

		The text of every page is stripped of tags, HTML-unescaped and cleared
		of the strings listed in 'ignore-lines.json' before the patterns are
		searched. Returns True (after logging a warning) on the first match,
		False when no page matches.
		"""
		urls = [
			'https://twitter.com/CFTC',
			'https://twitter.com/sec_enforcement?lang=en',
			'https://twitter.com/ushouserep?lang=en',
		]
		with open('ignore-lines.json', 'r') as ignore_file:
			strip_texts = json.load(ignore_file)
		Log.d('checking {} sources, ignoring {} lines..', len(urls), len(strip_texts))
		# each pattern captures up to 200 characters of context on either side
		patterns = [
			r'.{,200}bitcoin.{,200}',
			r'.{,200}crypto.{,200}',
			r'.{,200}virtual currency.{,200}',
		]
		search_flags = re.IGNORECASE | re.MULTILINE
		for url in urls:
			async with aiohttp.ClientSession() as session:
				html_text = await self.__fetch(session, url)
				text = html.unescape(StringExpert.strip_tags(html_text))
				for strip_text in strip_texts:
					text = text.replace(strip_text, '')
				for pattern in patterns:
					match = re.search(pattern, text, search_flags)
					if match is None:
						continue
					Log.w('Found pattern "{}" at url "{}" in line: {}'.format(pattern, url, match.group()))
					return True
		return False
Пример #5
0
 def __init__(self):
     """Set up the response directory path, the store, the subscribers and the parse utility."""
     super().__init__(__file__)
     Log.d('construct')
     self.dir_path = AppConfig.setting('DATA_RESPONSE_DIRPATH')
     self.store = store = Store()
     self.subscribers = subscribers = subscribe.all()
     # the parse utility shares the same subscribers and store instances
     self.parse_util = ParseUtil(subscribers, store)
Пример #6
0
def retrieve(db, url, datasource_id, exchange_id, currency_id):
	"""Download a gzipped CSV of trades and insert each row as a transaction.

	The file is downloaded into the TEMP_DIRPATH directory under the last
	path segment of *url*. Every CSV row is read as
	(epoch time, price, amount). Rows rejected by the database with
	DuplicateInsertException are counted and skipped. The downloaded file is
	removed afterwards, including when processing fails.

	Args:
		db: gateway exposing ``create_transaction(dict)``.
		url: location of the gzipped CSV file.
		datasource_id: stored on every transaction.
		exchange_id: stored on every transaction.
		currency_id: stored on every transaction.
	"""
	temp_dirpath = AppConfig.setting('TEMP_DIRPATH')
	filepath = os.path.join(temp_dirpath, url.split('/')[-1])
	downloadFile(url, filepath)
	duplicate_count = 0
	insert_count = 0
	try:
		with gzip.open(filepath, 'rt') as f:
			Log.d('Processing csv file..')
			reader = csv.reader(f, delimiter=',', quotechar='|')
			for row in reader:
				transaction = {
					'datasource_id': datasource_id,
					'exchange_id': exchange_id,
					'amount': float(row[2]),
					'price': float(row[1]),
					'currency_id': currency_id,
					'epoch_time': int(row[0]),
				}
				try:
					db.create_transaction(transaction)
				except DuplicateInsertException:
					duplicate_count += 1
				else:
					insert_count += 1
	finally:
		# never leave the downloaded temp file behind, even on a parse/insert error
		os.remove(filepath)
	Log.i('Done processing, insert count: {}, duplicate count: {}', insert_count, duplicate_count)
Пример #7
0
 def process_h5(self):
     """Read rows newer than the cached ones for every key and hand them to the row handler.

     For each key in the h5 file the cached frame in ``self.job_frames`` is
     extended with the rows whose index exceeds the latest cached index (and
     that satisfy ``self.contraints_clause``). ``self.row_handler`` is then
     called with the key, the joined frame and the position of the first
     new row.
     """
     with pd.HDFStore(self.h5_filepath, mode='r') as h5:
         for jobuid in h5:
             is_first_encounter = jobuid not in self.job_frames
             if is_first_encounter:
                 # seed the cache with the first stored row only, to learn the first index/epoch
                 self.job_frames[jobuid] = pd.read_hdf(h5, jobuid, start=0, stop=1)
             job_df = self.job_frames[jobuid]
             # querying relative to the latest cached index ensures no rows are
             # missed in case the handle count jumps more than once
             latest_epoch = job_df.index.values[-1]
             where_clause = 'index > {} {}'.format(latest_epoch, self.contraints_clause)
             new_df = pd.read_hdf(h5, jobuid, where=where_clause)
             if new_df.empty:
                 Log.w('dataset was empty for key {} and index > {}', jobuid, latest_epoch)
                 continue
             assert new_df.index.values[0] > latest_epoch
             if is_first_encounter:
                 new_first_index = 0
             else:
                 new_first_index = len(job_df)
             joined = pd.concat([job_df, new_df])
             self.job_frames[jobuid] = joined
             joined_row_count = len(joined)
             if joined_row_count > 100000:
                 Log.w(
                     'holding a dataset of significant length ({} rows, {:.1f}mb): {}',
                     joined_row_count,
                     joined.memory_usage().sum() / 1_000_000,
                     jobuid)
             assert joined.shape[0] == len(job_df) + len(new_df)
             # TODO: remove once this is confirmed
             self.ensure_strictly_increasing_index(joined)
             self.row_handler(jobuid, joined, new_first_index)
Пример #8
0
	def __init__(self, min_predict_generator_size, max_train_size):
		"""Store the size limits; the predictor itself starts out as None.

		Args:
			min_predict_generator_size: must be smaller than *max_train_size*.
			max_train_size: must be larger than *min_predict_generator_size*.
		"""
		super().__init__(predict_col='feature_rtrspc()_next_trend_pricefeature')
		assert max_train_size > min_predict_generator_size
		self.min_predict_generator_size, self.max_train_size = (
			min_predict_generator_size,
			max_train_size,
		)
		self.predictor = None
		Log.d('core count: {}', core_count)
Пример #9
0
	async def subscribe(self):
		"""Yield every text received from the socket subscription.

		Raises:
			Exception: wraps any error raised while subscribing or iterating,
				naming ``self.handler_filepath`` in the message.
		"""
		try:
			async for text in self.__socket_subscribe():
				Log.t('received text: {}', text)
				yield text
		except Exception as cause:
			raise Exception(
				'Failed to subscribe for handler filepath {}'.format(self.handler_filepath)
			) from cause
Пример #10
0
	def filter_simulated_observations(self, df):
		"""Return the rows of *df* whose 'is_simulated' value is not 1.

		The number of removed rows is logged. It is computed as a length
		difference, which stays correct when index labels repeat and avoids
		building a second frame just to count its rows.
		"""
		filtered_df = df[df['is_simulated'] != 1]
		dropped_count = len(df) - len(filtered_df)
		if dropped_count > 0:
			Log.w('filtered out {} simulated frames', dropped_count)
		else:
			Log.d('no simulated frames found to filter out')
		return filtered_df
Пример #11
0
 def __parse_and_persist_as_transaction(row, parser, db):
     """Parse *row* with *parser* and store the result through *db*.

     Returns:
         The parsed transaction, or None when the parser yields None (in
         which case nothing is stored).

     Exceptions raised by ``parser.parse`` or ``db.create_transaction``
     propagate to the caller.
     """
     parsed = parser.parse(row)
     if parsed is None:
         return None
     transaction_id = db.create_transaction(parsed)
     Log.t('persisted transaction id {}', transaction_id)
     return parsed
Пример #12
0
 def __init__(self):
     """Read the email recurrence window and initialise the match bookkeeping."""
     super().__init__(__file__, isToNotifyStartup=False)
     recurrence_setting = AppConfig.setting('LOGWATCH_EMAIL_MAX_RECCURENCE_MINUTES')
     self.maxEmailReccurenceMinutes = float(recurrence_setting)
     self.triggerLines = ['ERROR', 'WARNING']
     # note: this runs before the two attributes below are assigned
     Log.d('construct: {}', self.__dict__)
     self.matchCountSinceLastEmail, self.lastEmailDatetime = 0, None
Пример #13
0
 def tryAppNotifyByEmail(serviceName, messsage):
     """Email *messsage* to the configured alert address, if notifications are enabled.

     The ALERT_EMAIL setting is used as both sender and receiver; the subject
     combines the local host name and *serviceName*.

     Returns:
         False when IS_EMAIL_NOTIFICATION_ENABLED is not '1'; otherwise the
         result of ``NetworkExpert.emailMaybe``.
     """
     is_enabled = AppConfig.setting('IS_EMAIL_NOTIFICATION_ENABLED') == '1'
     if not is_enabled:
         Log.d('ignoring email request per configuration')
         return False
     alert_email = AppConfig.setting('ALERT_EMAIL')
     subject = '*** {}: {} ***'.format(socket.gethostname(), serviceName)
     return NetworkExpert.emailMaybe(alert_email, alert_email, subject, messsage)
Пример #14
0
	async def alert_continuously(self, alert_interval_seconds):
		"""Run the alert check forever, sleeping *alert_interval_seconds* between runs.

		A failing check is logged together with its stack trace and does not
		stop the loop.
		"""
		is_triggered = False
		while True:
			try:
				is_triggered = await self.check_for_alert_match()
			except Exception:
				Log.e('Failed to run alert check, stacktace:\n{}', OsExpert.stacktrace())
			await asyncio.sleep(alert_interval_seconds)
Пример #15
0
	def activateSubscribers(self):
		"""Process all subscribers concurrently and block until every one has finished.

		The event loop is closed once all subscribers are done.
		"""
		Log.i('activating {} subscriber(s)', len(self.subscribers))
		loop = asyncio.get_event_loop()
		pending = asyncio.gather(*(
			self.__process_subscriber(index, subscriber)
			for index, subscriber in enumerate(self.subscribers)
		))
		loop.run_until_complete(pending)
		loop.close()
		Log.i('done processing subscribers')
Пример #16
0
	def __init__(self):
		"""Read the retry delay and response directory settings and load the subscribers.

		The response directory is created if it does not exist yet
		(via ``OsExpert.ensure_abs_dirpath_exists``).
		"""
		super().__init__(__file__)
		Log.d('construct')
		self.retry_delay_seconds = int(AppConfig.setting('DATAFETCH_API_RETRY_DELAY_SECONDS'))
		dirpath = AppConfig.setting('DATA_RESPONSE_DIRPATH')
		Log.d('data response dirpath is: {}', dirpath)
		self.data_response_dirpath = dirpath
		OsExpert.ensure_abs_dirpath_exists(dirpath)
		self.subscribers = subscribe.all()
Пример #17
0
 def __init__(self, h5_filepath, row_handler, contraints_dict=None):
     """Initialise the watcher state for one h5 file.

     Args:
         h5_filepath: path of the h5 file to read.
         row_handler: callable invoked for new rows; must be truthy.
         contraints_dict: optional mapping turned into ' and key=value'
             terms that are appended to the row query.
     """
     self.handle_event = Event()
     self.h5_filepath = h5_filepath
     self.handle_count = 0
     self.job_frames = {}
     self.last_handle_count = None
     self.row_handler = row_handler
     if contraints_dict is None:
         clause = ''
     else:
         clause = ' '.join(
             'and {}={}'.format(name, value)
             for name, value in contraints_dict.items())
     self.contraints_clause = clause
     Log.d('cc: {}', clause)
     assert row_handler
Пример #18
0
def downloadFile(url, filepath):
	"""Download *url* into the file at *filepath*, streaming it in 1 KiB chunks.

	Args:
		url: address to fetch.
		filepath: destination path; an existing file is overwritten.

	Raises:
		ValueError: if *url* or *filepath* is None.
		requests.HTTPError: if the server answers with an error status; the
			destination file is not created in that case.
	"""
	if url is None:
		raise ValueError('parameter "url" not specified')
	if filepath is None:
		raise ValueError('parameter "filepath" not specified')
	Log.d('Downloading to path {}: {}'.format(filepath, url))
	# stream=True defers fetching the body; the with-block releases the
	# connection even when writing fails part-way
	with requests.get(url, stream=True) as response:
		# fail loudly instead of saving an HTTP error page as if it were the file
		response.raise_for_status()
		with open(filepath, 'wb') as f:
			for chunk in response.iter_content(chunk_size=1024):
				if chunk:  # filter out keep-alive chunks
					f.write(chunk)
Пример #19
0
	def __predict(self, df):
		"""Predict a label for the single row in *df*.

		Exits the process once 100 predictions have been made. Otherwise the
		row is converted to model inputs, sent to ``self.predictor`` and the
		highest-scoring label of the response is returned.
		"""
		max_prediction_count = 100
		if self.predict_count >= max_prediction_count:
			Log.w('too many predictions {} reached, exiting', self.predict_count)
			exit()
		assert len(df) == 1
		features, _ = self.frame_to_ml_inputs(df)
		predict_row = features.iloc[0]
		Log.d(
			'predicting based on {} values:\n{}',
			len(predict_row.values),
			predict_row.squeeze().sort_index(),
		)
		response = self.predictor.predict(predict_row.values)
		label = self.sagemaker_response_highest_score_label(response)
		self.predict_count += 1
		return label
Пример #20
0
	def print_acc(self, df):
		"""Log the accuracy of the ensemble prediction column against the true trend column.

		Rows where either of the two columns is missing are dropped before scoring.
		"""
		Log.d('begin acc calc ======')
		predicted_col = 'prediction_ensmbl_next_trend_feature'
		actual_col = 'feature_rtrspc()_next_trend_pricefeature'
		pairs = df[[predicted_col, actual_col]]
		complete = pairs.dropna(how='any')
		Log.d('acc source frame:\n{}', complete)
		total_count = len(pairs)
		Log.d('dropped {}/{} rows where either the predictor or the true value was unspecified', total_count - len(complete), total_count)
		score = accuracy_score(complete[actual_col], complete[predicted_col], normalize=True)
		Log.d('accuracy: {}', score)
		Log.d('===== end acc calc ')
Пример #21
0
    def unparsed_datafetch_api_responses_frame(self, min_id=0, limit=100):
        """Query the api responses that have no matching transaction yet.

        Selects rows of `datafetch_api_response` whose `response_md5hash` has
        no counterpart in `transaction.source_md5hash` and whose id is at
        least `min_id`, ordered by id and capped at `limit` rows. Returns
        whatever the private query helper produces for that statement.

        NOTE(review): `min_id` and `limit` are interpolated into the SQL text
        as-is; callers are expected to pass trusted integers.
        """
        # {0} = response table, {1} = transaction table, {2} = min id, {3} = row limit
        sql = """
			SELECT {0}.* FROM {0}
			LEFT OUTER JOIN {1} ON 
							{1}.source_md5hash = {0}.response_md5hash
			WHERE 
				{1}.source_md5hash IS NULL 
			AND 
				{0}.id >= {2}
            ORDER BY {0}.id
            LIMIT {3}
			""".format('datafetch_api_response', 'transaction', min_id, limit)
        Log.d('executing:\n{}', sql)
        # flush stdout before running the query
        sys.stdout.flush()
        return self.__query_frame(sql)
Пример #22
0
 def email_maybe(self, header, message):
     """Send a notification email unless one was sent within the recurrence window.

     When the previous email is more recent than
     ``self.maxEmailReccurenceMinutes`` the request is dropped (and logged).
     Otherwise the timestamp and match counter are reset and the email is
     attempted.
     """
     now = datetime.now()
     previous = self.lastEmailDatetime
     if previous is not None:
         elapsed_minutes = (now - previous).total_seconds() / 60.0
         window_minutes = self.maxEmailReccurenceMinutes
         if elapsed_minutes < window_minutes:
             Log.d('Aborting email notification ({}+ minutes left in window)',
                   int(window_minutes - elapsed_minutes))
             return
     self.lastEmailDatetime = now
     self.matchCountSinceLastEmail = 0
     NetworkExpert.tryAppNotifyByEmail(header, message)
Пример #23
0
 def parse_and_persist_as_transaction_maybe(datafetch_api_response, parser,
                                            db):
     """Parse and store one api response, tolerating duplicate inserts.

     Returns:
         True when the parse-and-persist helper completed without raising,
         False when the database rejected the transaction as a duplicate.

     Any other exception is logged and re-raised.
     """
     try:
         ParseUtil.__parse_and_persist_as_transaction(
             datafetch_api_response, parser, db)
     except DuplicateInsertException:
         Log.w('db rejected transaction as a duplicate: {}',
               datafetch_api_response)
         return False
     except Exception:
         Log.e(
             'Failed to parse and store transaction from api_response: {}',
             datafetch_api_response)
         raise
     else:
         return True
Пример #24
0
 def emailMaybe(sender,
                receiver,
                title,
                text,
                smtp_host=None,
                smtp_user=None,
                smtp_password=None,
                smtp_port=587):
     """Best-effort variant of ``NetworkExpert.email``: never raises on a send failure.

     Takes the same arguments as ``NetworkExpert.email`` and forwards them
     unchanged.

     Returns:
         True when the email was sent, False when sending failed (the failure
         is logged).
     """
     try:
         NetworkExpert.email(sender, receiver, title, text, smtp_host,
                             smtp_user, smtp_password, smtp_port)
         return True
     except Exception as e:
         # email() signals failure with a plain Exception, so that is what has
         # to be caught here for the best-effort contract to hold
         Log.e('Failed to send email: {}', e)
         return False
Пример #25
0
def frame(mode, filename, from_epoch, to_epoch, filterInNth, agents,
          format_as_image):
    """Render the plot (and optionally the full page) for one generator data file.

    The file is looked up in the GENERATOR_DATA_DIRPATH directory and its
    first key is used. When *from_epoch* is None it defaults to one week
    before *to_epoch*. With ``mode == 'plot_only'`` only the plot markup is
    returned; otherwise the 'frame.html' template is rendered with the plot,
    the epoch bounds and the feature/agent overview.
    """
    filepath = os.path.join(AppConfig.setting('GENERATOR_DATA_DIRPATH'), filename)
    if from_epoch is None:
        from_epoch = to_epoch - 60 * 60 * 24 * 7
    with pd.HDFStore(filepath, mode='r') as h5:
        key = h5.keys()[0]  # TODO: always select first?
        storer = h5.get_storer(key)
        row_count = storer.nrows
        Log.d(row_count)

        def index_value_at(position):
            # read a single row without columns, just to obtain its index value
            single_row = pd.read_hdf(h5, key, start=position,
                                     stop=position + 1, columns=[])
            return single_row.index.values[0]

        first_epoch = index_value_at(0)
        last_epoch = index_value_at(row_count - 1)
        column_names = list(storer.attrs.data_columns)
        plot_html = h5_to_plot(h5, from_epoch, to_epoch, filterInNth, agents,
                               format_as_image)
        if mode == 'plot_only':
            return plot_html
        # second '_'-separated token of every 'feature_*' column name
        feature_columns = {
            name.split('_')[1]
            for name in column_names
            if name.startswith('feature_')
        }
        # group those tokens by the part that precedes the first '('
        agent_map = {}
        for feature_column in feature_columns:
            feature_name = feature_column.split('(')[0]
            agent_map[feature_name] = [
                c for c in feature_columns if c.startswith(feature_name)
            ]
        frame_info = {'row count': row_count, 'columns': column_names}
        return render_template(
            'frame.html',
            style=style,
            plothtml=plot_html,
            filename=filename,
            from_epoch=from_epoch,
            to_epoch=to_epoch,
            first_epoch=first_epoch,
            last_epoch=last_epoch,
            min_epoch=1514764800,  # min epoch is 2018
            max_epoch=int(time.time()),
            agent_map=sorted(agent_map.items()),
            job_uid=key,
            frame_info_html=json2html.convert(json=frame_info))
Пример #26
0
 def process_nonparsed_api_responses_full(self, sleep_seconds=0):
     """Parse api responses subset by subset until no further progress is made.

     Each pass hands the current minimum id to
     ``process_nonparsed_api_responses_subset`` and continues while the id
     it returns is larger than the one passed in.

     Args:
         sleep_seconds: pause after every subset, including the last one.

     Raises:
         Exception: wraps any failure, with the original error as its cause.
     """
     Log.i(
         'initiating continuous parsing of api responses with subset sleep interval: {} seconds',
         sleep_seconds)
     try:
         min_id = -1
         next_min_id = 0
         while next_min_id > min_id:
             min_id = next_min_id
             next_min_id = self.process_nonparsed_api_responses_subset(
                 next_min_id=min_id)
             time.sleep(sleep_seconds)
     except Exception as e:
         raise Exception('Failed to process nonparsed api responses') from e
     transaction_count = self.store.transaction_count()
     Log.d('no more api responses to parse, transaction count is now {}',
           transaction_count)
Пример #27
0
	def handle_job_epoch(self, jobuid, df, start_index):
		"""Apply the predicted actions of the rows of `df` to the simulated portfolio.

		Args:
			jobuid: key of the job the frame belongs to; only one hard-coded id is accepted.
			df: frame providing at least the PREDICT_ACTION and 'close' columns.
			start_index: rows from this position onward are processed.

		Raises:
			Exception: wraps any error (including a failed assertion) raised while processing.
		"""
		trade_fee = float64(.25 / 100)  # fee rate: 0.25% expressed as a fraction
		# capital held back from purchases: ten times the fee on the initial capital
		min_capital = self.initial_capital * trade_fee * 10
		print(start_index)
		print(len(df))
		try:
			assert jobuid == '/bitcoinaverage_multiple_global_ETH_USD_900', 'unexpected job id'
			new_df = df[start_index:]
			for epoch, row in new_df.iterrows():
				action = row[PREDICT_ACTION]
				coin_price = row['close']
				# remember the portfolio value at the first processed row
				if self.start_value is None:
					self.start_value = self.current_value(coin_price)
				# rows without an action (NaN) leave the portfolio untouched
				if not isnan(action):
					print('coin price ', coin_price, ', capital ', self.capital)
					if action == FeatureValue.BUY:
						# spend the capital above the reserve, scaled down by the fee rate
						coin_transaction_count = (1 - trade_fee) * (self.capital - min_capital) / coin_price
						if coin_transaction_count > 0:
							print('BUYING coins: ', coin_transaction_count)
							cost = coin_transaction_count * coin_price
							fee  = cost * trade_fee
							assert self.capital >= cost + fee, '{} >= {} + {} = {}'.format(self.capital, cost, fee, cost + fee)
							self.capital -= cost
							self.coins += coin_transaction_count
							# presumably charges the fee for the traded amount - TODO confirm against pay_fee
							self.pay_fee(cost)
					elif action == FeatureValue.SELL:
						# the fee is capped by the available capital; the number of
						# coins to sell is then derived back from that fee
						fee = min(self.coins * coin_price * trade_fee, self.capital)
						coin_transaction_count = fee / (coin_price * trade_fee)
						if coin_transaction_count > 0 and self.coins >= coin_transaction_count:
							print('SELLING coins: {}'.format(coin_transaction_count))
							gain = coin_transaction_count * coin_price
							self.capital += gain
							self.coins -= coin_transaction_count
							self.pay_fee(gain)
						else:
							Log.d('NOT ENOUGH COINS TO SELL! {} at {}', coin_transaction_count, fee)
					# computed but not used further in this method
					net_worth = self.current_value(coin_price)
		except Exception as e:
			raise Exception('Failed to execute on new job epoch') from e
		print(len(df))
		print(df[PREDICT_ACTION].value_counts())
		print('done')
		sys.stdout.flush()
Пример #28
0
 def __init__(self, version):
     """Configure the generator and build its list of jobs.

     Args:
         version: stored on the instance as ``self.version``.
     """
     super().__init__(__file__)
     self.window_size = 15
     self.interval_seconds = [15 * 60]  # a single interval of 15 minutes
     self.contruct_time = time.time()
     self.version = version
     # original remark: must be low enough to eventually produce an empty result set
     self.sleep_seconds = 1
     min_timestamp_setting = AppConfig.setting('GENERATOR_TRANSACTION_MIN_TIMESTAMP')
     self.transaction_min_timestamp = int(min_timestamp_setting)
     self.data_dirpath = AppConfig.setting('GENERATOR_DATA_DIRPATH')
     Log.d('construct: {}', self.__dict__)
     self.db = DatabaseGateway()
     self.from_currency_ids, self.to_currency_ids = [], []
     self.run_config = self.read_run_config()
     max_history_minutes = 10 * 24 * 60  # ten days, in minutes
     job_iterator = self.__jobs_iterate(max_history_minutes, self.run_config)
     self.jobs = list(job_iterator)
     Log.i('count of generator jobs: {}', len(self.jobs))
Пример #29
0
	def reset(self, epoch_time):
		"""Clear the observed values and align the interval bounds to *epoch_time*.

		The interval start is *epoch_time* rounded down to a multiple of
		``self.interval_second``; the interval index is that start divided by
		the interval length.
		"""
		self.low = self.high = self.open = self.close = self.latest = None
		self.is_opening = self.is_closing = False
		self.observation_count = 0
		# time already passed within the interval that contains epoch_time
		seconds_into_interval = epoch_time % self.interval_second
		self.interval_start_epoch = epoch_time - seconds_into_interval
		self.interval_end_epoch = self.interval_start_epoch + self.interval_second
		raw_index = self.interval_start_epoch / self.interval_second
		assert raw_index % 1 == 0, 'interval index {} is not an integer'.format(raw_index)
		self.interval_index = int(raw_index)
		start_time = datetime.utcfromtimestamp(self.interval_start_epoch)
		end_time = datetime.utcfromtimestamp(self.interval_end_epoch)
		Log.t(self.interval_index, start_time, end_time)
Пример #30
0
			def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
				"""Upload *filepath* to the bucket under the key '<channel>/<file name>'.

				Adapted from the SageMaker examples, e.g.
				https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb

				An existing key is overwritten, unless
				*skip_if_name_and_size_matches* is set and the remote object has
				the same size in bytes as the local file; in that case nothing is
				uploaded.
				"""
				file = Path(filepath)
				s3 = boto3.resource('s3')
				key = channel + '/' + file.name
				bucket_ref = s3.Bucket(bucket)
				# the prefix filter may also return longer keys, hence the exact comparison
				objs = list(bucket_ref.objects.filter(Prefix=key))
				is_file_already_existing = len(objs) > 0 and objs[0].key == key
				if is_file_already_existing:
					if skip_if_name_and_size_matches:
						s3_client = boto3.client('s3')
						response = s3_client.head_object(Bucket=bucket, Key=key)
						local_size = file.stat().st_size
						remote_size = response['ContentLength']
						if remote_size == local_size:
							Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}', local_size/1000, key)
							return
					Log.w('overwriting existing s3 key: {}', key)
				with open(filepath, "rb") as data:
					bucket_ref.put_object(Key=key, Body=data)