Example #1
def splits():

	logger.info(f"SCRAPER,SPLITS,INITIATED,")

	now = datetime.now()
	report_df = pd.DataFrame()
	dt = datetime(now.year, now.month, 1).strftime("%m/%d/%Y")

	try:

		df = process(dt)
		store(df)

		_connector.execute(f"DELETE FROM stocksplitstmp{MODIFIER};")
		_connector.write(f"stocksplitstmp{MODIFIER}", df)
		_connector.execute("""
				INSERT IGNORE INTO
					stocksplits{modifier}
				SELECT
					*
				FROM
					stocksplitstmp{modifier};
			""".format(modifier=MODIFIER))

		df = df[df.ex_date == DATE]
		if len(df) != 0:

			logger.info(f"SCRAPER,SPLITS,ADJUSTING,{len(df)}")
			_connector.register_splits(P_COLUMNS, MODIFIER)
			_connector.adjust_splits(MODIFIER)
		
		metric = 1
		title_modifier = "SUCCESS"
		logger.info(f"SCRAPER,SPLITS,TERMINATED,{len(df)}")

	except Exception as e:

		metric = 0
		title_modifier = "FAILURE"
		logger.warning(f"SCRAPER,SPLITS,FAILURE,{e}")

	###############################################################################################

	report = _connector.read("""
			SELECT
				*
			FROM
				stocksplitstatus{modifier}
			WHERE
				ex_date = "{date}"
		""".format(modifier=MODIFIER, date=DATE))

	send_gcp_metric(CONFIG, "splits_success_indicator", "int64_value", metric)
	send_email(CONFIG, f"{title_modifier} - Stock Splits", report.to_html(), [], logger)
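
The snippet above stages the freshly scraped splits into stocksplitstmp{MODIFIER} and then merges them with INSERT IGNORE, so rows already present in stocksplits{MODIFIER} are skipped rather than duplicated. Below is a minimal, self-contained sketch of that staging-then-merge pattern using sqlite3, whose INSERT OR IGNORE plays the role of MySQL's INSERT IGNORE; the table layout and rows are made-up placeholders, not the project's schema.

import sqlite3

# In-memory database standing in for the project's MySQL instance.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE stocksplits (ticker TEXT, ex_date TEXT, ratio REAL, PRIMARY KEY (ticker, ex_date))")
conn.execute("CREATE TABLE stocksplitstmp (ticker TEXT, ex_date TEXT, ratio REAL)")

# Stage the freshly scraped rows, then merge: rows whose primary key already
# exists in the target table are silently skipped (MySQL's INSERT IGNORE,
# spelled INSERT OR IGNORE in SQLite).
conn.execute("DELETE FROM stocksplitstmp")
conn.executemany("INSERT INTO stocksplitstmp VALUES (?, ?, ?)",
                 [("AAPL", "2020-08-31", 4.0), ("TSLA", "2020-08-31", 5.0)])
conn.execute("INSERT OR IGNORE INTO stocksplits SELECT * FROM stocksplitstmp")
conn.commit()

print(conn.execute("SELECT * FROM stocksplits").fetchall())
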
Example #2
def cleaning_loop():

    files = set([".gitignore"])

    while True:

        new_files = os.listdir(NEWS_DIR)

        try:

            send_gcp_metric(CONFIG, "rss_daily_item_counter", "int64_value",
                            len(new_files) - 1)

        except Exception as e:

            print(e)

        if len(new_files) < len(files):
            files = set([".gitignore"])

        items = []
        for new_file in set(new_files).difference(files):

            with open(f"{NEWS_DIR}/{new_file}", "r") as file:

                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    print(e)

        new_items = []
        for item in items:

            if not item.get("title"):
                continue

            item = clean(item)
            dummy_item = {
                "title": item['title'].lower(),
                'summary': item['summary'].lower(),
                "link": item['link'].lower()
            }
            dummy_item = json.dumps(dummy_item, sort_keys=True)
            _hash = sha256(dummy_item.encode()).hexdigest()

            new_items.append({
                "_index": "news",
                "_id": _hash,
                "_op_type": "create",
                "_source": item
            })

        if len(new_items) != 0:

            titles = [item['_source']['title'] for item in new_items]
            scores = get_scores(titles)

            for item, score in zip(new_items, scores):
                item['_source']['sentiment'] = score['prediction']
                item['_source']['sentiment_score'] = score['sentiment_score']
                item['_source']['abs_sentiment_score'] = abs(
                    score['sentiment_score'])

            successes, failures = helpers.bulk(ES_CLIENT,
                                               new_items,
                                               stats_only=True,
                                               raise_on_error=False)

            print(successes, failures)
            with open(f"{DIR}/cleaned_news_data/{str(uuid.uuid4())}.txt",
                      "w") as file:
                file.write(json.dumps(new_items))

            new_items = []

        time.sleep(5)
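
Each cleaned item above is fingerprinted by hashing a canonical JSON form of its lower-cased title, summary, and link, and that hash becomes the Elasticsearch document _id; combined with _op_type "create", re-ingesting the same story is rejected instead of duplicated. A minimal sketch of the fingerprinting step, with a made-up item:

import json
from hashlib import sha256

item = {
    "title": "Example Headline",
    "summary": "Example summary text.",
    "link": "https://example.com/story",
}

# Canonical form: lower-cased fields, keys sorted so the serialization
# is stable regardless of insertion order.
dummy_item = {
    "title": item["title"].lower(),
    "summary": item["summary"].lower(),
    "link": item["link"].lower(),
}
doc_id = sha256(json.dumps(dummy_item, sort_keys=True).encode()).hexdigest()

action = {"_index": "news", "_id": doc_id, "_op_type": "create", "_source": item}
print(action["_id"])
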
Example #3
	r_map = df.iloc[-1, 1:].values
	r_map = np.array([0] + r_map.tolist())
	chs = CubicHermiteSpline(t_map, r_map, [0]*len(t_map))

	rm_df = pd.DataFrame()
	rm_df['days_to_expiry'] = np.arange(0, 365 * 10 + 1).astype(int)
	rm_df['rate'] = chs(rm_df.days_to_expiry.values)
	rm_df['date_current'] = DATE

	_connector.write("treasuryratemap", rm_df)

	return df

if __name__ == '__main__':

	try:

		df = collect()
		store()
		send_email(CONFIG, "Interest Rate Summary", df.to_html(), [], logger)
		metric = 1

	except Exception as e:

		logger.warning(e)
		body = f"<p>Process Failed. {e}</p>"
		send_email(CONFIG, "Interest Rate Summary - FAILED", body, [], logger)
		metric = 0

	send_gcp_metric(CONFIG, "rates_success_indicator", "int64_value", metric)
Example #4
def compress_files():

    filedate = datetime.now() - timedelta(days=1)
    filedate = filedate.strftime('%Y-%m-%d')

    raw_txt = f'{DIR}/news_data_backup/{filedate}.txt'
    raw_tar = f'{DIR}/news_data_backup/{filedate}.tar.xz'

    files = os.listdir(f"{DIR}/news_data")
    files = [f"{DIR}/news_data/{file}" for file in files]
    files = sorted(files, key=os.path.getmtime)[::-1]
    files.remove(f"{DIR}/news_data/.gitignore")

    cfiles = os.listdir(f"{DIR}/cleaned_news_data")
    cfiles = [f"{DIR}/cleaned_news_data/{file}" for file in cfiles]
    cfiles = sorted(cfiles, key=os.path.getmtime)[::-1]
    cfiles.remove(f"{DIR}/cleaned_news_data/.gitignore")

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()
    sources, usources = dict(), dict()
    for file in files:

        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:

            ctr += 1

            item_ = item.copy()
            item_.pop("oscrap_acquisition_datetime")

            if 'oscrap_source' not in item_:
                continue

            source = item_['oscrap_source']
            if source in sources:
                sources[source] += 1
            else:
                sources[source] = 1

            hash_ = sha256(json.dumps(item_).encode()).hexdigest()

            if hash_ in hashes:
                continue

            if source in usources:
                usources[source] += 1
            else:
                usources[source] = 1

            data.append(item)
            hashes.add(hash_)

    logger.info(f"RSS,Storage,Data,{ctr}")
    logger.info(f"RSS,Storage,Unique Data,{len(hashes)}")

    send_gcp_metric(CONFIG, "rss_daily_item_uniques", "int64_value",
                    len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_item_total", "int64_value", ctr)

    for source in sources:

        logger.info(f"RSS,Source Total,{source},{sources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_total",
                        "int64_value", sources[source])

    for source in usources:

        logger.info(f"RSS,Source Uniques,{source},{usources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_uniques",
                        "int64_value", usources[source])

    with open(raw_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(raw_tar, mode="x:xz") as tar_file:
        tar_file.add(raw_txt, arcname=os.path.basename(raw_txt))

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()
    for file in cfiles:

        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:

            ctr += 1

            hash_ = sha256(json.dumps(item).encode()).hexdigest()

            if hash_ in hashes:
                continue

            data.append(item)
            hashes.add(hash_)

    send_gcp_metric(CONFIG, "rss_daily_clean_uniques", "int64_value",
                    len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_clean_total", "int64_value", ctr)

    cleaned_txt = f"{DIR}/cleaned_news_data/{filedate}.txt"
    cleaned_tar = cleaned_txt[:-4] + ".tar.xz"

    with open(cleaned_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(cleaned_tar, mode="x:xz") as tar_file:
        tar_file.add(cleaned_txt, arcname=os.path.basename(cleaned_txt))

    ###############################################################################################

    time.sleep(600)

    file_size = os.stat(raw_tar).st_size / 1_000_000
    if file_size > 0:
        for file in files:
            os.remove(file)
        os.remove(raw_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    file_size = os.stat(cleaned_tar).st_size / 1_000_000
    if file_size > 0:
        for file in cfiles:
            os.remove(file)
        os.remove(cleaned_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    return raw_tar, cleaned_tar
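
The backup step above writes each day's items to a .txt file, packs it into an .xz-compressed archive via tarfile (imported as tar in this project) with mode "x:xz", and only deletes the raw file once the archive has a non-zero size. A minimal sketch of that compress-and-verify step on a throwaway file; the paths are placeholders:

import json
import os
import tarfile

raw_txt = "backup.txt"
raw_tar = "backup.tar.xz"

with open(raw_txt, "w") as file:
    file.write(json.dumps([{"title": "example item"}]))

# mode="x:xz" creates a new xz-compressed archive and fails if it already exists.
with tarfile.open(raw_tar, mode="x:xz") as tar_file:
    tar_file.add(raw_txt, arcname=os.path.basename(raw_txt))

# Only remove the raw file once the archive is confirmed non-empty.
if os.stat(raw_tar).st_size > 0:
    os.remove(raw_txt)
else:
    raise Exception("TarFile Corrupted. File Size 0.")
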
Example #5
	def get_options(self):

		def get_page(url):

			ctr, max_ctr = 0, 3
			while ctr < max_ctr:
				
				bs = BeautifulSoup(request(CONFIG, url, self.logger).content, PARSER)
				options = bs.find_all("option")

				if len(options) != 0:
					break

				ctr += 1
				self.logger.info(f"{self.ticker},{self.batch_id},Option Download,{ctr}")
				self.sleep()

			return bs, options

		def append_options(table, expiry_date_fmt, days_to_expiry, option_type):

			for row in table.find_all("tr")[1:]:
				es = row.find_all("td")[2:]
				self.options.append([
						DATE,
						self.ticker,
						expiry_date_fmt,
						days_to_expiry,
						option_type,
						self.option_fmt(es[0].text, 'Strike Price'),
						self.option_fmt(es[2].text, 'Bid'),
						self.option_fmt(es[1].text, 'Option Price'),
						self.option_fmt(es[3].text, 'Ask'),
						self.option_fmt(es[-1].text, 'Implied Volatility'),
						self.option_fmt(es[-2].text, 'Volume'),
						self.option_fmt(es[-3].text, 'Open Interest')
					])

		url = OPTIONS.format(ticker = self.ticker)
		bs, options = get_page(url)

		for option in options:

			self.sleep()

			expiry, expiry_date = option.get("value"), option.text
			self.logger.info(f"{self.ticker},{self.batch_id},Option Expiry,{expiry},{expiry_date.replace(',', '.')}")

			expiry_date = datetime.strptime(expiry_date, NAMED_DATE_FMT)
			expiry_date_fmt = expiry_date.strftime("%Y-%m-%d")
			
			days_to_expiry = calculate_trading_days(DATE, expiry_date_fmt, CONFIG['trading_days'])
			
			if days_to_expiry is None:
				warning = f"{self.ticker},{self.batch_id},Null Days to Expiry,{expiry},{days_to_expiry}"
				self.logger.warning(warning)

			page = url+f"&date={str(expiry)}"
			bs, _ = get_page(page)

			calls = bs.find("table", {"class" : "calls"})
			puts = bs.find("table", {"class" : "puts"})
			
			if calls:
				append_options(calls, expiry_date_fmt, days_to_expiry, 'C')
			
			if puts:
				append_options(puts, expiry_date_fmt, days_to_expiry, 'P')

		df = pd.DataFrame(self.options, columns = OPTION_COLS)
		oid = df.ticker + ' ' + df.expiration_date + ' ' + df.option_type
		sp = df.strike_price.round(2).astype(str)
		sp = sp.str.rstrip("0").str.rstrip(".")
		df['option_id'] = oid + sp

		try:

			df['stock_price'] = self.adj_close
			df['dividend_yield'] = self.div
			df = df.merge(CONFIG['ratemap'], on="days_to_expiry", how="inner")
			df = calculate_iv(df)
			df = df.drop(["stock_price", "dividend_yield", "rate"], axis=1)

		except Exception as e:

			self.logger.warning(f"{self.ticker},{self.batch_id},Calculate IV,Failure,{e}")
			df['zimplied_volatility'] = 0

		zba = df[(df.bid_price == 0) | (df.ask_price == 0)].shape[0]
		send_gcp_metric(CONFIG, "zero_bid_ask", "double_value", zba / (1 + len(df)))

		if not self.retries and len(df) > 0:
			
			df.to_csv(f"{DATA}/options/{self.ticker}_{DATE}.csv", index=False)

		elif len(df) != 0:
			
			try:
				old = pd.read_csv(f"{DATA}/options/{self.ticker}_{DATE}.csv")
			except Exception as e:
				old = pd.DataFrame()

			df = pd.concat([old, df]).reset_index(drop=True)
			df = df.drop_duplicates(subset=['expiration_date', 'strike_price', 'option_type'], keep="last")
			df = df.sort_values(['expiration_date', 'option_type', 'strike_price'])
			df.to_csv(f"{DATA}/options/{self.ticker}_{DATE}.csv", index=False)

			self.fault_dict['options']['new'] = len(df)
			delta = self.fault_dict['options']['new'] - self.fault_dict['options']['old']

			self.fault_dict['options']['delta'] = delta
			self.logger.info(f"{self.ticker},{self.batch_id},Re-Options,Success,{delta}")

		else:

			self.logger.info(f"{self.ticker},{self.batch_id},Options,None Collected,")
Example #6
def send_metrics(success, failure):

	for key in success:
		metric = success[key]
		metric /= success[key] + failure[key]
		send_gcp_metric(CONFIG, f"oscrap_{key}_sucess", "double_value", metric)
Example #7
		db_flags.append(b_db_flag)
		db_stats.append(b_db_stats)

		success, failure = get_job_success_rates(tickers[ : BATCH_SIZE * (1 + batch_id)])
		send_metrics(success, failure)

		# if batch_id % checkpoint == 0 and batch_id != 0:
		# 	report("Partial", success, failure, faults_summary, db_flags, db_stats)

	###############################################################################################

	success, failure = get_job_success_rates(tickers)
	report("Full", success, failure, faults_summary, db_flags, db_stats)

	store()

	logger.info(f"SCRAPER,JOB,TERMINATED,{DATE},")

if __name__ == '__main__':

	try:
	
		send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 1)
		main()
	
	except Exception as e:

		send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 0)
		logger.warning(f"SCRAPER,JOB,MAIN ERROR,{e},")