def main(): logger.info(f"RSS,Job,Initated,{SDATE}") for file in os.listdir(f"{DIR}/pids"): if file == ".gitignore": continue os.remove(f"{DIR}/pids/{file}") os.system(f"touch {DIR}/pids/{os.getpid()}") group_keys = list(groups.keys()) parallel_groups = [group_keys[0::2], group_keys[1::2]] try: Parallel(n_jobs=2)( delayed(parallel_job)(job_id, parallel_group) for job_id, parallel_group in enumerate(parallel_groups)) except Exception as e: logger.warning(e)
def collect_data_again(batch_id, faults):
    for i, ticker in enumerate(faults):
        try:
            retries = {
                key: key in faults[ticker]
                for key in ['analysis', 'keystats', 'ohlc', 'options'][:2]
            }
            ticker_obj = Ticker(ticker, logger, batch_id, retries, faults[ticker])
            faults[ticker] = ticker_obj.fault_dict
            time.sleep(SLEEP)
            logger.info(f"{ticker},{batch_id},Re-Ticker,Success,")
        except Exception as e:
            logger.warning(f"{ticker},{batch_id},Re-Ticker,Failure,{e}")

        pct = (i + 1) / len(faults)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},RE-PROGRESS,{pct}%,")

    return faults
def replace_all_synonyms(df):
    while True:
        new_df = replace_synonyms(df)
        logger.info(f"Synonym Replacement. Old:{len(df)} New:{len(new_df)}")
        if len(new_df) == len(df):
            break
        df = new_df.copy()
    return df
def scrape(exchange_code, exchange_name):
    stats = []
    for letter in LETTERS:
        page = get_bs_obj(index="stocklist", exchange=exchange_code, symbol=letter)
        rows = page.find("table", {"class": "quotes"}).find_all("tr")[1:]

        if rows[0].find('td').text[0] != letter.upper():
            continue

        for row in rows:
            # Defensive defaults so a failed lookup still produces a loggable row.
            ticker, name, error = None, None, None
            try:
                ticker = row.find('td').text
                name = row.find("td", text=ticker).next_sibling.text
                ticker = ticker.replace('.', '-')
                market_cap = get_market_cap(ticker)
                sector, industry, instrument_type = get_sector_and_industry(ticker)
            except Exception as e:
                market_cap = 0
                sector, industry, instrument_type = None, None, None
                error = e

            stats.append([
                ticker,
                name,
                exchange_code,
                exchange_name,
                sector,
                industry,
                instrument_type,
                round(market_cap, 3),
            ])

            log_entry = list(map(str, stats[-1]))
            log_entry = ','.join(log_entry)
            log_entry += ',' if not error else ',' + str(error)
            logger.info(log_entry)

            time.sleep(SLEEP)

    df = pd.DataFrame(stats, columns=COLUMNS)
    df.to_csv(f'{DATA}/{exchange_code}_tickers.csv', index=False)
def save(self, name, content):
    folder = self.timestamp.strftime('%Y-%m-%d')
    prefix = self.timestamp.strftime('%Y%m%d%H%M')  # %M = minutes (not %m, which is month)
    path = os.path.join(IMAGE_DIR, folder)
    if not os.path.exists(path):
        os.mkdir(path)
    with open(os.path.join(path, prefix + '-' + name), 'wb') as f:
        f.write(content)
    logger.info('pixiv file saved: %s %s' % (folder, name))
def main(): logger.info(f"SCRAPER,JOB,INITIATED,{DATE},") init() tickers = _connector.get_equity_tickers(N_USD) checkpoint = len(tickers) / BATCH_SIZE checkpoint = int(checkpoint / 4) faults_summary = { "options" : {}, "analysis" : {}, "keystats" : {}, "ohlc" : {} } db_flags, db_stats = [], [] ############################################################################################### for batch_id, batch in enumerate(range(BATCH_SIZE, len(tickers) + BATCH_SIZE, BATCH_SIZE)): ticker_batch = tickers[batch - BATCH_SIZE : batch] results = batch_main(batch_id, ticker_batch) b_fault_summary, b_db_flag, b_db_stats = results for key in b_fault_summary: for ticker in b_fault_summary[key]: faults_summary[key][ticker] = b_fault_summary[key][ticker] db_flags.append(b_db_flag) db_stats.append(b_db_stats) success, failure = get_job_success_rates(tickers[ : BATCH_SIZE * (1 + batch_id)]) send_metrics(success, failure) # if batch_id % checkpoint == 0 and batch_id != 0: # report("Partial", success, failure, faults_summary, db_flags, db_stats) ############################################################################################### success, failure = get_job_success_rates(tickers) report("Full", success, failure, faults_summary, db_flags, db_stats) store() logger.info(f"SCRAPER,JOB,TERMINATED,{DATE},")
def get_exchanges():
    page = get_bs_obj(index="stocklist", exchange="AMEX", symbol=LETTERS[0])
    select = page.find("select", {"id": "ctl00_cph1_cboExchange"}).find_all("option")

    exchanges = []
    for option in select:
        if option.text not in EXCHANGES:
            continue
        exchanges.append((option.get_attribute_list("value")[0], option.text))
        logger.info(','.join(exchanges[-1]))

    return exchanges
def parallel_job(job_id, parallel_group):
    logger.info(f"RSS,Job,PID,{os.getpid()}")

    def on_close():
        for group in parallel_group:
            feed_threads[group].on_close()
            logger.info(f"RSS,Thread,Closed,{job_id} - {group}")

    def sigterm_handler(signal_number, frame):
        logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
        on_close()

    signal.signal(signal.SIGTERM, sigterm_handler)
    os.system(f"touch {DIR}/pids/{os.getpid()}")

    ###############################################################################################

    try:
        feed_threads = {}
        for i, group in enumerate(parallel_group):
            sleep = groups[group]
            group_coords = feeds[feeds.source.isin(group)]
            feed_threads[group] = Feeds(
                sources=group_coords.source.values,
                feeds=group_coords.feed.values,
                sleep=sleep,
                logger=logger
            )
            feed_threads[group].start()
            logger.info(f"RSS,Thread,Initiated,{job_id} - {group}")
    except Exception as e:
        logger.warning(f"RSS,Thread,Error,{job_id} - {e}")
        on_close()
        raise Exception(f"RSS,Job,Terminated,{job_id} - {e}")
def main(batch_id, tickers):
    logger.info(f"SCRAPER,{batch_id},INITIATED,,")

    collect_data(batch_id, tickers)
    _connector.init_batch_tickers(batch_id, tickers)
    faults_summary = fix_faults(batch_id, tickers)
    db_flag, db_stats = index_data(batch_id, tickers)
    _connector.launch_derived_engine(batch_id)

    logger.info(f"SCRAPER,{batch_id},TERMINATED,,")
    return faults_summary, db_flag, db_stats
def main(date): logger.info("News Stats Initiated") datestr = (date - timedelta(days=1)).isoformat()[:10] file = Path(f"{DIR}/data/{datestr}.csv") xz_file = file.with_suffix(".tar.xz") try: logger.info(f"Processing stats for {date}") pre_n, df = process_data(get_data(date)) df = create_sector_tickers(df, get_ticker_info()) pre_n += len(sector_to_ticker) df['date'] = datestr df['volume'] = df.volume.astype(int) df = df[['date', 'ticker', 'volume', 'sentiment']] logger.info( f"Processed stats for {len(df)} tickers. Collected {pre_n} items.") send_metric(CONFIG, "news_stats_ticker_pre_count", "int64_value", pre_n) send_metric(CONFIG, "news_stats_ticker_post_count", "int64_value", len(df)) if len(df) == 0: raise Exception("Zero tickers after filtering") df.to_csv(file, index=False) with tar.open(xz_file, "x:xz") as tar_file: tar_file.add(file, arcname=file.name) send_to_bucket("daily_news_stats", "", xz_file, logger=logger) os.unlink(file) os.unlink(xz_file) send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 1) except Exception as e: logger.info(f"News Stats Error - {e}") send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 0) logger.info("News Stats Terminated")
def collect_data(batch_id, tickers):
    for i, ticker in enumerate(tickers):
        try:
            Ticker(ticker, logger, batch_id)
            time.sleep(SLEEP)
            logger.info(f"{ticker},{batch_id},Ticker,Success,")
        except Exception as e:
            logger.warning(f"{ticker},{batch_id},Ticker,Failure,{e}")

        pct = (i + 1) / len(tickers)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},PROGRESS,{pct}%,")
def store(df):
    try:
        df.to_csv(f"{DATA}/{DATE}.csv", index=False)

        with tar.open(f"{DATA}.tar.xz", "x:xz") as tar_file:
            for file in DATA.iterdir():
                tar_file.add(file, arcname=file.name)
            tar_file.add(f"{DIR}/log.log", arcname="log.log")

        send_to_bucket(
            BUCKET_PREFIX,
            BUCKET_NAME,
            f"{DATE}.tar.xz",
            f"{DIR}/instrument_data",
            logger
        )

        for folder in (DIR / "instrument_data").iterdir():
            if folder.is_dir():
                shutil.rmtree(folder)

    except Exception as e:
        logger.info(f"Storage Error - {e}")
def collect_news(job_id, company_names, id_cache, ids, errors):
    try:
        N = len(company_names)
        for i, data in enumerate(company_names.values):

            queries = ' '.join(data)
            progress = round(i / N * 100, 2)
            logger.info(f"collecting {queries}, {progress}%")

            ticker, company_name = data
            if ticker:
                fetch(ticker, id_cache, ids)
            fetch(company_name, id_cache, ids)

    except Exception as e:
        errors.put(f"{e}\n{traceback.format_exc()}")

    with open(IDSDIR / f"{job_id}.json", "w") as file:
        file.write(json.dumps(id_cache[SDATE]))
def run(self):
    rank = self.sess.get(self.RANK_URL)
    soup = BeautifulSoup(rank.content, 'html.parser', from_encoding='utf-8')
    ranking_items = soup.find_all('section', {'class': 'ranking-item'}, limit=self.topN)

    for item in ranking_items:
        url = item.find('img')['data-src']
        url = re.sub(r'(c/|240x480/|_master1200)', '', url)
        url = url.replace('master', 'original')

        r = self.sess.get(url, headers={'Referer': self.ILLUSTRATION_URL})
        logger.info('access pixiv url: %s status: %d' % (url, r.status_code))
        if r.status_code == 404:
            url = re.sub(r'jpg$', 'png', url)
            r = self.sess.get(url, headers={'Referer': self.ILLUSTRATION_URL})
            logger.info('retry access pixiv url: %s status: %d' % (url, r.status_code))

        creator = re.sub(r'[^A-Za-z0-9]+', '', item['data-user-name'])
        file_name = item['data-id'] + '-' + creator + '.' + url[-3:]
        self.save(file_name, r.content)
def splits():
    logger.info(f"SCRAPER,SPLITS,INITIATED,")

    now = datetime.now()
    report_df = pd.DataFrame()
    dt = datetime(now.year, now.month, 1).strftime("%m/%d/%Y")

    try:
        df = process(dt)
        store(df)

        _connector.execute(f"DELETE FROM stocksplitstmp{MODIFIER};")
        _connector.write(f"stocksplitstmp{MODIFIER}", df)
        _connector.execute("""
            INSERT IGNORE INTO stocksplits{modifier}
            SELECT * FROM stocksplitstmp{modifier};
        """.format(modifier=MODIFIER))

        df = df[df.ex_date == DATE]
        if len(df) != 0:
            logger.info(f"SCRAPER,SPLITS,ADJUSTING,{len(df)}")
            _connector.register_splits(P_COLUMNS, MODIFIER)
            _connector.adjust_splits(MODIFIER)

        metric = 1
        title_modifier = "SUCCESS"
        logger.info(f"SCRAPER,SPLITS,TERMINATED,{len(df)}")

    except Exception as e:
        metric = 0
        title_modifier = "FAILURE"
        logger.warning(f"SCRAPER,SPLITS,FAILURE,{e}")

    ###############################################################################################

    report = _connector.read("""
        SELECT * FROM stocksplitstatus{modifier}
        WHERE ex_date = "{date}"
    """.format(modifier=MODIFIER, date=DATE))

    send_gcp_metric(CONFIG, "splits_success_indicator", "int64_value", metric)
    send_email(CONFIG, f"{title_modifier} - Stock Splits", report.to_html(), [], logger)
def collect():
    logger.info(f"Downloading Table: {URL}")
    df = pd.read_html(URL, attrs=attrs)
    logger.info(f"Number of tables found: {len(df)}")

    if len(df) != 1:
        return

    df = df[0]
    df.columns = t_names
    df['date_current'] = pd.to_datetime(df.date_current)
    df = df.sort_values('date_current', ascending=False)
    df = df.reset_index(drop=True)

    ###############################################################################################

    df = df[df.date_current == DATE]
    logger.info(f"Number of items after filter: {len(df)}")

    if len(df) == 0:
        raise Exception("Data not up to date.")

    _connector.write("treasuryrates", df)
    df.to_csv(f"{DATA}.csv", index=False)

    ###############################################################################################

    r_map = df.iloc[-1, 1:].values
    r_map = np.array([0] + r_map.tolist())

    chs = CubicHermiteSpline(t_map, r_map, [0] * len(t_map))

    rm_df = pd.DataFrame()
    rm_df['days_to_expiry'] = np.arange(0, 365 * 10 + 1).astype(int)
    rm_df['rate'] = chs(rm_df.days_to_expiry.values)
    rm_df['date_current'] = DATE

    _connector.write("treasuryratemap", rm_df)

    return df
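# A minimal, standalone sketch of the rate-map construction in collect(): a CubicHermiteSpline
# with zero slopes at the tenor points is evaluated at every integer day to expiry. The tenors
# and rates below are hypothetical; the real t_map and table values come from the scrape above.
def _rate_map_sketch():
    import numpy as np
    from scipy.interpolate import CubicHermiteSpline

    t_map_demo = np.array([0.0, 30.0, 90.0, 365.0])  # tenors in days (hypothetical)
    r_map_demo = np.array([0.00, 0.45, 0.90, 1.40])  # rates in percent (hypothetical)
    spline = CubicHermiteSpline(t_map_demo, r_map_demo, np.zeros_like(t_map_demo))

    days = np.arange(0, 365 + 1)
    return spline(days)  # one interpolated rate per day to expiry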
def main(): logger.info(f"SCRAPER,STORE,INITIATED,,") try: aggregate() compress() send_to_bucket(BUCKET_PREFIX, BUCKET_NAME, f"{DATE}.tar.xz", f"{DIR}/financial_data", logger=logger) remove() logger.info(f"SCRAPER,STORE,SUCCESS,,") except Exception as e: logger.warning(f"SCRAPER,STORE,FAILURE,{e},") logger.info(f"SCRAPER,STORE,TERMINATED,,")
def on_close():
    for group in parallel_group:
        feed_threads[group].on_close()
        logger.info(f"RSS,Thread,Closed,{job_id} - {group}")
def download_company_names():
    for file in DATA.iterdir():
        if file.name == '.gitignore':
            continue
        file.unlink()

    ###############################################################################################

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1024,768")
    options.add_argument("--no-sandbox")
    options.add_experimental_option("prefs", {"download.default_directory": str(DATA)})
    driver = webdriver.Chrome(options=options)

    logger.info("Getting web page...")
    driver.get("http://eoddata.com")

    username_input = driver.find_element_by_id("ctl00_cph1_lg1_txtEmail")
    password_input = driver.find_element_by_id("ctl00_cph1_lg1_txtPassword")
    login_button = driver.find_element_by_id("ctl00_cph1_lg1_btnLogin")

    username_input.send_keys("zQuantz")
    password_input.send_keys("Street101!")

    logger.info("Logging in...")
    login_button.click()

    logger.info("Getting download page...")
    driver.get("http://eoddata.com/symbols.aspx")

    for exchange in EXCHANGES:
        logger.info(f"Downloading: {exchange}")
        exchange_selector = Select(driver.find_element_by_id("ctl00_cph1_cboExchange"))
        exchange_selector.select_by_value(exchange)
        time.sleep(5)
        download_button = driver.find_element_by_id("ctl00_cph1_ch1_divLink")
        download_button = download_button.find_element_by_tag_name("a")
        download_button.click()
        time.sleep(5)

    ###############################################################################################

    df = []
    for file in DATA.iterdir():
        if file.name == '.gitignore':
            continue
        df.append(pd.read_csv(file, delimiter="\t"))
        df[-1]['exchange'] = file.name[:-4]

    df = pd.concat(df)
    df.columns = ['ticker', 'name', 'exchange']
    df = df.sort_values('ticker').reset_index(drop=True)

    ###############################################################################################

    # Filter the listings: drop long tickers, ambiguous duplicate name/exchange pairs,
    # and share-class style ticker suffixes.
    df = df[df.ticker.str.len() <= 6]

    combo = df.name + " " + df.exchange
    vcs = combo.value_counts()
    df = df[combo.isin(vcs[vcs == 1].index)]

    df = df[df.ticker.str.count("\\.") <= 1]

    # NASDAQ tickers longer than 4 characters are kept only for A/B/C share classes.
    ndaq = df[df.exchange == 'NASDAQ']
    ndaq = ndaq[ndaq.ticker.str.len() > 4]
    mods = ndaq.ticker.str[-1]
    mods = ndaq[~mods.isin(['A', 'B', 'C'])]
    df = df[~df.index.isin(mods.index)]

    ticker_mods = df[df.ticker.str.count("\\.") == 1]
    mod = ticker_mods.ticker.str.split("\\.").str[-1]
    ticker_mods = ticker_mods[~mod.isin(["A", "B", "C"])]
    df = df[~df.index.isin(ticker_mods.index)]

    df = df[~df.ticker.str[0].str.isnumeric()]
    df = df[~df.ticker.str[-1].str.isnumeric()]

    mods = df[df.ticker.str.count("-") == 1].ticker
    mods = mods[~mods.str[-1].isin(['A', 'B', 'C'])]
    df = df[~df.index.isin(mods.index)]

    return df
def index_data(batch_id, tickers):
    try:
        # options, ohlc = [], []
        analysis, keystats = [], []

        # for file in (DATA / "options").iterdir():
        #     ticker = file.name.split('_')[0]
        #     if ticker not in tickers:
        #         continue
        #     options.append(pd.read_csv(file))

        # for file in (DATA / "ohlc").iterdir():
        #     ticker = file.name.split('_')[0]
        #     if ticker not in tickers:
        #         continue
        #     ohlc.append(pd.read_csv(file).iloc[:1, :])

        for file in (DATA / "analysis").iterdir():
            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue
            analysis.append(pd.read_csv(file))

        for file in (DATA / "keystats").iterdir():
            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue
            keystats.append(pd.read_csv(file))

        pre = _connector.get_equities_table_count().row_count

        # if len(options) > 0:
        #     options = pd.concat(options)
        #     _connector.write("options", options)

        # if len(ohlc) > 0:
        #     ohlc = pd.concat(ohlc)
        #     _connector.write("ohlc", ohlc)

        if len(analysis) > 0:
            _connector.write("analysis", pd.concat(analysis))

        if len(keystats) > 0:
            _connector.write("keystats", pd.concat(keystats))

        # if len(options) > 0 and len(ohlc) > 0:
        #     cols = ["date_current", "ticker", "adjclose_price"]
        #     options = options.merge(ohlc[cols], on=cols[:2], how="inner")
        #     options = options.rename({"adjclose_price" : "stock_price"}, axis=1)
        #     options = options.merge(CONFIG['ratemap'], on="days_to_expiry", how="inner")

        #     zsurface, surface = calculate_surface(options, CONFIG['reg_expirations'])
        #     zsurface['date_current'], surface['date_current'] = DATE, DATE

        #     info = f"{zsurface.ticker.nunique()}/{options.ticker.nunique()}"
        #     logger.info(f"SCRAPER,{batch_id},zSURFACE ({len(zsurface)}),{info}")

        #     info = f"{surface.ticker.nunique()}/{options.ticker.nunique()}"
        #     logger.info(f"SCRAPER,{batch_id},SURFACE ({len(surface)}),{info}")

        #     _connector.write("zsurface", zsurface)
        #     _connector.write("surface", surface)

        post = _connector.get_equities_table_count().row_count

        db_stats = (pre.tolist(), post.tolist())
        db_flag = 1
        logger.info(f"SCRAPER,{batch_id},INDEXING,SUCCESS,")

    except Exception as e:
        logger.warning(f"SCRAPER,{batch_id},INDEXING,FAILURE,{e}")
        print_exc()
        db_stats = ([0] * 4, [0] * 4)
        db_flag = 0

    return db_flag, db_stats
raw_path = Path(f"{DIR}/news_data") files = list(raw_path.iterdir()) files.remove(raw_path / ".gitignore") now = datetime.now() [ file.unlink() for file in files if check_file(file, now) ] n_items, n_unique = save_items(path, SDATE) send_metric(CONFIG, "clean_count", "int64_value", n_items) send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique) send_to_bucket( CONFIG['GCP']['CLEAN_BUCKET'], 'news', xz_file, logger=logger ) logger.info(f"RSS save successeful.") send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1) except Exception as e: logger.warning(f"RSS save failed. {e}, {format_exc()}") send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 0)
def init_folders():
    os.mkdir(f'{DIR}/News_data/{date_today}')


if __name__ == '__main__':

    init_folders()

    for ticker in TICKERS:
        try:
            get_news(ticker)
            logger.info('%s:Completed', ticker)
            ticker_list.append(ticker)
            current_complete = (len(ticker_list) / len(TICKERS)) * 100
            logger.info('Current Percentage: %f %s', current_complete, percent)
        except Exception as e:
            logger.warning('Error Message: %s:%s', ticker, e)
            continue

    percent_successful = (len(ticker_list) / len(TICKERS)) * 100
    logger.info('Percentage of successful tickers: %f %s', percent_successful, percent)

# logging information
# log_path = "/home/zqretrace/scripts/merge_logs/CNBC_Merged_logs/merge_logs_CNBC.log"
logging.basicConfig(
from pathlib import Path

import pandas as pd
import sys, os
import time

sys.path.append(f"{DIR}/..")
from utils import send_metric, send_email


def get_diff(df1, df2):
    # rows of df1 that do not appear, as whole rows, in df2
    return df1[~df1.apply(tuple, 1).isin(df2.apply(tuple, 1))]


if __name__ == '__main__':

    logger.info("company name downloader & curator initialized")
    metric = "company_names_success_indicator"

    try:
        ocomnames = pd.read_csv(f"{DIR}/data/company_names.csv")
        ocurnames = pd.read_csv(f"{DIR}/data/curated_company_names.csv")

        comnames = download_company_names()
        curnames = curate_company_names(comnames)

        new_comnames = get_diff(comnames, ocomnames)
        removed_comnames = get_diff(ocomnames, comnames)

        new_curnames = get_diff(curnames, ocurnames)
        removed_curnames = get_diff(ocurnames, curnames)
def sigterm_handler(signal_number, frame):
    logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
    on_close()
def curate_company_names(company_names):
    company_names['name'] = company_names.name.str.lower()
    logger.info(f"Initial {company_names.shape}")

    ## Suffix Section
    company_names = remove_suffix(company_names)
    logger.info(f"Remove Suffix {company_names.shape}")

    company_names = replace_special_cases(company_names)
    logger.info(f"Special Cases {company_names.shape}")

    company_names = remove_single_stop_and_english_words(company_names)
    logger.info(f"Single stop and english {company_names.shape}")

    ## Modifier Section
    company_names = remove_modifiers(company_names)
    logger.info(f"Modifiers {company_names.shape}")

    company_names = remove_modifier_stop_and_english_words(company_names)
    logger.info(f"Modifier stop and english {company_names.shape}")

    company_names = remove_modifier_duplicates(company_names)
    logger.info(f"Modifier Duplicates {company_names.shape}")

    ## Final cleaning
    exchanges = ["AMEX", "NASDAQ", "NYSE", "TSX", "TSXV", "LSE"]
    company_names = company_names[company_names.exchange.isin(exchanges)]
    logger.info(f"Exchange Filter {company_names.shape}")

    company_names = replace_all_synonyms(company_names)
    logger.info(f"Replace Synonyms {company_names.shape}")

    company_names = replace_indices(company_names)
    logger.info(f"Replace Indices {company_names.shape}")

    company_names = remove_all_number_names(company_names)
    logger.info(f"Remove Numbered Names {company_names.shape}")

    company_names = remove_short_names(company_names)
    logger.info(f"Remove Short Names {company_names.shape}")

    company_names = company_names[~company_names.name.isin(MANUAL_OVERRIDES)]
    logger.info(f"Remove Manual Overrides {company_names.shape}")

    company_names = remove_countries_and_currencies(company_names)
    logger.info(f"Remove Countries and Currencies {company_names.shape}")

    company_names = remove_english_two_grams(company_names)
    logger.info(f"Remove English Two Grams {company_names.shape}")

    company_names = remove_commodities(company_names)
    logger.info(f"Remove Commodities {company_names.shape}")

    ## Super special case of 'Target'. Add nicknames.
    company_names = company_names[company_names.name != 'target']
    logger.info(f"Remove Target {company_names.shape}")

    company_names = add_nicknames(company_names)
    logger.info(f"Added Nicknames {company_names.shape}")

    ## Sort and Save
    company_names = company_names.sort_values('ticker')
    company_names = company_names.drop_duplicates().reset_index(drop=True)
    logger.info(f"Drop dupes {company_names.shape}")

    return company_names
def compress_files():
    filedate = datetime.now() - timedelta(days=1)
    filedate = filedate.strftime('%Y-%m-%d')

    raw_txt = f'{DIR}/news_data_backup/{filedate}.txt'
    raw_tar = f'{DIR}/news_data_backup/{filedate}.tar.xz'

    files = os.listdir(f"{DIR}/news_data")
    files = [f"{DIR}/news_data/{file}" for file in files]
    files = sorted(files, key=os.path.getmtime)[::-1]
    files.remove(f"{DIR}/news_data/.gitignore")

    cfiles = os.listdir(f"{DIR}/cleaned_news_data")
    cfiles = [f"{DIR}/cleaned_news_data/{file}" for file in cfiles]
    cfiles = sorted(cfiles, key=os.path.getmtime)[::-1]
    cfiles.remove(f"{DIR}/cleaned_news_data/.gitignore")

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()
    sources, usources = dict(), dict()

    for file in files:
        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:
            ctr += 1

            item_ = item.copy()
            item_.pop("oscrap_acquisition_datetime")

            if 'oscrap_source' not in item_:
                continue

            source = item_['oscrap_source']
            if source in sources:
                sources[source] += 1
            else:
                sources[source] = 1

            hash_ = sha256(json.dumps(item_).encode()).hexdigest()
            if hash_ in hashes:
                continue

            if source in usources:
                usources[source] += 1
            else:
                usources[source] = 1

            data.append(item)
            hashes.add(hash_)

    logger.info(f"RSS,Storage,Data,{ctr}")
    logger.info(f"RSS,Storage,Unique Data,{len(hashes)}")

    send_gcp_metric(CONFIG, "rss_daily_item_uniques", "int64_value", len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_item_total", "int64_value", ctr)

    for source in sources:
        logger.info(f"RSS,Source Total,{source},{sources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_total", "int64_value", sources[source])

    for source in usources:
        logger.info(f"RSS,Source Uniques,{source},{usources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_uniques", "int64_value", usources[source])

    with open(raw_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(raw_tar, mode="x:xz") as tar_file:
        tar_file.add(raw_txt, arcname=os.path.basename(raw_txt))

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()

    for file in cfiles:
        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:
            ctr += 1
            hash_ = sha256(json.dumps(item).encode()).hexdigest()
            if hash_ in hashes:
                continue
            data.append(item)
            hashes.add(hash_)

    send_gcp_metric(CONFIG, "rss_daily_clean_uniques", "int64_value", len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_clean_total", "int64_value", ctr)

    cleaned_txt = f"{DIR}/cleaned_news_data/{filedate}.txt"
    cleaned_tar = cleaned_txt[:-4] + ".tar.xz"

    with open(cleaned_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(cleaned_tar, mode="x:xz") as tar_file:
        tar_file.add(cleaned_txt, arcname=os.path.basename(cleaned_txt))

    ###############################################################################################

    time.sleep(600)

    file_size = os.stat(raw_tar).st_size / 1_000_000
    if file_size > 0:
        for file in files:
            os.remove(file)
        os.remove(raw_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    file_size = os.stat(cleaned_tar).st_size / 1_000_000
    if file_size > 0:
        for file in cfiles:
            os.remove(file)
        os.remove(cleaned_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    return raw_tar, cleaned_tar
def cleaning_loop():
    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)
        n_clean_new = len(list(CLEAN_DIR.iterdir()))

        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:

            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

            if len(new_items) > 50:

                new_items = filter(ES_CLIENT, new_items)

                if len(new_items) != 0:

                    titles = [item['_source']['title'] for item in new_items]
                    print(f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files.")
                    scores = get_scores(titles)

                    for item, score in zip(new_items, scores):
                        item['_source']['sentiment'] = score['prediction']
                        item['_source']['sentiment_score'] = score['sentiment_score']
                        item['_source']['abs_sentiment_score'] = abs(score['sentiment_score'])

                    successes, failures = helpers.bulk(
                        ES_CLIENT,
                        new_items,
                        stats_only=True,
                        raise_on_error=False
                    )
                    print(successes, failures)

                    with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                        file.write(json.dumps(new_items))

                new_items = []

        ###########################################################################################

        if ctr % 10 == 0:
            try:
                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0
            except Exception as e:
                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
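# A minimal sketch of the deduplication idea in cleaning_loop(): hashing a stable subset of
# fields gives a deterministic Elasticsearch _id, so re-ingesting the same article with
# "_op_type": "create" fails as a duplicate instead of creating a second document.
def _deterministic_id_sketch(item):
    from hashlib import md5
    import json

    key_fields = {
        'title': item['title'],
        'article_source': item['article_source'],
        'published_datetime': item['published_datetime'][:10],
    }
    return md5(json.dumps(key_fields).encode()).hexdigest()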
    send_to_bucket(CONFIG['GCP']['RAW_BUCKET'], 'news', xz_file, logger=logger)
    send_to_bucket(CONFIG['GCP']['RAW_VAULT'], 'news', xz_file, logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)


if __name__ == '__main__':

    logger.info("news job, initializing")

    try:
        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)
    except Exception as e:
        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
def fix_faults(batch_id, tickers):

    def add_to_faults(key, obj, faults):
        for ticker in obj:
            try:
                faults[ticker][key] = obj[ticker]
            except Exception as e:
                faults[ticker] = {key: obj[ticker]}
        return faults

    def check_lower_bounds(tickers, product):
        lower_bounds = _connector.get_lower_bounds(f"{product}counts", batch_id)
        lower_bounds = lower_bounds.set_index("ticker")
        lower_bounds = lower_bounds.astype(int).to_dict()['lower_bound']

        unhealthy = {}
        for ticker in tickers:

            if ticker not in lower_bounds:
                continue

            file = (DATA / product / f"{ticker}_{DATE}.csv")
            if file.exists():
                df = pd.read_csv(file)
                if len(df) <= lower_bounds[ticker]:
                    unhealthy[ticker] = {
                        "lower_bound": lower_bounds[ticker],
                        "old": len(df),
                        "new": 0
                    }
            else:
                unhealthy[ticker] = {
                    "lower_bound": lower_bounds[ticker],
                    "old": 0,
                    "new": 0
                }

        return unhealthy

    def check_ohlc(tickers):
        tickers = _connector.get_distinct_ohlc_tickers(batch_id).ticker
        collected = [ticker.split("_")[0] for ticker in os.listdir(f"{DATA}/ohlc")]

        unhealthy = {}
        for ticker in tickers:
            if ticker not in collected:
                unhealthy[ticker] = {"status": 0, "new_status": 0}

        return unhealthy

    try:
        analysis_faults = check_lower_bounds(tickers, "analysis")
        keystats_faults = check_lower_bounds(tickers, "keystats")
        # options_faults = check_lower_bounds(tickers, "options")
        # ohlc_faults = check_ohlc(tickers)
        logger.info(f"SCRAPER,{batch_id},FAULTS,SUCCESS,")
    except Exception as e:
        logger.info(f"SCRAPER,{batch_id},FAULTS,FAILURE,{e}")

    faults = add_to_faults("analysis", analysis_faults, {})
    faults = add_to_faults("keystats", keystats_faults, faults)
    # faults = add_to_faults("options", options_faults, faults)
    # faults = add_to_faults("ohlc", ohlc_faults, faults)

    faults = collect_data_again(batch_id, faults)

    faults_summary = {
        key: {} for key in ["analysis", "keystats", "ohlc", "options"][:2]
    }
    for ticker in faults:
        for key in faults[ticker]:
            faults_summary[key][ticker] = faults[ticker][key]

    return faults_summary
def main():

    company_names = pd.read_csv(f"{DIR}/../clean/data/company_names.csv")
    company_names = company_names[['ticker', 'name']]
    company_names = pd.concat([company_names, buzzwords])
    company_names = company_names.reset_index(drop=True)
    chunks = np.array_split(company_names, 5)

    id_cache, ids = get_id_cache()

    errors = mp.Queue()
    processes = [
        mp.Process(target=collect_news, args=(job_id, chunk, id_cache, ids, errors))
        for job_id, chunk in enumerate(chunks)
    ]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

    if not errors.empty():
        error = errors.get()
        raise Exception(error)

    ###############################################################################################

    for file in IDSDIR.iterdir():
        if file.name == '.gitignore':
            continue
        with open(file, "r") as _file:
            id_cache[SDATE].extend(json.loads(_file.read()))

    id_cache[SDATE] = list(set(id_cache[SDATE]))
    n_items = len(id_cache[SDATE])
    n_unique = n_items

    ids = set([_id for idlist in id_cache.values() for _id in idlist])

    with open(f"{DIR}/data/id_cache.json", "w") as file:
        file.write(json.dumps(id_cache))

    ###############################################################################################

    backups = os.listdir(f"{DIR}/news_data_backup")
    xz_file = Path(f"{DIR}/news_data_backup/{SDATE}.tar.xz")

    if datetime.now().hour >= 10 and not xz_file.exists():

        logger.info("news job, daily save")
        n_items, n_unique = save_items(PATH, SDATE)

        if gethostname() != CONFIG['MACHINE']['HOSTNAME']:
            CONFIG['GCP']['RAW_BUCKET'] = "tmp_items"
            CONFIG['GCP']['RAW_VAULT'] = "tmp_items_vault"

        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'], 'news', xz_file, logger=logger)
        send_to_bucket(CONFIG['GCP']['RAW_VAULT'], 'news', xz_file, logger=logger)

        logger.info("sending metrics")
        send_metric(CONFIG, "news_count", "int64_value", n_items)
        send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)