def action(self, response):
    try:
        scraped_cluster = ClusterScraper(response)
    except Exception as e:
        utils.Report.critical(
            'Could not obtain cluster name! {}'.format(e))
        return

    # Save the metrics to Datadog
    for host_name, host_metrics in scraped_cluster.metrics.items():
        for resource_name, resource_metrics in host_metrics.items():
            for name, value in resource_metrics.items():

                metric_name = 'compose.cluster.host.{}.{}'.format(
                    resource_name, name)
                metric_unit = ClusterScraper.UNITS_FROM_NAME[name]

                utils.send_metric(
                    metric_name=metric_name,
                    metric_description=metric_name + ' description',
                    metric_unit=metric_unit,
                    metric_value=value,
                    cluster_name=scraped_cluster.name,
                    host_name=host_name)

    yield dict(
        cluster_name=scraped_cluster.name,
        cluster_metrics=scraped_cluster.metrics)
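# Illustrative sketch only: the nested shape the loops above assume for
# ClusterScraper.metrics (host -> resource -> metric name -> value) and the
# Datadog metric names derived from it. Host, resource, and metric names
# below are invented examples, not values produced by the real scraper.
_example_metrics = {
    "host-0": {
        "cpu": {"usage": 12.5},
        "memory": {"used": 2048.0},
    },
}

def _example_metric_names(metrics):
    """Yield (host, metric_name) pairs exactly as the loop above builds them."""
    for host_name, host_metrics in metrics.items():
        for resource_name, resource_metrics in host_metrics.items():
            for name in resource_metrics:
                yield host_name, 'compose.cluster.host.{}.{}'.format(
                    resource_name, name)

# list(_example_metric_names(_example_metrics))
# -> [('host-0', 'compose.cluster.host.cpu.usage'),
#     ('host-0', 'compose.cluster.host.memory.used')]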
def main(date):

    logger.info("News Stats Initiated")

    datestr = (date - timedelta(days=1)).isoformat()[:10]
    file = Path(f"{DIR}/data/{datestr}.csv")
    xz_file = file.with_suffix(".tar.xz")

    try:

        logger.info(f"Processing stats for {date}")

        pre_n, df = process_data(get_data(date))
        df = create_sector_tickers(df, get_ticker_info())
        pre_n += len(sector_to_ticker)

        df['date'] = datestr
        df['volume'] = df.volume.astype(int)
        df = df[['date', 'ticker', 'volume', 'sentiment']]

        logger.info(
            f"Processed stats for {len(df)} tickers. Collected {pre_n} items.")
        send_metric(CONFIG, "news_stats_ticker_pre_count", "int64_value", pre_n)
        send_metric(CONFIG, "news_stats_ticker_post_count", "int64_value", len(df))

        if len(df) == 0:
            raise Exception("Zero tickers after filtering")

        df.to_csv(file, index=False)
        with tar.open(xz_file, "x:xz") as tar_file:
            tar_file.add(file, arcname=file.name)

        send_to_bucket("daily_news_stats", "", xz_file, logger=logger)

        os.unlink(file)
        os.unlink(xz_file)

        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 1)

    except Exception as e:

        logger.info(f"News Stats Error - {e}")
        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 0)

    logger.info("News Stats Terminated")
path = Path(f"{DIR}/clean_data") xz_file = Path(f"{DIR}/clean_data_backup/{SDATE}.tar.xz") raw_path = Path(f"{DIR}/news_data") files = list(raw_path.iterdir()) files.remove(raw_path / ".gitignore") now = datetime.now() [ file.unlink() for file in files if check_file(file, now) ] n_items, n_unique = save_items(path, SDATE) send_metric(CONFIG, "clean_count", "int64_value", n_items) send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique) send_to_bucket( CONFIG['GCP']['CLEAN_BUCKET'], 'news', xz_file, logger=logger ) logger.info(f"RSS save successeful.") send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1) except Exception as e: logger.warning(f"RSS save failed. {e}, {format_exc()}")
    removed_curnames = get_diff(ocurnames, curnames)

    comnames.to_csv(f"{DIR}/data/company_names.csv", index=False)
    curnames.to_csv(f"{DIR}/data/curated_company_names.csv", index=False)

    body = '\n'.join([
        "New Company Names",
        new_comnames.to_html(index=False),
        "\nRemoved Company Names",
        removed_comnames.to_html(index=False),
        "\nNew Curated Names",
        new_curnames.to_html(index=False),
        "\nRemoved Curated Names",
        removed_curnames.to_html(index=False),
    ])

    n = new_comnames.shape[0] + removed_comnames.shape[0]
    n += new_curnames.shape[0] + removed_curnames.shape[0]

    if n > 0:
        send_email(CONFIG, "Company Name Summary", body, [], logger)

    logger.info("company name downloader & curator successful")
    send_metric(CONFIG, metric, "int64_value", 1)

except Exception as e:

    logger.warning(
        f"company name downloader & curator failed, {e}, {format_exc()}")
    send_metric(CONFIG, metric, "int64_value", 0)

logger.info("company name downloader & curator terminated")
def main():

    company_names = pd.read_csv(f"{DIR}/../clean/data/company_names.csv")
    company_names = company_names[['ticker', 'name']]
    company_names = pd.concat([company_names, buzzwords])
    company_names = company_names.reset_index(drop=True)

    chunks = np.array_split(company_names, 5)
    id_cache, ids = get_id_cache()

    errors = mp.Queue()
    processes = [
        mp.Process(target=collect_news, args=(job_id, chunk, id_cache, ids, errors))
        for job_id, chunk in enumerate(chunks)
    ]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

    if not errors.empty():
        error = errors.get()
        raise Exception(error)

    ###############################################################################################

    for file in IDSDIR.iterdir():

        if file.name == '.gitignore':
            continue

        with open(file, "r") as _file:
            id_cache[SDATE].extend(json.loads(_file.read()))

    id_cache[SDATE] = list(set(id_cache[SDATE]))
    n_items = len(id_cache[SDATE])
    n_unique = n_items

    ids = set([_id for idlist in id_cache.values() for _id in idlist])

    with open(f"{DIR}/data/id_cache.json", "w") as file:
        file.write(json.dumps(id_cache))

    ###############################################################################################

    backups = os.listdir(f"{DIR}/news_data_backup")
    xz_file = Path(f"{DIR}/news_data_backup/{SDATE}.tar.xz")

    if datetime.now().hour >= 10 and not xz_file.exists():

        logger.info("news job, daily save")
        n_items, n_unique = save_items(PATH, SDATE)

        if gethostname() != CONFIG['MACHINE']['HOSTNAME']:
            CONFIG['GCP']['RAW_BUCKET'] = "tmp_items"
            CONFIG['GCP']['RAW_VAULT'] = "tmp_items_vault"

        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'], 'news', xz_file, logger=logger)
        send_to_bucket(CONFIG['GCP']['RAW_VAULT'], 'news', xz_file, logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)
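# Hedged sketch of the id_cache layout the merge step above assumes: a
# JSON-serialisable dict of date string -> list of item ids, flattened into
# one set for cross-day de-duplication. The dates and ids below are invented
# examples, not data produced by get_id_cache().
_example_id_cache = {
    "2021-01-01": ["a1", "b2"],
    "2021-01-02": ["b2", "c3"],
}
_example_ids = {_id for idlist in _example_id_cache.values() for _id in idlist}
# _example_ids == {"a1", "b2", "c3"}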
if __name__ == '__main__':

    logger.info("news job, initializing")

    try:
        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)
    except Exception as e:
        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
def cleaning_loop():

    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)
        n_clean_new = len(list(CLEAN_DIR.iterdir()))

        # A drop in CLEAN_DIR's file count signals a reset: forget the seen
        # files and reload the cleaning helpers.
        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:

            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

            # Batch flush: once ~50 items have accumulated, filter them,
            # score sentiment, bulk-index, and write a clean-file copy.
            if len(new_items) > 50:

                new_items = filter(ES_CLIENT, new_items)

                if len(new_items) != 0:

                    titles = [item['_source']['title'] for item in new_items]
                    print(f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files.")

                    scores = get_scores(titles)
                    for item, score in zip(new_items, scores):
                        item['_source']['sentiment'] = score['prediction']
                        item['_source']['sentiment_score'] = score['sentiment_score']
                        item['_source']['abs_sentiment_score'] = abs(
                            score['sentiment_score'])

                    successes, failures = helpers.bulk(ES_CLIENT,
                                                       new_items,
                                                       stats_only=True,
                                                       raise_on_error=False)
                    print(successes, failures)

                    with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                        file.write(json.dumps(new_items))

                new_items = []

        ###########################################################################################

        if ctr % 10 == 0:
            try:
                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0
            except Exception as e:
                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
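# Hedged sketch of the de-duplication scheme used above: the Elasticsearch
# document id is the md5 of a minimal dict (title, source, publication date,
# and summary when present), so the same story re-read from disk maps to the
# same _id and "_op_type": "create" rejects the duplicate. The item below is
# an invented example.
import json
from hashlib import md5

_example_item = {
    "title": "Example headline",
    "article_source": "example-feed",
    "published_datetime": "2021-01-01",
}
_example_id = md5(json.dumps(_example_item).encode()).hexdigest()
# The same title/source/date always yields the same _example_id.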