Example #1
File: job.py Project: zQuantz/NewsLab
def main():

    logger.info(f"RSS,Job,Initated,{SDATE}")

    for file in os.listdir(f"{DIR}/pids"):

        if file == ".gitignore":
            continue

        os.remove(f"{DIR}/pids/{file}")

    os.system(f"touch {DIR}/pids/{os.getpid()}")

    group_keys = list(groups.keys())
    parallel_groups = [group_keys[0::2], group_keys[1::2]]

    try:

        Parallel(n_jobs=2)(
            delayed(parallel_job)(job_id, parallel_group)
            for job_id, parallel_group in enumerate(parallel_groups))

    except Exception as e:

        logger.warning(e)
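
The dispatch above is joblib's Parallel/delayed pattern: the group keys are interleaved into two halves and each half is handed to one worker. A minimal, self-contained sketch of the same fan-out, with a stand-in worker and hypothetical group names in place of the real `groups` dict:

# Minimal sketch of the joblib fan-out (assumes the joblib package is installed).
from joblib import Parallel, delayed

def parallel_job(job_id, group_names):
    # Stand-in worker; the real job starts one feed thread per group.
    return job_id, list(group_names)

group_keys = ["cnbc", "reuters", "bloomberg", "wsj"]    # hypothetical group names
parallel_groups = [group_keys[0::2], group_keys[1::2]]  # even / odd interleave

results = Parallel(n_jobs=2)(
    delayed(parallel_job)(job_id, parallel_group)
    for job_id, parallel_group in enumerate(parallel_groups))
print(results)  # [(0, ['cnbc', 'bloomberg']), (1, ['reuters', 'wsj'])]
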
Example #2
File: batch.py Project: zQuantz/OscraP
def collect_data_again(batch_id, faults):

    for i, ticker in enumerate(faults):

        try:

            retries = {
                key: key in faults[ticker]
                for key in ['analysis', 'keystats', 'ohlc', 'options'][:2]
            }

            ticker_obj = Ticker(ticker, logger, batch_id, retries,
                                faults[ticker])
            faults[ticker] = ticker_obj.fault_dict
            time.sleep(SLEEP)

            logger.info(f"{ticker},{batch_id},Re-Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Re-Ticker,Failure,{e}")

        pct = (i + 1) / len(faults)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},RE-PROGRESS,{pct}%,")

    return faults
Example #3
def replace_all_synonyms(df):

    while True:
        new_df = replace_synonyms(df)
        logger.info(f"Synonym Replacement. Old:{len(df)} New:{len(new_df)}")
        if len(new_df) == len(df):
            break
        df = new_df.copy()

    return df
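
This is a run-until-stable (fixed point) loop: the replacement is reapplied until the row count stops changing. A small sketch with a stand-in `replace_synonyms` that collapses one hypothetical synonym pair:

# Minimal sketch of the fixed-point loop (replace_synonyms is a stand-in here).
import pandas as pd

SYNONYMS = {"intl": "international"}          # hypothetical synonym map

def replace_synonyms(df):
    out = df.copy()
    out["name"] = out.name.replace(SYNONYMS)  # map synonyms onto a canonical name
    return out.drop_duplicates()

def replace_all_synonyms(df):
    while True:
        new_df = replace_synonyms(df)
        print(f"Synonym Replacement. Old:{len(df)} New:{len(new_df)}")
        if len(new_df) == len(df):            # fixed point: nothing collapsed
            break
        df = new_df.copy()
    return df

df = pd.DataFrame({"ticker": ["ABC", "ABC"], "name": ["intl", "international"]})
print(replace_all_synonyms(df))               # one row remains
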
Example #4
File: job.py Project: zQuantz/OscraP
def scrape(exchange_code, exchange_name):

    stats = []
    for letter in LETTERS:

        page = get_bs_obj(index="stocklist",
                          exchange=exchange_code,
                          symbol=letter)
        rows = page.find("table", {"class": "quotes"}).find_all("tr")[1:]

        if rows[0].find('td').text[0] != letter.upper():
            continue

        for row in rows:

            try:

                error = None

                ticker = row.find('td').text
                name = row.find("td", text=ticker).next_sibling.text

                ticker = ticker.replace('.', '-')
                market_cap = get_market_cap(ticker)
                sector, industry, instrument_type = get_sector_and_industry(
                    ticker)

            except Exception as e:

                market_cap = 0
                sector, industry, instrument_type = None, None, None
                error = e

            stats.append([
                ticker,
                name,
                exchange_code,
                exchange_name,
                sector,
                industry,
                instrument_type,
                round(market_cap, 3),
            ])

            log_entry = list(map(str, stats[-1]))
            log_entry = ','.join(log_entry)
            log_entry += ',' if not error else ',' + str(error)
            logger.info(log_entry)

            time.sleep(SLEEP)

    df = pd.DataFrame(stats, columns=COLUMNS)
    df.to_csv(f'{DATA}/{exchange_code}_tickers.csv', index=False)
Example #5
File: pivix.py Project: small0305/iBot
    def save(self, name, content):
        folder = self.timestamp.strftime('%Y-%m-%d')
        prefix = self.timestamp.strftime('%Y%m%d%H%M')  # %M = minutes; '%m' here would repeat the month
        path = os.path.join(IMAGE_DIR, folder)

        if not os.path.exists(path):
            os.mkdir(path)

        with open(os.path.join(path, prefix+'-'+name), 'wb') as f:
            f.write(content)

        logger.info('pivix file saved: %s %s' % (folder, name))
Example #6
File: job.py Project: zQuantz/OscraP
def main():

	logger.info(f"SCRAPER,JOB,INITIATED,{DATE},")

	init()

	tickers = _connector.get_equity_tickers(N_USD)
	checkpoint = len(tickers) / BATCH_SIZE
	checkpoint = int(checkpoint / 4)

	faults_summary = {
		"options" : {},
		"analysis" : {},
		"keystats" : {},
		"ohlc" : {}
	}

	db_flags, db_stats = [], []

	###############################################################################################

	for batch_id, batch in enumerate(range(BATCH_SIZE, len(tickers) + BATCH_SIZE, BATCH_SIZE)):

		ticker_batch = tickers[batch - BATCH_SIZE : batch]

		results = batch_main(batch_id, ticker_batch)
		b_fault_summary, b_db_flag, b_db_stats = results

		for key in b_fault_summary:
			for ticker in b_fault_summary[key]:
				faults_summary[key][ticker] = b_fault_summary[key][ticker]

		db_flags.append(b_db_flag)
		db_stats.append(b_db_stats)

		success, failure = get_job_success_rates(tickers[ : BATCH_SIZE * (1 + batch_id)])
		send_metrics(success, failure)

		# if batch_id % checkpoint == 0 and batch_id != 0:
		# 	report("Partial", success, failure, faults_summary, db_flags, db_stats)

	###############################################################################################

	success, failure = get_job_success_rates(tickers)
	report("Full", success, failure, faults_summary, db_flags, db_stats)

	store()

	logger.info(f"SCRAPER,JOB,TERMINATED,{DATE},")
Example #7
File: job.py Project: zQuantz/OscraP
def get_exchanges():

    page = get_bs_obj(index="stocklist", exchange="AMEX", symbol=LETTERS[0])
    select = page.find("select", {
        "id": "ctl00_cph1_cboExchange"
    }).find_all("option")

    exchanges = []
    for option in select:

        if option.text not in EXCHANGES:
            continue

        exchanges.append((option.get_attribute_list("value")[0], option.text))
        logger.info(','.join(exchanges[-1]))

    return exchanges
Example #8
File: job.py Project: zQuantz/NewsLab
def parallel_job(job_id, parallel_group):

    logger.info(f"RSS,Job,PID,{os.getpid()}")

    def on_close():

        for group in parallel_group:

            feed_threads[group].on_close()
            logger.info(f"RSS,Thread,Closed,{job_id} - {group}")

    def sigterm_handler(signal_number, frame):

        logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
        on_close()

    signal.signal(signal.SIGTERM, sigterm_handler)
    os.system(f"touch {DIR}/pids/{os.getpid()}")

    ###############################################################################################

    try:

        feed_threads = {}

        for i, group in enumerate(parallel_group):

            group, sleep = group, groups[group]
            group_coords = feeds[feeds.source.isin(group)]

            feed_threads[group] = Feeds(sources=group_coords.source.values,
                                        feeds=group_coords.feed.values,
                                        sleep=sleep,
                                        logger=logger)

            feed_threads[group].start()

            logger.info(f"RSS,Thread,Initiated,{job_id} - {group}")

    except Exception as e:

        logger.warning(f"RSS,Thread,Error,{job_id} - {e}")

        on_close()

        raise Exception(f"RSS,Job,Terminated,{job_id} - {e}")
Example #9
File: batch.py Project: zQuantz/OscraP
def main(batch_id, tickers):

    logger.info(f"SCRAPER,{batch_id},INITIATED,,")

    collect_data(batch_id, tickers)

    _connector.init_batch_tickers(batch_id, tickers)

    faults_summary = fix_faults(batch_id, tickers)

    db_flag, db_stats = index_data(batch_id, tickers)

    _connector.launch_derived_engine(batch_id)

    logger.info(f"SCRAPER,{batch_id},TERMINATED,,")

    return faults_summary, db_flag, db_stats
Example #10
def main(date):

    logger.info("News Stats Initiated")
    datestr = (date - timedelta(days=1)).isoformat()[:10]
    file = Path(f"{DIR}/data/{datestr}.csv")
    xz_file = file.with_suffix(".tar.xz")

    try:

        logger.info(f"Processing stats for {date}")

        pre_n, df = process_data(get_data(date))
        df = create_sector_tickers(df, get_ticker_info())
        pre_n += len(sector_to_ticker)

        df['date'] = datestr
        df['volume'] = df.volume.astype(int)
        df = df[['date', 'ticker', 'volume', 'sentiment']]

        logger.info(
            f"Processed stats for {len(df)} tickers. Collected {pre_n} items.")
        send_metric(CONFIG, "news_stats_ticker_pre_count", "int64_value",
                    pre_n)
        send_metric(CONFIG, "news_stats_ticker_post_count", "int64_value",
                    len(df))

        if len(df) == 0:
            raise Exception("Zero tickers after filtering")

        df.to_csv(file, index=False)
        with tar.open(xz_file, "x:xz") as tar_file:
            tar_file.add(file, arcname=file.name)

        send_to_bucket("daily_news_stats", "", xz_file, logger=logger)

        os.unlink(file)
        os.unlink(xz_file)

        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 1)

    except Exception as e:

        logger.info(f"News Stats Error - {e}")
        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 0)

    logger.info("News Stats Terminated")
Example #11
File: batch.py Project: zQuantz/OscraP
def collect_data(batch_id, tickers):

    for i, ticker in enumerate(tickers):

        try:

            Ticker(ticker, logger, batch_id)
            time.sleep(SLEEP)

            logger.info(f"{ticker},{batch_id},Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Ticker,Failure,{e}")

        pct = (i + 1) / len(tickers)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},PROGRESS,{pct}%,")
Example #12
File: job.py Project: zQuantz/OscraP
def store(df):

    try:

        df.to_csv(f"{DATA}/{DATE}.csv", index=False)

        with tar.open(f"{DATA}.tar.xz", "x:xz") as tar_file:
            for file in DATA.iterdir():
                tar_file.add(file, arcname=file.name)
            tar_file.add(f"{DIR}/log.log", arcname="log.log")

        send_to_bucket(BUCKET_PREFIX, BUCKET_NAME, f"{DATE}.tar.xz",
                       f"{DIR}/instrument_data", logger)

        for folder in (DIR / "instrument_data").iterdir():
            if folder.is_dir():
                shutil.rmtree(folder)

    except Exception as e:

        logger.info(f"Storage Error - {e}")
Example #13
def collect_news(job_id, company_names, id_cache, ids, errors):

    try:

        N = len(company_names)
        for i, data in enumerate(company_names.values):

            queries = ' '.join(data)
            progress = round(i / N * 100, 2)
            logger.info(f"collecting {queries}, {progress}%")

            ticker, company_name = data
            if ticker:
                fetch(ticker, id_cache, ids)
            fetch(company_name, id_cache, ids)

    except Exception as e:

        errors.put(f"{e}\n{traceback.format_exc()}")

    with open(IDSDIR / f"{job_id}.json", "w") as file:
        file.write(json.dumps(id_cache[SDATE]))
Example #14
File: pivix.py Project: small0305/iBot
    def run(self):
        rank = self.sess.get(self.RANK_URL)
        soup = BeautifulSoup(rank.content, 'html.parser', from_encoding='utf-8')

        ranking_items = soup.find_all('section',
                                      {'class': 'ranking-item'},
                                      limit=self.topN)

        for item in ranking_items:
            url = item.find('img')['data-src']
            url = re.sub(r'(c/|240x480/|_master1200)', '', url)
            url = url.replace('master', 'original')

            r = self.sess.get(url, headers={'Referer': self.ILLUSTRATION_URL})
            logger.info('access pivix url: %s status: %d' % (url, r.status_code))
            if r.status_code == 404:
                url = re.sub(r'jpg$', 'png', url)
                r = self.sess.get(url, headers={'Referer': self.ILLUSTRATION_URL})
                logger.info('retry access pivix url: %s status: %d' % (url, r.status_code))
            creator = re.sub(r'[^A-Za-z0-9]+', '', item['data-user-name'])
            file_name = item['data-id']+'-'+creator+'.'+url[-3:]
            self.save(file_name, r.content)
Example #15
def splits():

	logger.info(f"SCRAPER,SPLITS,INITIATED,")

	now = datetime.now()
	report_df = pd.DataFrame()
	dt = datetime(now.year, now.month, 1).strftime("%m/%d/%Y")

	try:

		df = process(dt)
		store(df)

		_connector.execute(f"DELETE FROM stocksplitstmp{MODIFIER};")
		_connector.write(f"stocksplitstmp{MODIFIER}", df)
		_connector.execute("""
				INSERT IGNORE INTO
					stocksplits{modifier}
				SELECT
					*
				FROM
					stocksplitstmp{modifier};
			""".format(modifier=MODIFIER))

		df = df[df.ex_date == DATE]
		if len(df) != 0:

			logger.info(f"SCRAPER,SPLITS,ADJUSTING,{len(df)}")
			_connector.register_splits(P_COLUMNS, MODIFIER)
			_connector.adjust_splits(MODIFIER)
		
		metric = 1
		title_modifier = "SUCCESS"
		logger.info(f"SCRAPER,SPLITS,TERMINATED,{len(df)}")

	except Exception as e:

		metric = 0
		title_modifier = "FAILURE"
		logger.warning(f"SCRAPER,SPLITS,FAILURE,{e}")

	###############################################################################################

	report = _connector.read("""
			SELECT
				*
			FROM
				stocksplitstatus{modifier}
			WHERE
				ex_date = "{date}"
		""".format(modifier=MODIFIER, date=DATE))

	send_gcp_metric(CONFIG, "splits_success_indicator", "int64_value", metric)
	send_email(CONFIG, f"{title_modifier} - Stock Splits", report.to_html(), [], logger)
Example #16
File: job.py Project: zQuantz/OscraP
def collect():

	logger.info(f"Downloading Table: {URL}")
	df = pd.read_html(URL, attrs=attrs)
	logger.info(f"Number of tables found: {len(df)}")

	if len(df) != 1:
		return

	df = df[0]
	df.columns = t_names

	df['date_current'] = pd.to_datetime(df.date_current)
	df = df.sort_values('date_current', ascending=False)
	df = df.reset_index(drop=True)

	###############################################################################################

	df = df[df.date_current == DATE]
	logger.info(f"Number of items after filter: {len(df)}")

	if len(df) == 0:
		raise Exception("Data not up to date.")

	_connector.write("treasuryrates", df)
	df.to_csv(f"{DATA}.csv", index=False)

	###############################################################################################

	r_map = df.iloc[-1, 1:].values
	r_map = np.array([0] + r_map.tolist())
	chs = CubicHermiteSpline(t_map, r_map, [0]*len(t_map))

	rm_df = pd.DataFrame()
	rm_df['days_to_expiry'] = np.arange(0, 365 * 10 + 1).astype(int)
	rm_df['rate'] = chs(rm_df.days_to_expiry.values)
	rm_df['date_current'] = DATE

	_connector.write("treasuryratemap", rm_df)

	return df
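
The rate map interpolates the treasury curve with scipy's CubicHermiteSpline, pinning every knot's derivative to zero just as the `[0]*len(t_map)` argument does above. A sketch with made-up maturities and rates:

# Minimal sketch of the rate-map interpolation (assumes numpy, pandas and scipy).
import numpy as np
import pandas as pd
from scipy.interpolate import CubicHermiteSpline

t_map = np.array([0, 30, 90, 180, 365, 730])            # hypothetical maturities (days)
r_map = np.array([0.0, 0.08, 0.10, 0.12, 0.15, 0.20])   # hypothetical rates

chs = CubicHermiteSpline(t_map, r_map, np.zeros(len(t_map)))  # flat tangents at each knot

rm_df = pd.DataFrame()
rm_df['days_to_expiry'] = np.arange(0, 365 * 2 + 1).astype(int)
rm_df['rate'] = chs(rm_df.days_to_expiry.values)
print(rm_df.tail())
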
Example #17
def main():

    logger.info(f"SCRAPER,STORE,INITIATED,,")

    try:

        aggregate()
        compress()

        send_to_bucket(BUCKET_PREFIX,
                       BUCKET_NAME,
                       f"{DATE}.tar.xz",
                       f"{DIR}/financial_data",
                       logger=logger)

        remove()

        logger.info(f"SCRAPER,STORE,SUCCESS,,")

    except Exception as e:

        logger.warning(f"SCRAPER,STORE,FAILURE,{e},")

    logger.info(f"SCRAPER,STORE,TERMINATED,,")
Example #18
File: job.py Project: zQuantz/NewsLab
    def on_close():

        for group in parallel_group:

            feed_threads[group].on_close()
            logger.info(f"RSS,Thread,Closed,{job_id} - {group}")
Example #19
def download_company_names():

    for file in DATA.iterdir():
        if file.name == '.gitignore':
            continue
        file.unlink()

    ###############################################################################################

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1024,768")
    options.add_argument("--no-sandbox")
    options.add_experimental_option("prefs",
                                    {"download.default_directory": str(DATA)})

    driver = webdriver.Chrome(options=options)

    logger.info("Getting web page...")
    driver.get("http://eoddata.com")

    username_input = driver.find_element_by_id("ctl00_cph1_lg1_txtEmail")
    password_input = driver.find_element_by_id("ctl00_cph1_lg1_txtPassword")
    login_button = driver.find_element_by_id("ctl00_cph1_lg1_btnLogin")

    username_input.send_keys("zQuantz")
    password_input.send_keys("Street101!")

    logger.info("Logging in...")
    login_button.click()

    logger.info("Getting download page...")
    driver.get("http://eoddata.com/symbols.aspx")

    for exchange in EXCHANGES:

        logger.info(f"Downloading: {exchange}")

        exchange_selector = Select(
            driver.find_element_by_id("ctl00_cph1_cboExchange"))
        exchange_selector.select_by_value(exchange)

        time.sleep(5)

        download_button = driver.find_element_by_id("ctl00_cph1_ch1_divLink")
        download_button = download_button.find_element_by_tag_name("a")
        download_button.click()

        time.sleep(5)

    ###############################################################################################

    df = []
    for file in DATA.iterdir():
        if file.name == '.gitignore':
            continue
        df.append(pd.read_csv(file, delimiter="\t"))
        df[-1]['exchange'] = file.name[:-4]

    df = pd.concat(df)
    df.columns = ['ticker', 'name', 'exchange']
    df = df.sort_values('ticker').reset_index(drop=True)

    ###############################################################################################

    df = df[df.ticker.str.len() <= 6]

    combo = df.name + " " + df.exchange
    vcs = combo.value_counts()
    df = df[combo.isin(vcs[vcs == 1].index)]
    df = df[df.ticker.str.count("\\.") <= 1]

    ndaq = df[df.exchange == 'NASDAQ']
    ndaq = ndaq[ndaq.ticker.str.len() > 4]
    mods = ndaq.ticker.str[-1]
    mods = ndaq[~mods.isin(['A', 'B', 'C'])]
    df = df[~df.index.isin(mods.index)]

    ticker_mods = df[df.ticker.str.count("\\.") == 1]
    mod = ticker_mods.ticker.str.split("\\.").str[-1]
    ticker_mods = ticker_mods[~mod.isin(["A", "B", "C"])]
    df = df[~df.index.isin(ticker_mods.index)]

    df = df[~df.ticker.str[0].str.isnumeric()]
    df = df[~df.ticker.str[-1].str.isnumeric()]

    mods = df[df.ticker.str.count("-") == 1].ticker
    mods = mods[~mods.str[-1].isin(['A', 'B', 'C'])]
    df = df[~df.index.isin(mods.index)]

    return df
Example #20
File: batch.py Project: zQuantz/OscraP
def index_data(batch_id, tickers):

    try:

        # options, ohlc = [], []
        analysis, keystats = [], []

        # for file in (DATA/"options").iterdir():

        # 	ticker = file.name.split('_')[0]
        # 	if ticker not in tickers:
        # 		continue

        # 	options.append(pd.read_csv(file))

        # for file in (DATA/"ohlc").iterdir():

        # 	ticker = file.name.split('_')[0]
        # 	if ticker not in tickers:
        # 		continue

        # 	ohlc.append(pd.read_csv(file).iloc[:1, :])

        for file in (DATA / "analysis").iterdir():

            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue

            analysis.append(pd.read_csv(file))

        for file in (DATA / "keystats").iterdir():

            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue

            keystats.append(pd.read_csv(file))

        pre = _connector.get_equities_table_count().row_count

        # if len(options) > 0:
        # 	options = pd.concat(options)
        # 	_connector.write("options", options)

        # if len(ohlc) > 0:
        # 	ohlc = pd.concat(ohlc)
        # 	_connector.write("ohlc", ohlc)

        if len(analysis) > 0:
            _connector.write("analysis", pd.concat(analysis))

        if len(keystats) > 0:
            _connector.write("keystats", pd.concat(keystats))

        # if len(options) > 0 and len(ohlc) > 0:

        # 	cols = ["date_current", "ticker", "adjclose_price"]
        # 	options = options.merge(ohlc[cols], on=cols[:2], how="inner")
        # 	options = options.rename({"adjclose_price" : "stock_price"}, axis=1)
        # 	options = options.merge(CONFIG['ratemap'], on="days_to_expiry", how="inner")

        # 	zsurface, surface = calculate_surface(options, CONFIG['reg_expirations'])
        # 	zsurface['date_current'], surface['date_current'] = DATE, DATE

        # 	info = f"{zsurface.ticker.nunique()}/{options.ticker.nunique()}"
        # 	logger.info(f"SCRAPER,{batch_id},zSURFACE ({len(zsurface)}),{info}")

        # 	info = f"{surface.ticker.nunique()}/{options.ticker.nunique()}"
        # 	logger.info(f"SCRAPER,{batch_id},SURFACE ({len(surface)}),{info}")

        # 	_connector.write("zsurface", zsurface)
        # 	_connector.write("surface", surface)

        post = _connector.get_equities_table_count().row_count

        db_stats = (pre.tolist(), post.tolist())
        db_flag = 1

        logger.info(f"SCRAPER,{batch_id},INDEXING,SUCCESS,")

    except Exception as e:

        logger.warning(f"SCRAPER,{batch_id},INDEXING,FAILURE,{e}")
        print_exc()

        db_stats = ([0] * 4, [0] * 4)
        db_flag = 0

    return db_flag, db_stats
Example #21
File: save.py Project: zQuantz/NewsLab
		raw_path = Path(f"{DIR}/news_data")
		files = list(raw_path.iterdir())
		files.remove(raw_path / ".gitignore")

		now = datetime.now()
		[
			file.unlink()
			for file in files
			if check_file(file, now)
		]

		n_items, n_unique = save_items(path, SDATE)
		send_metric(CONFIG, "clean_count", "int64_value", n_items)
		send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique)

		send_to_bucket(
			CONFIG['GCP']['CLEAN_BUCKET'],
			'news',
			xz_file,
			logger=logger
		)

		logger.info(f"RSS save successeful.")
		send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1)

	except Exception as e:

		logger.warning(f"RSS save failed. {e}, {format_exc()}")
		send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 0)
Example #22

def init_folders():

    os.mkdir(f'{DIR}/News_data/{date_today}')


if __name__ == '__main__':

    init_folders()

    for ticker in TICKERS:

        try:
            get_news(ticker)
            logger.info('%s:Completed', ticker)
            ticker_list.append(ticker)
            current_complete = (len(ticker_list) / len(TICKERS)) * 100
            logger.info('Current Percentage: %f %s', current_complete, percent)

        except Exception as e:
            logger.warning('Error Message: %s:%s', ticker, e)
            continue

    percent_successful = (len(ticker_list) / len(TICKERS)) * 100
    logger.info('Percentage of successful tickers: %f  %s', percent_successful,
                percent)

# logging information #
log_path = "/home/zqretrace/scripts/merge_logs/CNBC_Merged_logs/merge_logs_CNBC.log"
logging.basicConfig(
Example #23
from pathlib import Path
import pandas as pd
import sys, os
import time

sys.path.append(f"{DIR}/..")
from utils import send_metric, send_email


def get_diff(df1, df2):
    return df1[~df1.apply(tuple, 1).isin(df2.apply(tuple, 1))]


if __name__ == '__main__':

    logger.info("company name downloader & curator initialized")
    metric = "company_names_success_indicator"

    try:

        ocomnames = pd.read_csv(f"{DIR}/data/company_names.csv")
        ocurnames = pd.read_csv(f"{DIR}/data/curated_company_names.csv")

        comnames = download_company_names()
        curnames = curate_company_names(comnames)

        new_comnames = get_diff(comnames, ocomnames)
        removed_comnames = get_diff(ocomnames, comnames)

        new_curnames = get_diff(curnames, ocurnames)
        removed_curnames = get_diff(ocurnames, curnames)
Example #24
File: job.py Project: zQuantz/NewsLab
    def sigterm_handler(signal_number, frame):

        logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
        on_close()
Example #25
def curate_company_names(company_names):

    company_names['name'] = company_names.name.str.lower()
    logger.info(f"Initial {company_names.shape}")

    ## Suffix Section
    company_names = remove_suffix(company_names)
    logger.info(f"Remove Suffix {company_names.shape}")

    company_names = replace_special_cases(company_names)
    logger.info(f"Special Cases {company_names.shape}")

    company_names = remove_single_stop_and_english_words(company_names)
    logger.info(f"Single stop and english {company_names.shape}")

    ## Modifier Section

    company_names = remove_modifiers(company_names)
    logger.info(f"Modifiers {company_names.shape}")

    company_names = remove_modifier_stop_and_english_words(company_names)
    logger.info(f"Modifier stop and english {company_names.shape}")

    company_names = remove_modifier_duplicates(company_names)
    logger.info(f"Modifier Duplicates {company_names.shape}")

    ## Final cleaning
    exchanges = ["AMEX", "NASDAQ", "NYSE", "TSX", "TSXV", "LSE"]
    company_names = company_names[company_names.exchange.isin(exchanges)]
    logger.info(f"Exchange Filter {company_names.shape}")

    company_names = replace_all_synonyms(company_names)
    logger.info(f"Replace Synonyms {company_names.shape}")

    company_names = replace_indices(company_names)
    logger.info(f"Replace Indices {company_names.shape}")

    company_names = remove_all_number_names(company_names)
    logger.info(f"Remove Numbered Names {company_names.shape}")

    company_names = remove_short_names(company_names)
    logger.info(f"Remove Short Names {company_names.shape}")

    company_names = company_names[~company_names.name.isin(MANUAL_OVERRIDES)]
    logger.info(f"Remove Manual Overrides {company_names.shape}")

    company_names = remove_countries_and_currencies(company_names)
    logger.info(f"Remove Countries and Currencies {company_names.shape}")

    company_names = remove_english_two_grams(company_names)
    logger.info(f"Remove English Two Grams {company_names.shape}")

    company_names = remove_commodities(company_names)
    logger.info(f"Remove Commodities {company_names.shape}")

    ## Super special case of 'Target'. Add nicknames
    company_names = company_names[company_names.name != 'target']
    logger.info(f"Remove Target {company_names.shape}")

    company_names = add_nicknames(company_names)
    logger.info(f"Added Nicknames {company_names.shape}")

    ## Sort and Save
    company_names = company_names.sort_values('ticker')
    company_names = company_names.drop_duplicates().reset_index(drop=True)
    logger.info(f"Drop dupes {company_names.shape}")

    return company_names
Example #26
File: store.py Project: mpucci92/NewsLab
def compress_files():

    filedate = datetime.now() - timedelta(days=1)
    filedate = filedate.strftime('%Y-%m-%d')

    raw_txt = f'{DIR}/news_data_backup/{filedate}.txt'
    raw_tar = f'{DIR}/news_data_backup/{filedate}.tar.xz'

    files = os.listdir(f"{DIR}/news_data")
    files = [f"{DIR}/news_data/{file}" for file in files]
    files = sorted(files, key=os.path.getmtime)[::-1]
    files.remove(f"{DIR}/news_data/.gitignore")

    cfiles = os.listdir(f"{DIR}/cleaned_news_data")
    cfiles = [f"{DIR}/cleaned_news_data/{file}" for file in cfiles]
    cfiles = sorted(cfiles, key=os.path.getmtime)[::-1]
    cfiles.remove(f"{DIR}/cleaned_news_data/.gitignore")

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()
    sources, usources = dict(), dict()
    for file in files:

        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:

            ctr += 1

            item_ = item.copy()
            item_.pop("oscrap_acquisition_datetime")

            if 'oscrap_source' not in item_:
                continue

            source = item_['oscrap_source']
            if source in sources:
                sources[source] += 1
            else:
                sources[source] = 1

            hash_ = sha256(json.dumps(item_).encode()).hexdigest()

            if hash_ in hashes:
                continue

            if source in usources:
                usources[source] += 1
            else:
                usources[source] = 1

            data.append(item)
            hashes.add(hash_)

    logger.info(f"RSS,Storage,Data,{ctr}")
    logger.info(f"RSS,Storage,Unique Data,{len(hashes)}")

    send_gcp_metric(CONFIG, "rss_daily_item_uniques", "int64_value",
                    len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_item_total", "int64_value", ctr)

    for source in sources:

        logger.info(f"RSS,Source Total,{source},{sources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_total",
                        "int64_value", sources[source])

    for source in usources:

        logger.info(f"RSS,Source Uniques,{source},{usources[source]}")
        metric_name = source.lower().replace(" ", "_")
        send_gcp_metric(CONFIG, f"{metric_name}_daily_item_uniques",
                        "int64_value", usources[source])

    with open(raw_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(raw_tar, mode="x:xz") as tar_file:
        tar_file.add(raw_txt, arcname=os.path.basename(raw_txt))

    ###############################################################################################

    ctr = 0
    data, hashes = list(), set()
    for file in cfiles:

        with open(file, "r") as data_file:
            items = json.loads(data_file.read())

        for item in items:

            ctr += 1

            hash_ = sha256(json.dumps(item).encode()).hexdigest()

            if hash_ in hashes:
                continue

            data.append(item)
            hashes.add(hash_)

    send_gcp_metric(CONFIG, "rss_daily_clean_uniques", "int64_value",
                    len(hashes))
    send_gcp_metric(CONFIG, "rss_daily_clean_total", "int64_value", ctr)

    cleaned_txt = f"{DIR}/cleaned_news_data/{filedate}.txt"
    cleaned_tar = cleaned_txt[:-4] + ".tar.xz"

    with open(cleaned_txt, "w") as file:
        file.write(json.dumps(data))

    with tar.open(cleaned_tar, mode="x:xz") as tar_file:
        tar_file.add(cleaned_txt, arcname=os.path.basename(cleaned_txt))

    ###############################################################################################

    time.sleep(600)

    file_size = os.stat(raw_tar).st_size / 1_000_000
    if file_size > 0:
        for file in files:
            os.remove(file)
        os.remove(raw_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    file_size = os.stat(cleaned_tar).st_size / 1_000_000
    if file_size > 0:
        for file in cfiles:
            os.remove(file)
        os.remove(cleaned_txt)
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    return raw_tar, cleaned_tar
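
Deduplication here keys each raw item on the SHA-256 of its JSON after dropping the acquisition timestamp, so the same story fetched twice collapses to one record. A standard-library sketch with two hypothetical items:

# Minimal sketch of the hash-based dedup (standard library only).
import json
from hashlib import sha256

items = [                                             # hypothetical news items
    {"title": "Example headline", "oscrap_source": "CNBC",
     "oscrap_acquisition_datetime": "2021-01-01T01:00:00"},
    {"title": "Example headline", "oscrap_source": "CNBC",
     "oscrap_acquisition_datetime": "2021-01-01T02:00:00"},  # same story, later fetch
]

data, hashes = [], set()
for item in items:
    item_ = item.copy()
    item_.pop("oscrap_acquisition_datetime")          # ignore fetch time when comparing
    hash_ = sha256(json.dumps(item_).encode()).hexdigest()
    if hash_ in hashes:
        continue
    data.append(item)
    hashes.add(hash_)

print(len(items), len(data))                          # 2 1
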
Example #27
File: clean.py Project: zQuantz/NewsLab
def cleaning_loop():

    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)
        n_clean_new = len(list(CLEAN_DIR.iterdir()))

        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:
            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

        if len(new_items) > 50:
            new_items = filter(ES_CLIENT, new_items)

        if len(new_items) != 0:

            titles = [item['_source']['title'] for item in new_items]
            print(
                f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files."
            )
            scores = get_scores(titles)

            for item, score in zip(new_items, scores):
                item['_source']['sentiment'] = score['prediction']
                item['_source']['sentiment_score'] = score['sentiment_score']
                item['_source']['abs_sentiment_score'] = abs(
                    score['sentiment_score'])

            successes, failures = helpers.bulk(ES_CLIENT,
                                               new_items,
                                               stats_only=True,
                                               raise_on_error=False)

            print(successes, failures)
            with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                file.write(json.dumps(new_items))

            new_items = []

        ###########################################################################################

        if ctr % 10 == 0:

            try:

                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0

            except Exception as e:

                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
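
The indexer builds a deterministic `_id` from an md5 of the title, source, publish date and optional summary, then ships the items with the elasticsearch-py bulk helper. A sketch of the action construction (the cluster address is hypothetical and the bulk call is left commented out):

# Minimal sketch of the deterministic _id scheme behind the "create" actions.
import json
from hashlib import md5

item = {                                        # hypothetical cleaned article
    "title": "Example headline",
    "article_source": "CNBC",
    "published_datetime": "2021-01-01T09:30:00",
    "summary": "Example summary.",
}

dummy_item = {
    "title": item["title"],
    "article_source": item["article_source"],
    "published_datetime": item["published_datetime"][:10],   # date part only
}
if "summary" in item:
    dummy_item["summary"] = item["summary"]

_id = md5(json.dumps(dummy_item).encode()).hexdigest()
action = {"_index": "news", "_id": _id, "_op_type": "create", "_source": item}
print(action["_id"])

# Requires the elasticsearch package and a reachable cluster:
# from elasticsearch import Elasticsearch, helpers
# es = Elasticsearch("http://localhost:9200")               # hypothetical address
# helpers.bulk(es, [action], stats_only=True, raise_on_error=False)
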
Example #28
        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'],
                       'news',
                       xz_file,
                       logger=logger)

        send_to_bucket(CONFIG['GCP']['RAW_VAULT'],
                       'news',
                       xz_file,
                       logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)


if __name__ == '__main__':

    logger.info("news job, initializing")

    try:

        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)

    except Exception as e:

        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
Example #29
File: batch.py Project: zQuantz/OscraP
def fix_faults(batch_id, tickers):
    def add_to_faults(key, obj, faults):
        for ticker in obj:
            try:
                faults[ticker][key] = obj[ticker]
            except Exception as e:
                faults[ticker] = {key: obj[ticker]}
        return faults

    def check_lower_bounds(tickers, product):

        lower_bounds = _connector.get_lower_bounds(f"{product}counts",
                                                   batch_id)
        lower_bounds = lower_bounds.set_index("ticker")
        lower_bounds = lower_bounds.astype(int).to_dict()['lower_bound']

        unhealthy = {}
        for ticker in tickers:

            if ticker not in lower_bounds:
                continue

            file = (DATA / product / f"{ticker}_{DATE}.csv")
            if file.exists():

                df = pd.read_csv(file)
                if len(df) <= lower_bounds[ticker]:

                    unhealthy[ticker] = {
                        "lower_bound": lower_bounds[ticker],
                        "old": len(df),
                        "new": 0
                    }

            else:

                unhealthy[ticker] = {
                    "lower_bound": lower_bounds[ticker],
                    "old": 0,
                    "new": 0
                }

        return unhealthy

    def check_ohlc(tickers):

        tickers = _connector.get_distinct_ohlc_tickers(batch_id).ticker

        collected = [
            ticker.split("_")[0] for ticker in os.listdir(f"{DATA}/ohlc")
        ]

        unhealthy = {}
        for ticker in tickers:

            if ticker not in collected:

                unhealthy[ticker] = {"status": 0, "new_status": 0}

        return unhealthy

    try:

        analysis_faults = check_lower_bounds(tickers, "analysis")
        keystats_faults = check_lower_bounds(tickers, "keystats")
        # options_faults = check_lower_bounds(tickers, "options")
        # ohlc_faults = check_ohlc(tickers)

        logger.info(f"SCRAPER,{batch_id},FAULTS,SUCCESS,")

    except Exception as e:

        logger.info(f"SCRAPER,{batch_id},FAULTS,FAILURE,{e}")

    faults = add_to_faults("analysis", analysis_faults, {})
    faults = add_to_faults("keystats", keystats_faults, faults)
    # faults = add_to_faults("options", options_faults, faults)
    # faults = add_to_faults("ohlc", ohlc_faults, faults)
    faults = collect_data_again(batch_id, faults)

    faults_summary = {
        key: {}
        for key in ["analysis", "keystats", "ohlc", "options"][:2]
    }

    for ticker in faults:
        for key in faults[ticker]:
            faults_summary[key][ticker] = faults[ticker][key]

    return faults_summary
Example #30
def main():

    company_names = pd.read_csv(f"{DIR}/../clean/data/company_names.csv")
    company_names = company_names[['ticker', 'name']]

    company_names = pd.concat([company_names, buzzwords])
    company_names = company_names.reset_index(drop=True)

    chunks = np.array_split(company_names, 5)
    id_cache, ids = get_id_cache()

    errors = mp.Queue()

    processes = [
        mp.Process(target=collect_news,
                   args=(job_id, chunk, id_cache, ids, errors))
        for job_id, chunk in enumerate(chunks)
    ]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

    if not errors.empty():
        error = errors.get()
        raise Exception(error)

    ###############################################################################################

    for file in IDSDIR.iterdir():

        if file.name == '.gitignore':
            continue

        with open(file, "r") as _file:
            id_cache[SDATE].extend(json.loads(_file.read()))

    id_cache[SDATE] = list(set(id_cache[SDATE]))
    n_items = len(id_cache[SDATE])
    n_unique = n_items

    ids = set([_id for idlist in id_cache.values() for _id in idlist])

    with open(f"{DIR}/data/id_cache.json", "w") as file:
        file.write(json.dumps(id_cache))

    ###############################################################################################

    backups = os.listdir(f"{DIR}/news_data_backup")
    xz_file = Path(f"{DIR}/news_data_backup/{SDATE}.tar.xz")

    if datetime.now().hour >= 10 and not xz_file.exists():

        logger.info("news job, daily save")
        n_items, n_unique = save_items(PATH, SDATE)

        if gethostname() != CONFIG['MACHINE']['HOSTNAME']:
            CONFIG['GCP']['RAW_BUCKET'] = "tmp_items"
            CONFIG['GCP']['RAW_VAULT'] = "tmp_items_vault"

        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'],
                       'news',
                       xz_file,
                       logger=logger)

        send_to_bucket(CONFIG['GCP']['RAW_VAULT'],
                       'news',
                       xz_file,
                       logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)