def main():
    """Re-crawl 2020 monthly TWSE trading records for every stock id listed
    in the double-check CSV and write each record to its per-stock Mongo
    collection.

    NOTE(review): this module defines `main` more than once; later
    definitions shadow this one at import time.
    """
    # redis subscriber index:0-3
    client = mon.mongo_connection('linode1', 'mongo')
    with open('/Users/huangyiling/Github/stock/double_check_stock1.csv', newline='') as file:
        for row in csv.reader(file):
            # each CSV row is a list of cells; join them into one id string
            stock_id = ''.join(row)
            print(f"get stock {stock_id}")
            coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
            for month in range(1, 13):
                padded_month = str(month).zfill(2)
                url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=2020{padded_month}01&stockNo={stock_id}"""
                print(f"-- Crawler >>> {url}")
                documents = crawler.crawler(url)
                if documents:
                    for item in documents:
                        # store the crawled record into mongo
                        mon.insert_document(coll_stock, item)
                    print(f'stock: {stock_id} in 2020{padded_month} insert done.')
                time.sleep(10)
                print(f'stock: {stock_id} in 2020{padded_month} crawl done.')
def count_url_check():
    """Compare the expected number of crawler URLs with the count actually
    stored in the Mongo `crawlerURL` collection.
    """
    # 938 stocks * 11 years * 12 months
    expected_count = 938 * 11 * 12
    collection = mon.mongo_collection(
        mon.mongo_connection('linode1', 'mongo'), 'stocks', 'crawlerURL')
    # BUG FIX: the original assigned the bound method `count_documents`
    # without calling it, so "Reality" printed a method repr, never a number.
    reality_count = collection.count_documents({})
    print("Expected :", expected_count)
    print("Reality :", reality_count)
def all_stock_id():
    """Return every stock's {'_id': ...} document from the stockInfo
    collection and print how many there are.
    """
    mongo_client = mon.mongo_connection('linode1', 'mongo')
    info_coll = mon.mongo_collection(mongo_client, 'stocks', "stockInfo")
    contents = list(info_coll.find({}, {'_id': 1}))
    # NOTE(review): this loop has no effect (leftover from a debug print)
    for doc in contents:
        stock_id = doc['_id']
    print("amount of stocks:", len(contents))
    return contents
def count_stocks():
    """Print the total stock count summed across all industry documents."""
    client = mon.mongo_connection('linode1', 'mongo')
    industry_coll = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    total = 0
    for doc in industry_coll.find({}, {'stocks_count': 1}):
        total += doc['stocks_count']
    print('stocks_count :', total)
def season_records():
    """Aggregate yearly records (2010-2020) per stock into Postgres tables
    named year<stock_id>.

    NOTE(review): despite its name this builds *year* tables; the module
    defines a second `season_records` below which shadows this one.
    """
    # check today to appoint duration
    yesterday = datetime.date.today() - datetime.timedelta(1)
    year = yesterday.strftime("%Y")
    # set mongodb and postgres connection
    client = mon.mongo_connection('linode1', 'mongo')
    conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
    cursor_pos = pos.make_cursor(connection=conn_pos)
    # BUG FIX: the original iterated allStockID.all_stock_id() twice, one
    # loop nested in the other, redoing the whole body once per stock; a
    # single pass is the intended behavior.
    for item in allStockID.all_stock_id():
        stock_id = item['_id']
        # create table if not exists with table name like year+stock_id
        sql = f"""
            create table if not exists year{stock_id} (
                ID varchar(8) primary key,
                duration char(4),
                sum_volumn decimal,
                avg_price decimal,
                avg_open decimal,
                avg_high decimal,
                avg_low decimal,
                avg_close decimal,
                avg_change decimal,
                sum_trade decimal
            );
        """
        pos.createTable(connection=conn_pos, cursor=cursor_pos, sql=sql)
        # iterately insert data into table
        for year in range(2010, 2021):
            dur_records = duration_records.get_year_record_mongo(stock_id, year)
            if dur_records:
                docs = compute_records.compute_records(dur_records)
                # store the computed result into postgres
                print(f"{stock_id}: {year}")
                print(docs)
                # BUG FIX: `stock_id + year` concatenated str with int and
                # raised TypeError; the loop year must be stringified first.
                query = f"""
                    INSERT INTO year{stock_id}
                    (ID, duration, sum_volumn, sum_trade, avg_price, avg_open,
                     avg_high, avg_low, avg_close, avg_change)
                    VALUES
                    ('{stock_id + str(year)}', '{year}', {docs['sumVolume']},
                     {docs['sumTrades']}, {docs['avgPrice']}, {docs['avgOpen']},
                     {docs['avgHigh']}, {docs['avgLow']}, {docs['avgClose']},
                     {docs['avgChange']})
                """
                if not conn_pos:
                    conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
                    cursor_pos = pos.make_cursor(connection=conn_pos)
                pos.insertTable(connection=conn_pos, cursor=cursor_pos,
                                query=query, exceptionfile='year')
    pos.close_connection(connection=conn_pos)
    mon.close_connection(client=client)
def season_records():
    """Aggregate the just-closed quarter's records per stock into Postgres
    tables named season<stock_id>.
    """
    # check today to appoint duration
    yesterday = datetime.date.today() - datetime.timedelta(1)
    year = yesterday.strftime("%Y")
    month = yesterday.strftime("%m")
    # BUG FIX: strftime("%m") is zero-padded ('03', '06', '09'), but the
    # original dict was keyed '3'/'6'/'9', so every quarter except December
    # raised KeyError. Keys are now zero-padded to match.
    season_dict = {'03': '01', '06': '02', '09': '03', '12': '04'}
    # still raises KeyError on non-quarter-end months — presumably this job
    # only runs on quarter boundaries (TODO confirm scheduling)
    season = season_dict[month]
    # set mongodb and postgres connection
    client = mon.mongo_connection('linode1', 'mongo')
    conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
    cursor_pos = pos.make_cursor(connection=conn_pos)
    # stock id
    for item in allStockID.all_stock_id():
        stock_id = item['_id']
        # create table if not exists with table name like season+stock_id
        sql = f"""
            create table if not exists season{stock_id} (
                ID varchar(10) primary key,
                duration char(6),
                sum_volumn decimal,
                avg_price decimal,
                avg_open decimal,
                avg_high decimal,
                avg_low decimal,
                avg_close decimal,
                avg_change decimal,
                sum_trade decimal
            );
        """
        pos.createTable(connection=conn_pos, cursor=cursor_pos, sql=sql)
        dur_records = duration_records.get_season_record_mongo(stock_id, year, season)
        if dur_records:
            docs = compute_records.compute_records(dur_records)
            # store the computed result into postgres
            print(f"{stock_id}: {year}, season: {season}")
            print(docs)
            query = f"""
                INSERT INTO season{stock_id}
                (ID, duration, sum_volumn, sum_trade, avg_price, avg_open,
                 avg_high, avg_low, avg_close, avg_change)
                VALUES
                ('{stock_id + year + season}', '{year + season.zfill(2)}',
                 {docs['sumVolume']}, {docs['sumTrades']}, {docs['avgPrice']},
                 {docs['avgOpen']}, {docs['avgHigh']}, {docs['avgLow']},
                 {docs['avgClose']}, {docs['avgChange']})
            """
            if not conn_pos:
                conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
                cursor_pos = pos.make_cursor(connection=conn_pos)
            pos.insertTable(connection=conn_pos, cursor=cursor_pos,
                            query=query, exceptionfile='season')
    pos.close_connection(connection=conn_pos)
    mon.close_connection(client=client)
def main():
    """Redis subscriber (index 0-3): consume stock ids published to redis,
    crawl their monthly TWSE records for 2010 through 2021-02 into Mongo,
    and advance per-stock crawl status flags (2 = taken, 3 = done).
    """
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    while True:
        # get keys and values from redis
        keys = red.redis_get_all_kv(redisConnect)
        for key in keys:
            amount = int(os.environ.get("amount"))  # amount of subscribers
            index = int(os.environ.get("index"))  # subscriber num
            num = int(key.split('No_')[-1])  # redis key
            # decide which record this subscriber consumes
            if num % int(amount) == int(index):
                stock_id = red.redis_get_value(redisConnect, key)
                print(f"get stock {stock_id}")
                # remove consumed stock_id from redis
                red.redis_delete_key(redisConnect, key)
                # CONSISTENCY FIX: Collection.update() is deprecated and was
                # removed in pymongo 4; use update_one() like the rest of
                # this function already does.
                coll_stockInfo.update_one({'_id': stock_id}, {'$set': {
                    'crawlerStatus': 2
                }})  # marked as deleted from redis
                coll_stock = mon.mongo_collection(client, 'stocks',
                                                  f"stock{stock_id}")
                for year in range(2010, 2022):
                    for month in range(1, 13):
                        if year == 2021 and month > 2:
                            break
                        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(month).zfill(2)}01&stockNo={stock_id}"""
                        print(f"-- Crawler >>> {url}")
                        documents = crawler.crawler(url)
                        if documents:
                            for item in documents:
                                # store the crawled record into mongo
                                mon.insert_document(coll_stock, item)
                            coll_stockInfo.update_one({'_id': stock_id}, {
                                '$set': {
                                    'monthStatus': str(year) + str(month).zfill(2)
                                }
                            })  # month crawled
                            print(
                                f'stock: {stock_id} in {year}{month} insert done.'
                            )
                        time.sleep(10)
                        print(
                            f'stock: {stock_id} in {year}{month} crawl done.')
                    coll_stockInfo.update_one({'_id': stock_id},
                                              {'$set': {
                                                  'yearStatus': year
                                              }})  # year crawled
                coll_stockInfo.update_one({'_id': stock_id},
                                          {'$set': {
                                              'crawlerStatus': 3
                                          }})  # this stock fully crawled
def create_urls(stock_ids):
    """Insert one crawler-URL document per (stock, year, month) combination
    into the Mongo `crawlerURL` collection.

    NOTE(review): `year_list` and `month_list` are module-level globals.
    """
    db_client = mon.mongo_connection('linode1', 'mongo')
    url_coll = mon.mongo_collection(db_client, 'stocks', 'crawlerURL')
    for stock_id in stock_ids:
        for year in year_list:
            for month in month_list:
                padded = month.zfill(2)
                url = f'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year + padded}01&stockNo={stock_id}'
                mon.insert_document(url_coll, {
                    '_id': stock_id + year + padded,
                    'url': url,
                    'crawlerStatus': 0
                })
def month_records():
    """Compute monthly aggregates (2010-2020) per stock and insert them into
    Postgres tables named month<stock_id>.
    """
    # set mongodb and postgres connection
    client = mon.mongo_connection('linode1', 'mongo')
    conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
    cursor_pos = pos.make_cursor(connection=conn_pos)
    # stock id
    for entry in allStockID.all_stock_id():
        stock_id = entry['_id']
        # create table if not exists with table name like month+stock_id
        sql = f"""
            create table if not exists month{stock_id} (
                ID varchar(10) primary key,
                duration char(6),
                sum_volumn decimal,
                avg_price decimal,
                avg_open decimal,
                avg_high decimal,
                avg_low decimal,
                avg_close decimal,
                avg_change decimal,
                sum_trade decimal
            );
        """
        pos.createTable(connection=conn_pos, cursor=cursor_pos, sql=sql)
        # insert one aggregated row per (year, month)
        for yr in range(2010, 2021):
            for mo in range(1, 13):
                dur_records = duration_records.get_month_record_mongo(
                    stock_id, yr, mo)
                if not dur_records:
                    continue
                docs = compute_records.compute_records(dur_records)
                period = str(yr) + str(mo).zfill(2)
                # store the computed result into postgres
                print(f"{stock_id}: {period}")
                print(docs)
                query = f"""
                    INSERT INTO month{stock_id}
                    (ID, duration, sum_volumn, sum_trade, avg_price, avg_open,
                     avg_high, avg_low, avg_close, avg_change)
                    VALUES
                    ('{stock_id + period}', '{period}', {docs['sumVolume']},
                     {docs['sumTrades']}, {docs['avgPrice']}, {docs['avgOpen']},
                     {docs['avgHigh']}, {docs['avgLow']}, {docs['avgClose']},
                     {docs['avgChange']})
                """
                if not conn_pos:
                    conn_pos = pos.postgres_connection('linode1', 'postgres', 'stock')
                    cursor_pos = pos.make_cursor(connection=conn_pos)
                pos.insertTable(connection=conn_pos, cursor=cursor_pos,
                                query=query, exceptionfile='oldmonth')
    pos.close_connection(connection=conn_pos)
    mon.close_connection(client=client)
def industry_crawler():
    """Scrape the industry -> stock listings from cnyes.com and store one
    document per industry into the Mongo `stockIndustry` collection.
    """
    url = 'https://www.cnyes.com/twstock/stock_astock.aspx?ga=nav'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    industries = soup.select('div[id="kinditem_0"]>ul[class="kdlist"]>li')
    # PERF FIX: the original opened a fresh Mongo connection on every loop
    # iteration; connect once before the loop instead.
    mongo_client = mon.mongo_connection('linode1', 'mongo')
    mongo_collection = mon.mongo_collection(mongo_client, 'stocks', 'stockIndustry')
    # get all industries
    for industry in industries:
        industry_name = industry.a.text
        print(industry_name)
        industry_url = 'https://www.cnyes.com/twstock/' + industry.a["href"]
        print(industry_url)
        industry_id = industry_url.split('groupId=')[-1].split('&stitle')[0]
        # get all stocks from the industry
        res_stock = requests.get(industry_url, headers=headers)
        soup_stock = BeautifulSoup(res_stock.text, 'html.parser')
        stocks = soup_stock.select('div[class="TableBox"]>table>tr')
        stock_list = []
        stock_dict = dict()
        # first row is the table header, skip it
        for stock in stocks[1:]:
            stock_info = stock.find_all('td')
            stock_id = stock_info[1].text
            stock_name = stock_info[2].text
            stock_list.append(stock_id)
            stock_dict[stock_id] = stock_name
        industry_key_id = 'industry_' + industry_id
        doc = {
            '_id': industry_key_id,
            'industry': industry_kv[industry_name],  # industry_kv: module-level mapping
            'industry_name': industry_name,
            'stocks_list': stock_list,
            'stocks_count': len(stock_list),
            'stocks': stock_dict
        }
        mon.insert_document(mongo_collection, doc)
        time.sleep(20)
def stock_crawler(stock_ids_list):
    """Crawl 2010-2020 monthly TWSE trading tables for every stock listed in
    `stock_ids_list` (docs holding a 'stocks_list') and store each row as a
    document in that stock's Mongo collection.
    """
    for stocks in stock_ids_list:
        stock_ids = stocks['stocks_list']
        client = mon.mongo_connection('linode1', 'mongo')
        for stock_id in stock_ids:
            collection = mon.create_collection(client, 'stocks', f'stock{stock_id}')
            for year in range(10, 21):
                for month in [
                        '01', '02', '03', '04', '05', '06', '07', '08', '09',
                        '10', '11', '12'
                ]:
                    stock_url = f"https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=20{str(year)}{month}01&stockNo={stock_id}"
                    # `headers` is a module-level global
                    res = requests.get(stock_url, headers=headers)
                    soup = BeautifulSoup(res.text, "lxml")
                    table = soup.find_all('table')[0]
                    df = pd.read_html(str(table))[0]
                    for index in range(len(df)):
                        date = df.iat[index, 0]  # trade date, ROC calendar
                        # ROC year -> AD: '110/01/04' -> '20210104'
                        date_ad = str(1911 + int(date.split('/')[0])) + ''.join(
                            date.split('/')[1:])
                        volume = int(df.iat[index, 1])  # traded shares
                        price = float(df.iat[index, 2])  # traded value
                        open_ = float(df.iat[index, 3])  # open
                        high = float(df.iat[index, 4])  # high
                        low = float(df.iat[index, 5])  # low
                        close_ = float(df.iat[index, 6])  # close
                        change_ori = df.iat[index, 7]  # day-over-day change
                        # BUG FIX: TWSE prefixes the change with 'X' on
                        # ex-rights/ex-dividend days. The original only
                        # handled 'X0.00' and crashed on any other 'X...'
                        # value; strip the marker and parse the remainder
                        # ('X0.00' still yields 0.0, so this is backward
                        # compatible).
                        if isinstance(change_ori, str) and change_ori.startswith('X'):
                            change = float(change_ori[1:])
                        else:
                            change = float(change_ori)
                        trades = int(df.iat[index, 8])  # number of trades
                        doc = {
                            '_id': stock_id + date_ad,
                            'trade_date': date_ad,
                            'volume': volume,
                            'price': price,
                            'open': open_,
                            'high': high,
                            'low': low,
                            'close': close_,
                            'change': change,
                            'trades': trades
                        }
                        print(doc)
                        mon.insert_document(collection, doc)
                    time.sleep(20)
def get_month_record_mongo(stock_id, year, month):
    """Return the list of trade records of `stock_id` for the given year and
    month, or None if the stock has no records in that month.

    `month` may be an int 1-12 (as passed by month_records()).
    """
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    # BUG FIX: the month must be zero-padded -- the original regex for
    # (2021, 1) was "20211", which also matched October-December trade dates
    # ("20211004" contains "20211").
    duration = f"{year}{str(month).zfill(2)}"
    # BUG FIX: Cursor.count() was deprecated and removed in pymongo 4; use
    # count_documents on the collection instead.
    if collection.count_documents({"trade_date": {"$regex": duration}}) != 0:
        return list(
            collection.find({'_id': re.compile(f"{stock_id}{duration}")}))
    return None
def pick_proxy(amount=10):
    """Validate up to `amount` proxies from the Mongo proxy pool and fill
    every empty proxy1..proxy20 slot in redis (db=1) with each valid proxy.
    """
    client = mon.mongo_connection('linode1', 'mongo')
    pool = mon.mongo_collection(client, 'proxy', 'proxyPool_1')
    candidates = pool.find({}, {'ip': 1, 'port': 1}).limit(amount)
    # proxy use db=1
    redis_conn = red.redis_connection('linode1', 'redis', db=1)
    for doc in candidates:
        addr = doc['ip'] + ':' + doc['port']
        try:
            validate_proxy(addr)
            for slot in range(1, 21):
                slot_key = f'proxy{slot}'
                if not redis_conn.exists(slot_key):
                    red.redis_set_key_value(redis_conn, slot_key, addr)
        except Exception:
            # best-effort: silently skip proxies that fail validation
            pass
def check_records_exist():
    """Flag stocks with fewer than 5 trade records in 2021-02 by writing
    their ids to the double-check CSV.
    """
    client = mon.mongo_connection('linode1', 'mongo')
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        # BUG FIX: Cursor.count() was deprecated and removed in pymongo 4;
        # use count_documents on the collection instead.
        records_count = coll_stock.count_documents(
            {"trade_date": {"$regex": "202102"}})
        # fewer than 5 trading days in a month suggests missing data
        if records_count < 5:
            print(stock_id, records_count)
            wcsv.writeToCsv("double_check_stock", [stock_id])
def stockInfo():
    """Build one stockInfo document per stock from the stockIndustry docs,
    initializing all crawl-status fields to 0.
    """
    client = mon.mongo_connection('linode1', 'mongo')
    # BUG FIX: the original read `mon.mongo_collection(s client, ...)` -- the
    # stray `s` token was a SyntaxError that made the module unimportable.
    coll_stockIndustry = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    for item in mon.find_all_mongo(coll_stockIndustry):
        ids = item['stocks_list']
        for stock_id in ids:
            doc = {
                '_id': stock_id,
                'industry': item['_id'],
                'name': 'name',  # placeholder; full name not available here
                'abbreviation': item['stocks'][stock_id],
                'dailyStatus': 0,
                'monthStatus': 0,
                'yearStatus': 0
            }
            print(doc)
            mon.insert_document(coll_stockInfo, doc)
def get_season_record_mongo(stock_id, year, season):
    """Return the trade records of `stock_id` for the given quarter, or None.

    `season` may be an int 1-4 or a (possibly zero-padded) numeric string
    such as '01'..'04', which is what season_records() passes.
    """
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    print('count_documents:', collection.count_documents({}))
    # BUG FIX: callers pass season as a zero-padded string ('01'..'04'), but
    # the original compared it against ints (season == 1, ...) so every call
    # fell through and returned None. Normalize to int first; ints still
    # work, so this stays backward compatible.
    season_months = {
        1: ['01', '02', '03'],
        2: ['04', '05', '06'],
        3: ['07', '08', '09'],
        4: ['10', '11', '12'],
    }
    try:
        months = season_months[int(season)]
    except (KeyError, TypeError, ValueError):
        # unknown season value: keep the original "return None" contract
        return None
    return get_3_month_records(collection, stock_id, year, months)
def main():
    """Redis publisher: keep the stock_No_1..8 redis keys populated with
    not-yet-crawled stock ids (crawlerStatus 0 -> 1) until every stock has
    reached crawlerStatus == 3.
    """
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    while True:
        try:
            # check if key in redis doesn't exist
            for num in range(1, 9):
                key = f'stock_No_{num}'
                if not redisConnect.exists(key):
                    # BUG FIX: Cursor.count() and Collection.update() were
                    # deprecated and removed in pymongo 4; use
                    # count_documents() and update_one() instead.
                    if coll_stockInfo.count_documents({'crawlerStatus': 0}) != 0:
                        # some stock ids have not been published to redis yet
                        content = coll_stockInfo.find({
                            'crawlerStatus': 0
                        }, {
                            '_id': 1
                        }).limit(1)
                        stock_id = content[0]['_id']
                        print(f"{key} disapear >>> set {stock_id}")
                        # publish to redis
                        red.redis_set_key_value(redisConnect, key, stock_id)
                        # mark as published to redis
                        coll_stockInfo.update_one({'_id': stock_id},
                                                  {'$set': {
                                                      'crawlerStatus': 1
                                                  }})
                    else:
                        wcsv.writeToCsv('./data/publisherStatus', [
                            'all stock ids have published on redis',
                            datetime.datetime.now()
                        ])
                        break
                print(f"{key} still exist.")
            time.sleep(100)
            # stop publishing once every stock has finished crawling
            if coll_stockInfo.count_documents({'crawlerStatus': {'$ne': 3}}) == 0:
                print("== All stock crawlering done ==")
                break
        except Exception as e:
            wcsv.writeToCsv("./dataStore/redisException", [e])
            print(e)
def get_stock_ids():
    """Return the 'stocks_list' field of every stockIndustry document via the
    project helper find_some_fields_mongo.
    """
    db_client = mon.mongo_connection('linode1', 'mongo')
    industry_coll = mon.mongo_collection(db_client, 'stocks', 'stockIndustry')
    return mon.find_some_fields_mongo(industry_coll, ['stocks_list'])
def get_stock_ids():
    """Return a cursor over every stockIndustry document projected down to
    its 'stocks_list' field.

    NOTE(review): shadows the earlier get_stock_ids definition.
    """
    db_client = mon.mongo_connection('linode1', 'mongo')
    industry_coll = mon.mongo_collection(db_client, 'stocks', 'stockIndustry')
    return industry_coll.find({}, {'stocks_list': 1})
def get_year_record_mongo(stock_id, year):
    """Return all records of `stock_id` whose _id starts with stock_id+year
    (i.e. every trading day of that year).
    """
    db_client = mon.mongo_connection('linode1', 'mongo')
    stock_coll = mon.mongo_collection(db_client, 'stocks', f"stock{stock_id}")
    pattern = re.compile(f"{stock_id}{year}")
    return list(stock_coll.find({'_id': pattern}))
def crawler_daily():
    """Crawl the current month's records for every stock (retrying up to 3
    times per stock), stamp dailyStatus with today's date, notify progress
    via googlebot, and recurse until every stock is stamped.
    """
    counts = 0
    # notify daily updation starts
    goo.main('stock_crawler', 'Stocks Daily Updation Starts!')
    # start time
    t1 = datetime.datetime.now()
    # set daily status zero for default
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    coll_stockInfo.update_many({}, {'$set': {'dailyStatus': 0}})
    # today
    today = datetime.date.today()  # -datetime.timedelta(1)
    year = today.strftime("%Y")
    month = today.strftime("%m")
    day = today.strftime("%d")
    # get all stocks' id
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        print(stock_id)
        retry = 0
        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year}{month}01&stockNo={stock_id}"""
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        while retry < 3:
            try:
                contents = crawler.crawler(url)
                for item in contents:
                    # daily record to mongo
                    mon.insert_document(coll_stock, item)
                # crawling and writing to mongo done: stamp dailyStatus
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {
                        'dailyStatus': f"{year+month+day}"
                    }})
                counts += 1
                time.sleep(10)
                break
            except Exception as e:
                print(e)
                time.sleep(10)
                retry += 1
                if retry == 3:
                    # third failure: notify with googlebot and log to CSV
                    goo.main('stock_crawler',
                             f"{stock_id}, {year,month,day} Wrong: {e}")
                    wcsv.writeToCsv(
                        f'./dataStore/DailyCrawlerException_{today}',
                        [stock_id, year, month, day])
                    continue
    # check daily update done
    # BUG FIX: Cursor.count() was deprecated and removed in pymongo 4; use
    # count_documents on the collection instead.
    if coll_stockInfo.count_documents(
            {'dailyStatus': {'$ne': f"{year+month+day}"}}) != 0:
        crawler_daily()
    # notify daily updation done
    cost_time = datetime.datetime.now() - t1
    goo.main(
        'stock_crawler',
        f"{datetime.date.today()}: Daily Updation Finished!\nCheck amount of stock: {counts}, except: {938-counts}\nCost_time: {cost_time}"
    )
    return