def main():
    # redis subscriber, index: 0-3
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    while True:
        # get all keys and values from redis
        keys = red.redis_get_all_kv(redisConnect)
        for key in keys:
            amount = int(os.environ.get("amount"))  # number of subscribers
            index = int(os.environ.get("index"))  # this subscriber's number
            num = int(key.split('No_')[-1])  # number taken from the redis key
            # decide which records this subscriber should take
            if num % amount == index:
                stock_id = red.redis_get_value(redisConnect, key)
                print(f"get stock {stock_id}")
                # remove the key from redis once the stock_id has been taken
                red.redis_delete_key(redisConnect, key)
                # crawlerStatus 2: key has been removed from redis
                coll_stockInfo.update_one({'_id': stock_id},
                                          {'$set': {'crawlerStatus': 2}})
                coll_stock = mon.mongo_collection(client, 'stocks',
                                                  f"stock{stock_id}")
                for year in range(2010, 2022):
                    for month in range(1, 13):
                        if year == 2021 and month > 2:
                            break
                        url = (f"https://www.twse.com.tw/exchangeReport/STOCK_DAY"
                               f"?response=html&date={year}{str(month).zfill(2)}01"
                               f"&stockNo={stock_id}")
                        print(f"-- Crawler >>> {url}")
                        documents = crawler.crawler(url)
                        if documents:
                            for item in documents:
                                # write each crawled record into mongo
                                mon.insert_document(coll_stock, item)
                            # record that this month has been crawled
                            coll_stockInfo.update_one(
                                {'_id': stock_id},
                                {'$set': {'monthStatus': str(year) + str(month).zfill(2)}})
                            print(f'stock: {stock_id} in {year}{month} insert done.')
                        time.sleep(10)
                        print(f'stock: {stock_id} in {year}{month} crawl done.')
                    # record that this year has been crawled
                    coll_stockInfo.update_one({'_id': stock_id},
                                              {'$set': {'yearStatus': year}})
                # crawlerStatus 3: this stock has been fully crawled
                coll_stockInfo.update_one({'_id': stock_id},
                                          {'$set': {'crawlerStatus': 3}})
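# --- Usage sketch (assumption, not part of the original source) ---
# The subscriber above shards the redis keys with the two environment variables
# it reads; given the index range 0-3 noted in the comment, each worker would be
# started roughly like this (script name is hypothetical):
#   amount=4 index=0 python subscriber.py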
def main():
    # re-crawl the stock ids listed in the double-check csv
    client = mon.mongo_connection('linode1', 'mongo')
    with open('/Users/huangyiling/Github/stock/double_check_stock1.csv',
              newline='') as file:
        rows = csv.reader(file)
        for stock_id in rows:
            stock_id = ''.join(stock_id)
            print(f"get stock {stock_id}")
            coll_stock = mon.mongo_collection(client, 'stocks',
                                              f"stock{stock_id}")
            # for year in range(2010, 2021):
            #     # test whether data exists for this year
            #     test_month = 12
            #     test_url = (f"https://www.twse.com.tw/exchangeReport/STOCK_DAY"
            #                 f"?response=html&date={year}{str(test_month).zfill(2)}01"
            #                 f"&stockNo={stock_id}")
            #     print(f"test stock {stock_id} in {year} exist ?")
            #     test_docs = crawler.crawler(test_url)
            #     if test_docs:
            #         print("=> Yes, exist!")
            for month in range(1, 13):
                url = (f"https://www.twse.com.tw/exchangeReport/STOCK_DAY"
                       f"?response=html&date=2020{str(month).zfill(2)}01"
                       f"&stockNo={stock_id}")
                print(f"-- Crawler >>> {url}")
                documents = crawler.crawler(url)
                if documents:
                    # print(documents)
                    for item in documents:
                        # write each crawled record into mongo
                        mon.insert_document(coll_stock, item)
                    print(f'stock: {stock_id} in 2020{str(month).zfill(2)} insert done.')
                time.sleep(10)
                print(f'stock: {stock_id} in 2020{str(month).zfill(2)} crawl done.')
def count_url_check():
    # 938 stocks * 11 years * 12 months
    expected_count = 938 * 11 * 12
    collection = mon.mongo_collection(
        mon.mongo_connection('linode1', 'mongo'), 'stocks', 'crawlerURL')
    # reality_count = collection.find({}).count()
    reality_count = collection.count_documents({})
    print("Expected :", expected_count)
    print("Reality :", reality_count)
def count_stocks():
    stock_count = 0
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    contents = collection.find({}, {'stocks_count': 1})
    for item in contents:
        # sum the per-industry stock counts
        id_list_count = item['stocks_count']
        stock_count += id_list_count
    print('stocks_count :', stock_count)
def all_stock_id():
    client = mon.mongo_connection('linode1', 'mongo')
    collection_stock = mon.mongo_collection(client, 'stocks', "stockInfo")
    contents = list(collection_stock.find({}, {'_id': 1}))
    for item in contents:
        stock_id = item['_id']
        # print(stock_id)
    print("amount of stocks:", len(contents))
    return contents
def create_urls(stock_ids):
    mongoClient = mon.mongo_connection('linode1', 'mongo')
    mongoCollection = mon.mongo_collection(mongoClient, 'stocks', 'crawlerURL')
    for stock_id in stock_ids:
        for year in year_list:
            for month in month_list:
                url = (f'https://www.twse.com.tw/exchangeReport/STOCK_DAY'
                       f'?response=html&date={year + month.zfill(2)}01'
                       f'&stockNo={stock_id}')
                doc = {
                    '_id': stock_id + year + month.zfill(2),
                    'url': url,
                    'crawlerStatus': 0
                }
                mon.insert_document(mongoCollection, doc)
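# --- Usage sketch (assumption, not part of the original source) ---
# create_urls() relies on module-level year_list / month_list of strings that are
# not shown here; definitions along the following lines would pre-generate every
# crawler URL for 2010-2020:
# year_list = [str(y) for y in range(2010, 2021)]
# month_list = [str(m) for m in range(1, 13)]
# create_urls([doc['_id'] for doc in all_stock_id()])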
def industry_crawler():
    url = 'https://www.cnyes.com/twstock/stock_astock.aspx?ga=nav'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    # print(res.status_code)
    soup = BeautifulSoup(res.text, 'html.parser')
    # get all industries
    industries = soup.select('div[id="kinditem_0"]>ul[class="kdlist"]>li')
    for industry in industries:
        industry_name = industry.a.text
        print(industry_name)
        industry_url = 'https://www.cnyes.com/twstock/' + industry.a["href"]
        print(industry_url)
        industry_id = industry_url.split('groupId=')[-1].split('&stitle')[0]
        # get all stocks from the industry
        res_stock = requests.get(industry_url, headers=headers)
        # print(res_stock.status_code)
        soup_stock = BeautifulSoup(res_stock.text, 'html.parser')
        stocks = soup_stock.select('div[class="TableBox"]>table>tr')
        stock_list = []
        stock_dict = dict()
        for stock in stocks[1:]:
            stock_info = stock.find_all('td')
            stock_id = stock_info[1].text
            # print(stock_id)
            stock_name = stock_info[2].text
            # print(stock_name)
            stock_list.append(stock_id)
            stock_dict[stock_id] = stock_name
        industry_key_id = 'industry_' + industry_id
        doc = {
            '_id': industry_key_id,
            'industry': industry_kv[industry_name],
            'industry_name': industry_name,
            'stocks_list': stock_list,
            'stocks_count': len(stock_list),
            'stocks': stock_dict
        }
        # print(doc)
        mongo_client = mon.mongo_connection('linode1', 'mongo')
        mongo_collection = mon.mongo_collection(mongo_client, 'stocks', 'stockIndustry')
        mon.insert_document(mongo_collection, doc)
        time.sleep(20)
def get_month_record_mongo(stock_id, year, month):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    # check whether any records are stored for this month
    if collection.count_documents(
            {"trade_date": {"$regex": f"{str(year) + str(month)}"}}) != 0:
        contents = list(
            collection.find({'_id': re.compile(f"{stock_id}{year}{month}")}))
        return contents
    else:
        return
def pick_proxy(amount=10):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'proxy', 'proxyPool_1')
    contents = collection.find({}, {'ip': 1, 'port': 1}).limit(amount)
    redisConnect = red.redis_connection('linode1', 'redis', db=1)  # proxies live in db=1
    for item in contents:
        proxy = item['ip'] + ':' + item['port']
        try:
            validate_proxy(proxy)
            # store the validated proxy in the first empty slot
            for index in range(1, 21):
                key = f'proxy{index}'
                if not redisConnect.exists(key):
                    red.redis_set_key_value(redisConnect, key, proxy)
                    break  # one slot per proxy; otherwise one proxy fills every empty slot
        except Exception:
            pass
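# --- Hypothetical sketch (assumption, not part of the original source) ---
# pick_proxy() above calls a validate_proxy helper that is not shown in this file.
# A minimal implementation consistent with how it is used (it only needs to raise
# when the proxy is unusable) could look like this:
def validate_proxy(proxy, timeout=5):
    import requests
    # raises requests.RequestException when the proxy cannot reach the exchange site
    requests.get('https://www.twse.com.tw',
                 proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                 timeout=timeout)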
def check_records_exist():
    client = mon.mongo_connection('linode1', 'mongo')
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        # print(stock_id)
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        # stocks_con = list(coll_stock.find(
        #     {"trade_date": {"$regex": "2020"}}, {"trade_date": 1}))
        records_count = coll_stock.count_documents(
            {"trade_date": {"$regex": "202102"}})
        # flag stocks with suspiciously few records for February 2021
        if records_count < 5:
            print(stock_id, records_count)
            wcsv.writeToCsv("double_check_stock", [stock_id])
def stockInfo():
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    for item in mon.find_all_mongo(coll_stockIndustry):
        # print(item)
        ids = item['stocks_list']
        for stock_id in ids:
            doc = {
                '_id': stock_id,
                'industry': item['_id'],
                'name': 'name',
                'abbreviation': item['stocks'][stock_id],
                'dailyStatus': 0,
                'monthStatus': 0,
                'yearStatus': 0
            }
            print(doc)
            mon.insert_document(coll_stockInfo, doc)
def get_season_record_mongo(stock_id, year, season):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    print('count_documents:', collection.count_documents({}))
    if season == 1:
        return get_3_month_records(collection, stock_id, year, ['01', '02', '03'])
    elif season == 2:
        return get_3_month_records(collection, stock_id, year, ['04', '05', '06'])
    elif season == 3:
        return get_3_month_records(collection, stock_id, year, ['07', '08', '09'])
    elif season == 4:
        return get_3_month_records(collection, stock_id, year, ['10', '11', '12'])
    else:
        return
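# --- Usage sketch (assumption, not part of the original source) ---
# records = get_season_record_mongo('2330', '2020', 2)  # Q2 2020 rows for stock 2330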
def get_stock_ids():
    mongo_client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(mongo_client, 'stocks', 'stockIndustry')
    contents = mon.find_some_fields_mongo(coll_stockIndustry, ['stocks_list'])
    return contents
def get_year_record_mongo(stock_id, year):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    contents = list(collection.find({'_id': re.compile(f"{stock_id}{year}")}))
    return contents
def main():
    # redis publisher: push stock ids that still need crawling onto redis
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    while True:
        try:
            # check whether any of the redis keys is missing
            for num in range(1, 9):
                key = f'stock_No_{num}'
                if not redisConnect.exists(key):
                    # there are still stock_ids that have not been pushed to redis
                    if coll_stockInfo.count_documents({'crawlerStatus': 0}) != 0:
                        content = coll_stockInfo.find({'crawlerStatus': 0},
                                                      {'_id': 1}).limit(1)
                        stock_id = content[0]['_id']
                        print(f"{key} disappear >>> set {stock_id}")
                        # push the stock_id onto redis
                        red.redis_set_key_value(redisConnect, key, stock_id)
                        # crawlerStatus 1: already pushed to redis
                        coll_stockInfo.update_one({'_id': stock_id},
                                                  {'$set': {'crawlerStatus': 1}})
                    # stock_ids already removed from redis but not fully crawled yet
                    # elif coll_stockInfo.count_documents({'crawlerStatus': 2}) != 0:
                    #     coll_stockInfo.update_many({'crawlerStatus': {'$ne': 3}},
                    #                                {'$set': {'crawlerStatus': 0}})
                    #     main()
                    #     content = coll_stockInfo.find(
                    #         {'crawlerStatus': {'$ne': 1}}, {'_id': 1}).limit(1)
                    #     stock_id = content[0]['_id']
                    #     print(f"{key} disappear >>> set {stock_id}")
                    #     # push it onto redis again
                    #     red.redis_set_key_value(redisConnect, key, stock_id)
                    else:
                        wcsv.writeToCsv('./data/publisherStatus', [
                            'all stock ids have published on redis',
                            datetime.datetime.now()
                        ])
                        break
                else:
                    print(f"{key} still exists.")
            time.sleep(100)
            # stop publishing to redis once every stock has been crawled
            if coll_stockInfo.count_documents({'crawlerStatus': {'$ne': 3}}) == 0:
                print("== All stock crawling done ==")
                break
        except Exception as e:
            wcsv.writeToCsv("./dataStore/redisException", [e])
            print(e)
def get_stock_ids():
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    contents = collection.find({}, {'stocks_list': 1})
    return contents
def crawler_daily():
    counts = 0
    # notify that the daily update has started
    goo.main('stock_crawler', 'Stocks Daily Updation Starts!')
    # start time
    t1 = datetime.datetime.now()
    # reset every stock's daily status to zero by default
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    coll_stockInfo.update_many({}, {'$set': {'dailyStatus': 0}})
    # today
    today = datetime.date.today()  # - datetime.timedelta(1)
    year = today.strftime("%Y")
    month = today.strftime("%m")
    day = today.strftime("%d")
    # get all stocks' ids
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        print(stock_id)
        retry = 0
        url = (f"https://www.twse.com.tw/exchangeReport/STOCK_DAY"
               f"?response=html&date={year}{month}01&stockNo={stock_id}")
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        while retry < 3:
            try:
                contents = crawler.crawler(url)
                # print(contents)
                for item in contents:
                    # write the daily record to mongo
                    mon.insert_document(coll_stock, item)
                # crawling and writing to mongo done, set daily status to today's date
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'dailyStatus': f"{year + month + day}"}})
                counts += 1
                time.sleep(10)
                break
            except Exception as e:
                print(e)
                time.sleep(10)
                retry += 1
                if retry == 3:
                    # send a notification through the google bot
                    goo.main('stock_crawler',
                             f"{stock_id}, {year, month, day} Wrong: {e}")
                    wcsv.writeToCsv(
                        f'./dataStore/DailyCrawlerException_{today}',
                        [stock_id, year, month, day])
                    continue
    # if any stock is still missing today's record, run the daily crawler again
    if coll_stockInfo.count_documents(
            {'dailyStatus': {'$ne': f"{year + month + day}"}}) != 0:
        crawler_daily()
    # notify that the daily update is done
    cost_time = datetime.datetime.now() - t1
    goo.main(
        'stock_crawler',
        f"{datetime.date.today()}: Daily Updation Finished!\n"
        f"Check amount of stock: {counts}, except: {938 - counts}\n"
        f"Cost_time: {cost_time}")
    return