Example #1
def main():  # redis subscriber, index 0-3
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    amount = int(os.environ.get("amount"))  # total number of subscribers
    index = int(os.environ.get("index"))  # this subscriber's index
    while True:
        # fetch all pending keys from redis
        keys = red.redis_get_all_kv(redisConnect)
        for key in keys:
            num = int(key.split('No_')[-1])  # numeric part of the redis key
            # decide which record this subscriber takes
            if num % amount == index:
                stock_id = red.redis_get_value(redisConnect, key)
                print(f"get stock {stock_id}")
                red.redis_delete_key(redisConnect, key)  # remove the key from redis once the stock_id is taken
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'crawlerStatus': 2}})  # mark as removed from redis
                coll_stock = mon.mongo_collection(client, 'stocks',
                                                  f"stock{stock_id}")
                for year in range(2010, 2022):
                    for month in range(1, 13):
                        if year == 2021 and month > 2:
                            break
                        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(month).zfill(2)}01&stockNo={stock_id}"""
                        print(f"-- Crawler >>> {url}")
                        documents = crawler.crawler(url)
                        if documents:
                            for item in documents:
                                # write each crawled record to mongo
                                mon.insert_document(coll_stock, item)
                            coll_stockInfo.update_one(
                                {'_id': stock_id},
                                {'$set': {'monthStatus': str(year) + str(month).zfill(2)}})  # month crawled
                            print(f'stock: {stock_id} in {year}{str(month).zfill(2)} insert done.')
                        time.sleep(10)
                        print(f'stock: {stock_id} in {year}{str(month).zfill(2)} crawl done.')
                    coll_stockInfo.update_one(
                        {'_id': stock_id},
                        {'$set': {'yearStatus': year}})  # year crawled
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'crawlerStatus': 3}})  # mark this stock as fully crawled
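
# Note: every example here calls thin wrapper modules `mon` and `red` that
# are not shown. A minimal sketch of what those wrappers might look like,
# assuming plain pymongo and redis-py underneath; the HOSTS registry below
# is a hypothetical stand-in for however the project resolves 'linode1'.
import pymongo
import redis

HOSTS = {('linode1', 'mongo'): 'mongodb://localhost:27017',
         ('linode1', 'redis'): ('localhost', 6379)}

def mongo_connection(host, service):
    return pymongo.MongoClient(HOSTS[(host, service)])

def mongo_collection(client, db, collection):
    return client[db][collection]

def insert_document(collection, doc):
    collection.insert_one(doc)

def find_all_mongo(collection):
    return collection.find({})

def find_some_fields_mongo(collection, fields):
    return collection.find({}, {field: 1 for field in fields})

def redis_connection(host, service, db=0):
    addr, port = HOSTS[(host, service)]
    return redis.Redis(host=addr, port=port, db=db, decode_responses=True)

def redis_get_all_kv(connection):
    return connection.keys('*')  # callers iterate the key names

def redis_get_value(connection, key):
    return connection.get(key)

def redis_set_key_value(connection, key, value):
    connection.set(key, value)

def redis_delete_key(connection, key):
    connection.delete(key)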
Example #2
def main():  # re-crawl stocks listed in the double-check CSV
    client = mon.mongo_connection('linode1', 'mongo')
    with open('/Users/huangyiling/Github/stock/double_check_stock1.csv', newline='') as file:
        rows = csv.reader(file)
        for stock_id in rows:
            stock_id = ''.join(stock_id)
            print(f"get stock {stock_id}")
            coll_stock = mon.mongo_collection(
                client, 'stocks', f"stock{stock_id}")
            for month in range(1, 13):
                url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=2020{str(month).zfill(2)}01&stockNo={stock_id}"""
                print(f"-- Crawler >>> {url}")
                documents = crawler.crawler(url)
                if documents:
                    # print(documents)
                    for item in documents:
                        # write each crawled record to mongo
                        mon.insert_document(coll_stock, item)
                    print(
                        f'stock: {stock_id} in 2020{str(month).zfill(2)} insert done.')
                time.sleep(10)
                print(
                    f'stock: {stock_id} in 2020{str(month).zfill(2)} crawl done.')
Example #3
def count_url_check():
    expected_count = 938 * 11 * 12  # 938 stocks x 11 years x 12 months
    collection = mon.mongo_collection(mon.mongo_connection('linode1', 'mongo'),
                                      'stocks', 'crawlerURL')
    # Cursor.count() was removed in pymongo 4; use count_documents()
    reality_count = collection.count_documents({})
    print("Expected :", expected_count)
    print("Reality :", reality_count)
Example #4
def count_stocks():
    stock_count = 0
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    contents = collection.find({}, {'stocks_count': 1})
    for item in contents:
        id_list_count = item['stocks_count']
        stock_count += id_list_count
    print('stocks_count :', stock_count)
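
# The same total can be computed server-side with an aggregation pipeline,
# avoiding the client-side loop; a sketch under the same collection layout:
def count_stocks_aggregate():
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    pipeline = [{'$group': {'_id': None, 'total': {'$sum': '$stocks_count'}}}]
    result = list(collection.aggregate(pipeline))
    print('stocks_count :', result[0]['total'] if result else 0)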
Example #5
def all_stock_id():
    client = mon.mongo_connection('linode1', 'mongo')
    collection_stock = mon.mongo_collection(client, 'stocks', "stockInfo")
    contents = list(collection_stock.find({}, {'_id': 1}))
    print("amount of stocks:", len(contents))
    return contents
Example #6
def create_urls(stock_ids):
    mongoClient = mon.mongo_connection('linode1', 'mongo')
    mongoCollection = mon.mongo_collection(mongoClient, 'stocks', 'crawlerURL')
    # year_list / month_list are assumed module-level lists of strings,
    # e.g. ['2010', ..., '2020'] and ['1', ..., '12']
    for stock_id in stock_ids:
        for year in year_list:
            for month in month_list:
                url = f'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year + month.zfill(2)}01&stockNo={stock_id}'
                doc = {
                    '_id': stock_id + year + month.zfill(2),
                    'url': url,
                    'crawlerStatus': 0
                }
                mon.insert_document(mongoCollection, doc)
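
# Usage sketch: `year_list` and `month_list` must hold strings, as the
# concatenation above implies; the values below are illustrative and match
# the 938 * 11 * 12 expectation in the count_url_check example.
year_list = [str(y) for y in range(2010, 2021)]
month_list = [str(m) for m in range(1, 13)]
stock_ids = [item['_id'] for item in all_stock_id()]  # see all_stock_id above
create_urls(stock_ids)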
Example #7
def industry_crawler():
    url = 'https://www.cnyes.com/twstock/stock_astock.aspx?ga=nav'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    res = requests.get(url, headers=headers)
    # print(res.status_code)
    soup = BeautifulSoup(res.text, 'html.parser')

    industries = soup.select('div[id="kinditem_0"]>ul[class="kdlist"]>li')
    # connect to mongo once, outside the loop
    mongo_client = mon.mongo_connection('linode1', 'mongo')
    mongo_collection = mon.mongo_collection(mongo_client, 'stocks',
                                            'stockIndustry')
    # iterate over all industries
    for industry in industries:
        industry_name = industry.a.text
        print(industry_name)
        industry_url = 'https://www.cnyes.com/twstock/' + industry.a["href"]
        print(industry_url)
        industry_id = industry_url.split('groupId=')[-1].split('&stitle')[0]
        # get all stocks from the industry
        res_stock = requests.get(industry_url, headers=headers)
        # print(res_stock.status_code)
        soup_stock = BeautifulSoup(res_stock.text, 'html.parser')
        stocks = soup_stock.select('div[class="TableBox"]>table>tr')
        stock_list = []
        stock_dict = dict()
        for stock in stocks[1:]:
            stock_info = stock.find_all('td')
            stock_id = stock_info[1].text
            # print(stock_id)
            stock_name = stock_info[2].text
            # print(stock_name)
            stock_list.append(stock_id)
            stock_dict[stock_id] = stock_name

        industry_key_id = 'industry_' + industry_id
        doc = {
            '_id': industry_key_id,
            'industry': industry_kv[industry_name],  # industry_kv: name-to-code map defined elsewhere
            'industry_name': industry_name,
            'stocks_list': stock_list,
            'stocks_count': len(stock_list),
            'stocks': stock_dict
        }
        # print(doc)
        mon.insert_document(mongo_collection, doc)
        time.sleep(20)
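
# `industry_kv` is referenced above but never defined in this snippet;
# presumably it maps the site's industry display names to internal labels.
# A purely illustrative sketch (keys and values are hypothetical):
industry_kv = {
    '水泥工業': 'cement',
    '食品工業': 'food',
    '塑膠工業': 'plastics',
    # ... one entry per industry listed on the page
}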
Example #8
def get_month_record_mongo(stock_id, year, month):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    # check whether any records for that month are already stored
    # (Cursor.count() was removed in pymongo 4; use count_documents)
    if collection.count_documents(
            {"trade_date": {"$regex": f"{year}{month}"}}) != 0:
        contents = list(
            collection.find({'_id': re.compile(f"{stock_id}{year}{month}")}))
        return contents
    else:
        return
Example #9
def pick_proxy(amount=10):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'proxy', 'proxyPool_1')
    contents = collection.find({}, {'ip': 1, 'port': 1}).limit(amount)
    redisConnect = red.redis_connection('linode1', 'redis',
                                        db=1)  # proxy use db=1
    for item in contents:
        proxy = item['ip'] + ':' + item['port']
        try:
            validate_proxy(proxy)  # raises if the proxy is unusable
            for index in range(1, 21):
                key = f'proxy{index}'
                if not redisConnect.exists(key):
                    red.redis_set_key_value(redisConnect, key, proxy)
                    break  # fill only the first empty slot with this proxy
        except Exception:
            pass
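
# `validate_proxy` is not shown; a minimal sketch, assuming it should raise
# whenever the proxy cannot complete a simple request. The probe URL and
# timeout are arbitrary choices.
import requests

def validate_proxy(proxy, timeout=5):
    proxies = {'http': f'http://{proxy}', 'https': f'http://{proxy}'}
    res = requests.get('https://www.twse.com.tw', proxies=proxies,
                       timeout=timeout)
    res.raise_for_status()  # any non-2xx response raises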
Example #10
def check_records_exist():
    client = mon.mongo_connection('linode1', 'mongo')
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        # print(stock_id)
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        # count 2021-02 trading records
        # (count_documents replaces the removed Cursor.count())
        records_count = coll_stock.count_documents(
            {"trade_date": {"$regex": "202102"}})
        if records_count < 5:
            print(stock_id, records_count)
            wcsv.writeToCsv("double_check_stock", [stock_id])
Example #11
def stockInfo():
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(
        client, 'stocks', 'stockIndustry')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    for item in mon.find_all_mongo(coll_stockIndustry):
        # print(item)
        ids = item['stocks_list']
        for stock_id in ids:
            doc = {
                '_id': stock_id,
                'industry': item['_id'],
                'name': 'name',
                'abbreviation': item['stocks'][stock_id],
                'dailyStatus': 0,
                'monthStatus': 0,
                'yearStatus': 0}
            print(doc)
            mon.insert_document(coll_stockInfo, doc)
Example #12
def get_season_record_mongo(stock_id, year, season):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    print('count_documents:', collection.count_documents({}))
    # map each season to its three months and fetch them in one call
    season_months = {
        1: ['01', '02', '03'],
        2: ['04', '05', '06'],
        3: ['07', '08', '09'],
        4: ['10', '11', '12'],
    }
    if season in season_months:
        return get_3_month_records(collection, stock_id, year,
                                   season_months[season])
    return
Example #13
def get_stock_ids():
    mongo_client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(mongo_client, 'stocks',
                                              'stockIndustry')
    contents = mon.find_some_fields_mongo(coll_stockIndustry, ['stocks_list'])
    return contents
Example #14
def get_year_record_mongo(stock_id, year):
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
    contents = list(collection.find({'_id': re.compile(f"{stock_id}{year}")}))
    return contents
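
# Usage sketch, assuming (consistently with the create_urls and
# get_month_record_mongo examples) that document _ids start with
# stock_id + YYYY + MM, so the regex above matches one year's records:
records = get_year_record_mongo('2330', '2020')  # hypothetical stock id
print(len(records), 'records for 2020')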
Example #15
def main():
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    while True:
        try:
            # refill any redis key that no longer exists
            for num in range(1, 9):
                key = f'stock_No_{num}'
                if not redisConnect.exists(key):
                    # there are still stock_ids not yet pushed to redis
                    if coll_stockInfo.count_documents(
                            {'crawlerStatus': 0}) != 0:
                        content = coll_stockInfo.find_one(
                            {'crawlerStatus': 0}, {'_id': 1})
                        stock_id = content['_id']
                        print(f"{key} disappeared >>> set {stock_id}")
                        # push the stock_id to redis
                        red.redis_set_key_value(redisConnect, key, stock_id)
                        # mark as pushed to redis
                        coll_stockInfo.update_one(
                            {'_id': stock_id},
                            {'$set': {'crawlerStatus': 1}})
                    else:
                        wcsv.writeToCsv('./data/publisherStatus', [
                            'all stock ids have published on redis',
                            datetime.datetime.now()
                        ])
                        break
                print(f"{key} still exist.")
            time.sleep(100)
            # stop publishing once every stock has been fully crawled
            if coll_stockInfo.count_documents(
                    {'crawlerStatus': {'$ne': 3}}) == 0:
                print("== All stock crawling done ==")
                break
        except Exception as e:
            wcsv.writeToCsv("./dataStore/redisException", [e])
            print(e)
Example #16
def get_stock_ids():
    client = mon.mongo_connection('linode1', 'mongo')
    collection = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    contents = collection.find({}, {'stocks_list': 1})
    return contents
Example #17
def crawler_daily():
    counts = 0
    # notify that the daily update is starting
    goo.main('stock_crawler', 'Stocks Daily Update Starts!')
    # start time
    t1 = datetime.datetime.now()
    # reset every stock's daily status to the default 0
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    coll_stockInfo.update_many({}, {'$set': {'dailyStatus': 0}})
    # today's date (use - datetime.timedelta(1) to backfill yesterday)
    today = datetime.date.today()
    year = today.strftime("%Y")
    month = today.strftime("%m")
    day = today.strftime("%d")
    # get all stocks' id
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        print(stock_id)
        retry = 0
        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year}{month}01&stockNo={stock_id}"""
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        while retry < 3:
            try:
                contents = crawler.crawler(url)
                # print(contents)
                for item in contents:
                    # write the daily record to mongo
                    mon.insert_document(coll_stock, item)
                # crawling and mongo write done; stamp dailyStatus with today's date
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {
                        'dailyStatus': f"{year+month+day}"
                    }})
                counts += 1
                time.sleep(10)
                break
            except Exception as e:
                print(e)
                time.sleep(10)
                retry += 1
                if retry == 3:
                    # send a notification via googlebot
                    goo.main('stock_crawler',
                             f"{stock_id}, {year,month,day} Wrong: {e}")
                    wcsv.writeToCsv(
                        f'./dataStore/DailyCrawlerException_{today}',
                        [stock_id, year, month, day])
                continue

    # if any stock was not stamped with today's date, rerun the whole pass
    if coll_stockInfo.count_documents(
            {'dailyStatus': {'$ne': f"{year+month+day}"}}) != 0:
        crawler_daily()

    # notify that the daily update is done
    cost_time = datetime.datetime.now() - t1
    goo.main(
        'stock_crawler',
        f"{datetime.date.today()}: Daily Update Finished!\nChecked stocks: {counts}, missed: {938-counts}\nCost_time: {cost_time}"
    )
    return