Example #1
def main():  # redis subscriber index:0-3
    client = mon.mongo_connection('linode1', 'mongo')
    with open('/Users/huangyiling/Github/stock/double_check_stock1.csv', newline='') as file:
        rows = csv.reader(file)
        for stock_id in rows:
            stock_id = ''.join(stock_id)
            print(f"get stock {stock_id}")
            coll_stock = mon.mongo_collection(
                client, 'stocks', f"stock{stock_id}")
            # for year in range(2010, 2021):
                # test whether data exists for that year
                # test_month = 12
                # test_url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(test_month).zfill(2)}01&stockNo={stock_id}"""
                # print(f"test stock {stock_id} in {year} exist ?")
                # test_docs = crawler.crawler(test_url)
                # if test_docs:
                    # print("=> Yes, exist!")
            for month in range(1, 13):
                url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=2020{str(month).zfill(2)}01&stockNo={stock_id}"""
                print(f"-- Crawler >>> {url}")
                documents = crawler.crawler(url)
                if documents:
                    # print(documents)
                    for item in documents:
                        # record the crawled stock data and write it to mongo
                        mon.insert_document(coll_stock, item)
                    print(
                        f'stock: {stock_id} in 2020{str(month).zfill(2)} insert done.')
                time.sleep(10)
                print(
                    f'stock: {stock_id} in 2020{str(month).zfill(2)} crawl done.')
Example #2
def main():  # redis subscriber index:0-3
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    while True:
        # get all keys currently stored in redis
        keys = red.redis_get_all_kv(redisConnect)
        for key in keys:
            amount = int(os.environ.get("amount"))  # amount of subscriber
            index = int(os.environ.get("index"))  # subscriber num
            num = int(key.split('No_')[-1])  # redis key
            # decide which record this subscriber should take
            if num % amount == index:
                stock_id = red.redis_get_value(redisConnect, key)
                print(f"get stock {stock_id}")
                red.redis_delete_key(redisConnect, key)  # delete the key from redis once the stock_id is taken
                coll_stockInfo.update_one({'_id': stock_id},
                                          {'$set': {
                                              'crawlerStatus': 2
                                          }})  # mark as removed from redis
                coll_stock = mon.mongo_collection(client, 'stocks',
                                                  f"stock{stock_id}")
                for year in range(2010, 2022):
                    for month in range(1, 13):
                        if year == 2021 and month > 2:
                            break
                        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(month).zfill(2)}01&stockNo={stock_id}"""
                        print(f"-- Crawler >>> {url}")
                        documents = crawler.crawler(url)
                        if documents:
                            for item in documents:
                                # record the crawled stock data and write it to mongo
                                mon.insert_document(coll_stock, item)
                            coll_stockInfo.update_one({'_id': stock_id}, {
                                '$set': {
                                    'monthStatus':
                                    str(year) + str(month).zfill(2)
                                }
                            })  # current month crawled
                            print(
                                f'stock: {stock_id} in {year}{str(month).zfill(2)} insert done.'
                            )
                        time.sleep(10)
                        print(
                            f'stock: {stock_id} in {year}{str(month).zfill(2)} crawl done.')
                    coll_stockInfo.update_one({'_id': stock_id},
                                              {'$set': {
                                                  'yearStatus': year
                                              }})  # current year crawled
                coll_stockInfo.update_one({'_id': stock_id},
                                          {'$set': {
                                              'crawlerStatus': 3
                                          }})  # mark this stock as fully crawled
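Example #2 shards the work across several subscriber processes: each process reads `amount` and `index` from its environment and only handles Redis keys whose trailing number satisfies `num % amount == index` (e.g. with 4 subscribers, key `..._No_7` goes to the process with index 3). The `red` helpers are not shown; below is a minimal sketch of what they are assumed to wrap, using redis-py with the connection details simplified.

import redis


def redis_connection(host_alias, service, db=0):
    # assumption: the real helper resolves `host_alias` to a host/port; simplified here
    return redis.Redis(host='localhost', port=6379, db=db, decode_responses=True)


def redis_get_all_kv(conn):
    # return every key currently in the database; the caller filters by subscriber index
    return list(conn.scan_iter('*'))


def redis_get_value(conn, key):
    return conn.get(key)


def redis_delete_key(conn, key):
    conn.delete(key)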
Example #3
def create_urls(stock_ids):
    mongoClient = mon.mongo_connection('linode1', 'mongo')
    mongoCollection = mon.mongo_collection(mongoClient, 'stocks', 'crawlerURL')
    for stock_id in stock_ids:
        for year in year_list:
            for month in month_list:
                url = f'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year + month.zfill(2)}01&stockNo={stock_id}'
                doc = {
                    '_id': stock_id + year + month.zfill(2),
                    'url': url,
                    'crawlerStatus': 0
                }
                mon.insert_document(mongoCollection, doc)
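`year_list` and `month_list` are referenced but not defined in this snippet; given the `month.zfill(2)` call they are assumed to be lists of strings, for example:

year_list = [str(y) for y in range(2010, 2022)]  # '2010' .. '2021'
month_list = [str(m) for m in range(1, 13)]      # '1' .. '12'; zfill(2) pads to '01' .. '12'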
Example #4
def stock_crawler(stock_ids_list):
    for stocks in stock_ids_list:
        stock_ids = stocks['stocks_list']
        client = mon.mongo_connection('linode1', 'mongo')
        for stock_id in stock_ids:
            collection = mon.create_collection(client, 'stocks',
                                               f'stock{stock_id}')
            for year in range(10, 21):
                for month in [
                        '01', '02', '03', '04', '05', '06', '07', '08', '09',
                        '10', '11', '12'
                ]:
                    stock_url = f"https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=20{str(year)}{month}01&stockNo={stock_id}"
                    res = requests.get(stock_url, headers=headers)
                    soup = BeautifulSoup(res.text, "lxml")
                    table = soup.find_all('table')[0]
                    df = pd.read_html(str(table))[0]
                    for index in range(len(df)):
                        date = df.iat[index, 0]  # trade date (ROC calendar)
                        date_ad = str(1911 + int(date.split('/')[0])) + ''.join(
                            date.split('/')[1:])  # convert ROC to AD, e.g. 110/02/01 -> 20210201
                        volume = int(df.iat[index, 1])  # trading volume (shares)
                        price = float(df.iat[index, 2])  # turnover value
                        open_ = float(df.iat[index, 3])  # opening price
                        high = float(df.iat[index, 4])  # highest price
                        low = float(df.iat[index, 5])  # lowest price
                        close_ = float(df.iat[index, 6])  # closing price
                        change_ori = df.iat[index, 7]  # daily price change
                        if change_ori == 'X0.00':
                            change = float(0.00)
                        else:
                            change = float(change_ori)
                        trades = int(df.iat[index, 8])  # number of transactions
                        doc = {
                            '_id': stock_id + date_ad,
                            'trade_date': date_ad,
                            'volume': volume,
                            'price': price,
                            'open': open_,
                            'high': high,
                            'low': low,
                            'close': close_,
                            'change': change,
                            'trades': trades
                        }
                        print(doc)
                        mon.insert_document(collection, doc)
                    # df.to_csv(f'/Users/huangyiling/Desktop/stock/2330/stock{stock_id}_20{str(i)}{j}.csv')
                    time.sleep(20)
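The `headers` variable used in the request above is not defined in this snippet; it is assumed to be the same plain user-agent header as in example #5, shown below. Note that the trade-date column returned by TWSE is in the ROC (Minguo) calendar, e.g. '110/02/01', which is why 1911 is added to the year to produce '20210201'.

# assumed to match the headers dict from example #5
headers = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}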
Example #5
def industry_crawler():
    url = 'https://www.cnyes.com/twstock/stock_astock.aspx?ga=nav'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    res = requests.get(url, headers=headers)
    # print(res.status_code)
    soup = BeautifulSoup(res.text, 'html.parser')

    industries = soup.select('div[id="kinditem_0"]>ul[class="kdlist"]>li')
    # get all industries
    for industry in industries:
        industry_name = industry.a.text
        print(industry_name)
        industry_url = 'https://www.cnyes.com/twstock/' + industry.a["href"]
        print(industry_url)
        industry_id = industry_url.split('groupId=')[-1].split('&stitle')[0]
        # get all stocks from the industry
        res_stock = requests.get(industry_url, headers=headers)
        # print(res_stock.status_code)
        soup_stock = BeautifulSoup(res_stock.text, 'html.parser')
        stocks = soup_stock.select('div[class="TableBox"]>table>tr')
        stock_list = []
        stock_dict = dict()
        for stock in stocks[1:]:
            stock_info = stock.find_all('td')
            stock_id = stock_info[1].text
            # print(stock_id)
            stock_name = stock_info[2].text
            # print(stock_name)
            stock_list.append(stock_id)
            stock_dict[stock_id] = stock_name

        industry_key_id = 'industry_' + industry_id
        doc = {
            '_id': industry_key_id,
            'industry': industry_kv[industry_name],
            'industry_name': industry_name,
            'stocks_list': stock_list,
            'stocks_count': len(stock_list),
            'stocks': stock_dict
        }
        # print(doc)
        mongo_client = mon.mongo_connection('linode1', 'mongo')
        mongo_collection = mon.mongo_collection(mongo_client, 'stocks',
                                                'stockIndustry')
        mon.insert_document(mongo_collection, doc)
        time.sleep(20)
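`industry_kv` is referenced but not defined above; it is assumed to map the Chinese industry name scraped from cnyes.com to an internal industry label. A purely illustrative stand-in might look like the following (the real mapping is not part of the snippet):

industry_kv = {
    '水泥工業': 'cement',  # hypothetical entries for illustration only
    '食品工業': 'food',
}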
Example #6
def stockInfo():
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(
        client, 'stocks', 'stockIndustry')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    for item in mon.find_all_mongo(coll_stockIndustry):
        # print(item)
        ids = item['stocks_list']
        for stock_id in ids:
            doc = {
                '_id': stock_id,
                'industry': item['_id'],
                'name': 'name',
                'abbreviation': item['stocks'][stock_id],
                'dailyStatus': 0,
                'monthStatus': 0,
                'yearStatus': 0}
            print(doc)
            mon.insert_document(coll_stockInfo, doc)
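`find_all_mongo` is another helper from the assumed `mon` module; it presumably just iterates over every document in a collection, e.g.:

def find_all_mongo(collection):
    # return a cursor over all documents in the collection
    return collection.find({})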
Example #7
def crawler_daily():
    counts = 0
    # notify that the daily update has started
    goo.main('stock_crawler', 'Stocks Daily Update Starts!')
    # start time
    t1 = datetime.datetime.now()
    # set daily status zero for default
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    coll_stockInfo.update_many({}, {'$set': {'dailyStatus': 0}})
    # today
    today = datetime.date.today()  #-datetime.timedelta(1)
    year = today.strftime("%Y")
    month = today.strftime("%m")
    day = today.strftime("%d")
    # get all stocks' id
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        print(stock_id)
        retry = 0
        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year}{month}01&stockNo={stock_id}"""
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        while retry < 3:
            try:
                contents = crawler.crawler(url)
                # print(contents)
                for item in contents:
                    # daily record to mongo
                    mon.insert_document(coll_stock, item)
                # crawling and writing to mongo done, set daily status to today's date
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {
                        'dailyStatus': f"{year}{month}{day}"
                    }})
                counts += 1
                time.sleep(10)
                break
            except Exception as e:
                print(e)
                time.sleep(10)
                retry += 1
                if retry == 3:
                    # send a notification via googlebot
                    goo.main('stock_crawler',
                             f"{stock_id}, {year,month,day} Wrong: {e}")
                    wcsv.writeToCsv(
                        f'./dataStore/DailyCrawlerException_{today}',
                        [stock_id, year, month, day])
                continue

    # check daily update done
    if coll_stockInfo.count_documents(
            {'dailyStatus': {'$ne': f"{year}{month}{day}"}}) != 0:
        crawler_daily()

    # notify that the daily update is done
    cost_time = datetime.datetime.now() - t1
    goo.main(
        'stock_crawler',
        f"{datetime.date.today()}: Daily Updation Finished!\nCheck amount of stock: {counts}, except: {938-counts}\nCost_time: {cost_time}"
    )
    return
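Examples #1, #2, and #7 delegate the page fetch to `crawler.crawler(url)`, whose source is not shown. Based on the inline parsing in example #4, it is assumed to download the STOCK_DAY HTML report and return one document per trading day; a minimal sketch under that assumption:

import requests
import pandas as pd
from bs4 import BeautifulSoup


def crawler(url):
    # fetch the TWSE STOCK_DAY HTML report and parse its first table
    headers = {'user-agent': 'Mozilla/5.0'}
    res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table')
    if not tables:
        return []
    df = pd.read_html(str(tables[0]))[0]
    stock_id = url.split('stockNo=')[-1]
    documents = []
    for i in range(len(df)):
        roc_date = str(df.iat[i, 0])  # e.g. '110/02/01'
        date_ad = str(1911 + int(roc_date.split('/')[0])) + ''.join(roc_date.split('/')[1:])
        change_ori = df.iat[i, 7]
        change = 0.0 if str(change_ori).startswith('X') else float(change_ori)
        documents.append({
            '_id': stock_id + date_ad,  # same document shape as example #4
            'trade_date': date_ad,
            'volume': int(df.iat[i, 1]),
            'price': float(df.iat[i, 2]),
            'open': float(df.iat[i, 3]),
            'high': float(df.iat[i, 4]),
            'low': float(df.iat[i, 5]),
            'close': float(df.iat[i, 6]),
            'change': change,
            'trades': int(df.iat[i, 8]),
        })
    return documents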