Exemplo n.º 1
0
def get_stock_list(stock_type='上市'):
    df_stocks = None
    if stock_type == '上市':
        df_stocks = pd.read_csv(gen_output_path('data', '上市.csv'),
                                engine='python',
                                encoding='big5')

    elif stock_type == '上櫃':
        df_stocks = pd.read_csv(gen_output_path('data', '上櫃.csv'),
                                engine='python',
                                encoding='big5')
    return df_stocks
Exemplo n.º 2
0
 def test_parse_balance_sheet(self):
     # stock_list = get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃')
     stock_list = [2801]
     # stock_list = [1409]
     empty_list = []
     non_investment = []
     non_property = []
     for stock_id in stock_list:
         # print('parse ', stock_id)
         file_name = "balance_sheet_data_" + str(stock_id)
         input_path = gen_output_path('raw_datas/test_balance_sheets', file_name)
         year = 2020
         season = 1
         balance_sheet_processor = SimpleBalanceSheetProcessor(stock_id)
         with open(input_path, 'rb') as in_put:
             # str_trs = in_put.readlines()
             raw_input = in_put.read()
             in_put.close()
             soup = BeautifulSoup(raw_input, 'html.parser')
             # print(soup.prettify())
             parse_data = balance_sheet_processor.parse_balance_sheet(soup, year, season)
             if parse_data is not None and len(parse_data) < 2:
                 if len(parse_data) == 0:
                     empty_list.append(stock_id)
                 else:
                     if '長期投資' not in parse_data:
                         non_investment.append(stock_id)
                     if '固定資產' not in parse_data:
                         non_property.append(stock_id)
     print('empty_list = ', empty_list)
     print('non_investment = ', non_investment)
     print('non_property = ', non_property)
Exemplo n.º 3
0
 def test_get_prediction(self):
     with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file:
         df = pd.read_excel(file)
         file.close()
     prices = df.loc[:, '收盤價']
     # errors = generate_predictions(prices, get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃'))
     # print('test_get_prediction errors = ', errors)
     result = generate_prediction(2841, float(prices.loc[str(2841)]))
     print(result)
Exemplo n.º 4
0
def get_raw_data(input_dir, file_name):
    input_path = gen_output_path(input_dir, file_name)
    try:
        with open(input_path, 'rb') as in_put:
            raw_input = in_put.read()
            in_put.close()
            return raw_input
    except FileNotFoundError as fnfe:
        return None
Exemplo n.º 5
0
 def action(stock_id, time_line):
     year = time_line['year']
     season = time_line.get('season')
     dir_path = (root_dir_path + str(year)) if season is None else (root_dir_path + str(year) + "Q" + str(season))
     file_path = gen_output_path(dir_path, str(stock_id))
     if path.exists(file_path) is False:
         result = fetcher(stock_id, year) if season is None else fetcher(stock_id, year, season)
         if result is not None:
             store_raw_data(result, dir_path, str(stock_id))
Exemplo n.º 6
0
def fetch_price_measurement_raw_datas(stock_ids):
    db = mongo_client[DB_TWSE]
    collection = db[TABLE_TWSE_PRICE_MEASUREMENT]
    for stock_id in stock_ids:
        params = {'STOCK_ID': stock_id}
        path_dir = os.path.abspath(gen_output_path(PATH_DIR_RAW_DATA_PRICE_MEASUREMENT))
        url = "https://goodinfo.tw/StockInfo/StockBzPerformance.asp?" + urlencode(params)
        print('url = ', url)
        profile = FirefoxProfile()
        profile.set_preference("browser.download.panel.shown", False)
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.dir", path_dir)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/html")
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get(url)
        inputs = driver.find_elements_by_tag_name("input")
        download_button = None
        for input_ in inputs:
            if input_.get_attribute("value") == '匯出HTML':
                download_button = input_
        download_button.click()
        driver.quit()
        file_path = os.path.join(path_dir, "BzPerformance.html")
        try:
            output_file_path = gen_output_path(PATH_DIR_RAW_DATA_PRICE_MEASUREMENT, str(stock_id))
            # copyfile(file_path, output_file_path)
            with open(file_path, 'rb') as in_put:
                raw_input = in_put.read()
                in_put.close()
                print(raw_input)
                record = {
                    "stock_id": str(stock_id),
                    "content": raw_input
                }
                post_id = collection.insert_one(record)
                print("post_id = ", post_id)

        except Exception as inst:
            __logger.error("get exception in " + str(stock_id) + ":" + str(inst))
            traceback.print_tb(inst.__traceback__)
        finally:
            os.remove(file_path)
Exemplo n.º 7
0
def read_dfs(stock_id, path=None):
    try:
        with open(
                path if path is not None else gen_output_path(
                    'data', 'performance_{0}.xlsx'.format(str(stock_id))),
                'rb') as file:
            df_profit_matrix = pd.read_excel(file, sheet_name=None)
            file.close()
            return df_profit_matrix
    except FileNotFoundError as err:
        traceback.print_tb(err.__traceback__)
        return None
Exemplo n.º 8
0
    def get_data_frame(self, year=None, season=None, indexType=IndexType.INT_INDEX):
        # create a new Firefox session
        path = os.path.abspath(gen_output_path("data"))
        print("path = ", path)

        params = {'STOCK_ID': self._stock_id}
        url = "https://goodinfo.tw/StockInfo/StockBzPerformance.asp?" + urlencode(params)
        print('url = ', url)
        profile = FirefoxProfile()
        profile.set_preference("browser.download.panel.shown", False)
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.dir", path)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/html")
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get(url)
        elements = driver.find_elements_by_id("txtFinDetailData")
        buttons = elements[0].find_elements_by_xpath(".//input")
        buttons[5].click()
        driver.quit()
        file_path = os.path.join(path, "BzPerformance.html")
        print("path2 = ", file_path)
        try:
            df = pd.read_html(file_path)[0]
            years = self.__correct_years(df.loc[:, 0].tolist())
            prices = list(map(lambda price: float(price) if price != '-' else float(0), df.loc[:, 4].tolist()))
            if indexType == IndexType.INT_INDEX:
                data = {'年度': years, '平均股價': prices}
                df = pd.DataFrame(data)
                df = df.set_index(['年度'])
            else:
                # periodIndexes = list(map(lambda year: pd.PeriodIndex(start=pd.Period(year, freq='Y'),
                #                                                      end=pd.Period(year, freq='Y'),
                #                                                      freq='Y'), years))
                # print('periodIndex = ', periodIndexes)
                periodIndex = pd.PeriodIndex(start=pd.Period(years[-1], freq='Y'), end=pd.Period(years[0], freq='Y'),
                                             freq='Y')
                print(periodIndex)
                print([prices[i:i + 1] for i in range(0, len(prices))])
                df = pd.DataFrame(data=[prices[i:i + 1] for i in range(len(prices) - 1, -1, -1)], columns=['平均股價'],
                                  index=periodIndex)

            print('df = ', df)
            # df['平均股價'] = df['平均股價'].map(lambda price: float(price) if price != '-' else float(0))
            # df2.index = pd.PeriodIndex(df2.index, freq='A')

        except Exception as inst:
            print("get exception", inst)
            traceback.print_tb(inst.__traceback__)
            return None
        finally:
            os.remove(file_path)
        return df
Exemplo n.º 9
0
def generate_predictions2(df_prices, stock_ids=[]):
    error_stock_ids = []
    df_predictions = None

    for stock_id in stock_ids:
        str_stock_id = str(stock_id)

        # try:
        #     stock_data = get_stock_data(str_stock_id)
        # except:
        #     stock_data = None
        #
        # if stock_data is None:
        #     continue

        try:
            stock_data = read(str_stock_id)

            s_stock = get_predict_evaluate(stock_data,
                                           float(df_prices.loc[str_stock_id]))
        except Exception as e:
            print("Unexpected error:", e)
            traceback.print_tb(e.__traceback__)
            print('Get error when get stock ', stock_id, ' stock_data = ',
                  stock_data)
            error_stock_ids.append(stock_id)
            s_stock = None
        except:
            print("Unexpected error:", sys.exc_info()[0])
            error_stock_ids.append(stock_id)
            s_stock = None

        if s_stock is None:
            continue
        if df_predictions is None:
            df_predictions = pd.DataFrame(columns=s_stock.index,
                                          data=[s_stock.values])
            df_predictions['股號'] = [str_stock_id]
            df_predictions = df_predictions.set_index('股號')
            print("first record")
            print(df_predictions)
            print("index = ", df_predictions.index)
        else:
            print('get index = ', df_predictions.index)
            df_predictions.loc[str_stock_id] = s_stock.values

        print('result = ', df_predictions)
        print('err_id = ', error_stock_ids)
    output_path = gen_output_path('data', 'evaluations.xlsx')
    with pd.ExcelWriter(output_path) as writer:
        df_predictions.to_excel(writer, sheet_name='predictions')
Exemplo n.º 10
0
def generate_predictions(df_prices, stock_ids=[]):
    error_stock_ids = []
    results = None
    df_predictions = None

    df_stock_list = _get_stock_list()
    for stock_id in stock_ids:
        str_stock_id = str(stock_id)
        try:
            result = None if not df_prices.index.contains(str_stock_id) else generate_prediction(stock_id, float(df_prices.loc[str_stock_id]))
        except Exception as e:
            print("Unexpected error:", sys.exc_info()[0])
            traceback.print_tb(e.__traceback__)
            error_stock_ids.append(stock_id)
            result = None

        if result is not None:
            if results is None:
                results = {'公司代號': [stock_id], '公司簡稱': [df_stock_list.loc[str_stock_id, '公司簡稱']]}
                results.update(dict(map(lambda x: (x[0], [x[1]]), result.items())))

                print(list(results.keys()))
                # df_predictions = pd.DataFrame(result)
                # df_predictions['股號'] = [str_stock_id]
                # df_predictions = df_predictions.set_index('股號')
                print("first record")
                # print(df_predictions)
            else:
                for pair in result.items():
                    results[pair[0]].append(pair[1])
                results['公司代號'].append(stock_id)
                results['公司簡稱'].append(df_stock_list.loc[str_stock_id, '公司簡稱'])
                print('results = ', results)

    df_predictions = pd.DataFrame(results)
    df_predictions = df_predictions.set_index('公司代號')
    print("result = ", df_predictions)
    output_path = gen_output_path('data', 'evaluations.xlsx')
    with pd.ExcelWriter(output_path) as writer:
        df_predictions.to_excel(writer, sheet_name='predictions')
    return error_stock_ids
Exemplo n.º 11
0
def store_df(stock_id, dict_sheet_dfs, directory=None, filename=None):
    if dict_sheet_dfs is None or len(
            dict_sheet_dfs.items()) == 0 or stock_id is None:
        print('dict_sheet_dfs and stock_id should not be None')
        return

    output_path = gen_output_path(
        directory if directory is not None else 'data',
        filename=filename
        if filename is not None else 'performance_{0}.xlsx'.format(stock_id))
    dict_dfs = read_dfs(stock_id, output_path)
    if dict_dfs is None:
        dict_dfs = {}

    for sheet_name, df in dict_sheet_dfs.items():
        dict_dfs[sheet_name] = df

    with pd.ExcelWriter(output_path) as writer:
        for sheet_name, df in dict_dfs.items():
            print('sheet_name = ', sheet_name, " df = ", df)
            df.to_excel(writer, sheet_name=sheet_name)
        writer.close()
Exemplo n.º 12
0
def _read_df_datas(stock_id):
    df_statements = read_dfs(stock_id, gen_output_path('data', 'statments_{0}.xlsx'.format(str(stock_id))))
    if df_statements is None:
        df_statements = {}
    df_profit_statement = df_statements.get('profit_statement', None)
    df_balance_sheet = df_statements.get('balance_sheet', None)
    df_cash_flow_statement = df_statements.get('cash_flow_statement', None)
    df_dividend_policy = df_statements.get('dividend_policy', None)
    df_performance = df_statements.get('performance', None)
    print('df_cash_flow_statement = ', df_cash_flow_statement)
    print('df_dividend_policy = ', df_dividend_policy)
    print('df_balance_sheet = ', df_balance_sheet)

    if df_profit_statement is not None:
        df_profit_statement = df_profit_statement.set_index([df_profit_statement.index.to_period("Q")])
        df_statements['profit_statement'] = df_profit_statement
        print('df_profit_statement = ', df_profit_statement)

    if df_balance_sheet is not None:
        df_balance_sheet = df_balance_sheet.set_index([df_balance_sheet.index.to_period("Q")])
        df_balance_sheet.columns = ['固定資產', '每股淨值', '長期投資', ('權益總額', '期初餘額'), ('權益總額', '期末餘額')]
        df_statements['balance_sheet'] = df_balance_sheet
        print('df_balance_sheet columns = ', df_balance_sheet.columns)

    if df_cash_flow_statement is not None:
        df_cash_flow_statement = df_cash_flow_statement.set_index([df_cash_flow_statement.index.to_period("Q")])
        df_statements['cash_flow_statement'] = df_cash_flow_statement
        print('df_cash_flow_statement = ', df_cash_flow_statement)

    if df_dividend_policy is not None:
        df_dividend_policy = df_dividend_policy.set_index([df_dividend_policy.index.to_period("Y")])
        df_statements['dividend_policy'] = df_dividend_policy
        print('df_dividend_policy = ', df_dividend_policy)

    if df_performance is not None:
        df_performance = df_performance.set_index([df_performance.index.to_period("Y")])
        df_statements['performance'] = df_performance
    return df_statements
Exemplo n.º 13
0
    def test_predict_evaluation(self):
        from stock_data import read
        stock_data = read('2330')
        self.assertIsNotNone(stock_data)

        df = None
        with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file:
            df = pd.read_excel(file)
            file.close()

        s_2330 = get_predict_evaluate(stock_data, float(df.loc['2330', '收盤價'])).rename('2330')
        result = pd.concat([s_2330], axis=1)
        print('result 1', result.T)

        stock_data = read('6294')
        s_6294 = get_predict_evaluate(stock_data, float(df.loc['6294', '收盤價'])).rename('6294')
        self.assertIsNotNone(stock_data)

        result.loc[:, '6294'] = s_6294
        print('result 2', result.T)

        result.loc[:, '6294'] = s_6294
        print('result 3', result.T)
Exemplo n.º 14
0
def get_prices():
    crawler = Crawler()
    df = crawler.get_data((2021, 9, 7))
    with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer:
        df.to_excel(writer)
        writer.close()
Exemplo n.º 15
0
def generate_predictions(stock_ids=[]):
    try:
        with open(gen_output_path('data', 'evaluations.xlsx'), 'rb') as file:
            df_predictions = pd.read_excel(file,
                                           sheet_name='predictions',
                                           dtype={'股號': str})
            df_predictions = df_predictions.set_index('股號')
    except FileNotFoundError as err:
        df_predictions = None
    # print('index =', df_predictions.index)
    # print(df_predictions)
    # df_predictions.reset_index()
    # print('index =', df_predictions.index)
    # df_predictions.reindex(df_predictions.index.astype('str'))
    # print('index =', df_predictions.index)
    #
    # print(df_predictions)
    tor_proc = None
    if not _tor_process_exists():
        print('open tor process')
        tor_proc = _launch_tor()

    controller = Controller.from_port(port=ctrl_port)
    controller.authenticate()
    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", proxy_port)
    socket.socket = socks.socksocket

    newIP = urlopen("http://icanhazip.com").read()
    print("NewIP Address: %s" % newIP)

    error_stock_ids = []
    for stock_id in stock_ids:
        str_stock_id = str(stock_id)
        if df_predictions is not None and str_stock_id in df_predictions.index:
            continue
        try:
            stock_data = get_stock_data(stock_id)
        except:
            stock_data = None

        if stock_data is None:
            continue
        break_loop = False
        while True:
            try:
                s_stock = get_predict_evaluate(stock_data)
                break
            except (IndexError, requests.exceptions.ConnectionError,
                    NewConnectionError, socks.SOCKS5Error) as e:
                print("get exception: ", e, " for ", stock_id)
                controller.signal(Signal.NEWNYM)
                if not controller.is_newnym_available():
                    print("Waiting time for Tor to change IP: " +
                          str(controller.get_newnym_wait()) + " seconds")
                    time.sleep(controller.get_newnym_wait())
                    newIP = urlopen("http://icanhazip.com").read()
                    print("NewIP Address: %s" % newIP)

            except Exception as e:
                print("Unexpected error:", e)
                traceback.print_tb(e.__traceback__)
                print('Get error when get stock ', stock_id, ' stock_data = ',
                      stock_data)
                error_stock_ids.append(stock_id)
                # break_loop = True
                s_stock = None
                break
        if break_loop:
            break
        if s_stock is None:
            continue
        if df_predictions is None:
            # df_predictions = pd.DataFrame(columns=s_stock.index, data=[s_stock.values], index=[str_stock_id])
            df_predictions = pd.DataFrame(columns=s_stock.index,
                                          data=[s_stock.values])
            df_predictions['股號'] = [str_stock_id]
            df_predictions = df_predictions.set_index('股號')
            print("first record")
            print(df_predictions)
            print("index = ", df_predictions.index)
        else:
            print('get index = ', df_predictions.index)
            df_predictions.loc[str_stock_id] = s_stock.values

    print('result = ', df_predictions)
    output_path = gen_output_path('data', 'evaluations.xlsx')
    with pd.ExcelWriter(output_path) as writer:
        df_predictions.to_excel(writer, sheet_name='predictions')
    # print('error_ids = ', error_stock_ids)

    controller.close()
    tor_proc.terminate()
Exemplo n.º 16
0
 def get_raw_data(self, input_dir, file_name):
     input_path = gen_output_path(input_dir, file_name)
     with open(input_path, 'rb') as in_put:
         raw_input = in_put.read()
         in_put.close()
         return raw_input
Exemplo n.º 17
0
 def store_raw_data(self, data, output_dir, file_name):
     if data is not None:
         output_path = gen_output_path(output_dir, file_name)
         with open(output_path, 'wb') as output:
             output.write(data)
             output.close()
Exemplo n.º 18
0
 def test_tsec_crawler(self):
     crawler = Crawler()
     df = crawler.get_data((2021, 4, 29))
     with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer:
         df.to_excel(writer)
         writer.close()
Exemplo n.º 19
0
                }]}, {'$set': {
                    "content": raw_data
                }},
                upsert=True)


def get_prices():
    crawler = Crawler()
    df = crawler.get_data((2021, 9, 7))
    with pd.ExcelWriter(gen_output_path('data', 'prices.xlsx')) as writer:
        df.to_excel(writer)
        writer.close()


if __name__ == "__main__":
    logger.info('start')
    twse_code_list = get_stock_codes(stock_type='上市')
    tpex_code_list = get_stock_codes(stock_type='上櫃')
    sync_statements(twse_code_list)
    sync_statements(tpex_code_list)
    sync_performance(twse_code_list + tpex_code_list)
    get_prices()
    with open(gen_output_path('data', 'prices.xlsx'), 'rb') as file:
        df = pd.read_excel(file)
        file.close()
    prices = df.loc[:, '收盤價']
    errors = generate_predictions(
        prices,
        get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃'))
    print('test_get_prediction errors = ', errors)