def test_multiple_ctx(self):
    """Each MiniRacer instance must keep its own isolated global scope."""
    contexts = [py_mini_racer.MiniRacer() for _ in range(3)]
    # Define a differently-valued global in every context ...
    for value, ctx in enumerate(contexts, start=1):
        ctx.eval('var x = %d' % value)
    # ... and verify no context leaked into another.
    for value, ctx in enumerate(contexts, start=1):
        self.assertEqual(ctx.eval('(x)'), value)
def stock_board_concept_name_ths() -> pd.DataFrame:
    """
    THS (10jqka) concept boards: name, link and code of every board.
    http://q.10jqka.com.cn/gn/detail/code/301558/
    :return: one row per concept board with 日期/概念名称/成分股数量/网址/代码
    :rtype: pandas.DataFrame
    """
    # Build the JS context once; only the anti-spider cookie value "v" needs
    # to be refreshed for each request (previously the whole interpreter and
    # script were rebuilt on every page).
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    url = "http://q.10jqka.com.cn/gn/index/field/addtime/order/desc/page/1/ajax/1/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    total_page = soup.find("span", attrs={"class": "page_info"}).text.split("/")[1]
    page_frames = []
    for page in tqdm(range(1, int(total_page) + 1), leave=False):
        url = f"http://q.10jqka.com.cn/gn/index/field/addtime/order/desc/page/{page}/ajax/1/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",
        }
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")
        rows = (
            soup.find("table", attrs={"class": "m-table m-pager-table"})
            .find("tbody")
            .find_all("tr")
        )
        url_list = [row.find_all("td")[1].find("a")["href"] for row in rows]
        temp_df = pd.read_html(r.text)[0]
        temp_df["网址"] = url_list
        page_frames.append(temp_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    big_df = pd.concat(page_frames, ignore_index=True)
    big_df = big_df[["日期", "概念名称", "成分股数量", "网址"]]
    big_df["日期"] = pd.to_datetime(big_df["日期"]).dt.date
    big_df["成分股数量"] = pd.to_numeric(big_df["成分股数量"])
    big_df["代码"] = big_df["网址"].str.split("/", expand=True).iloc[:, 6]
    return big_df
def stock_board_concept_name_ths() -> pd.DataFrame:
    """
    THS (10jqka) concept boards: name, link and code of every board.
    http://q.10jqka.com.cn/gn/detail/code/301558/
    :return: one row per concept board with 日期/概念名称/成分股数量/网址/代码
    :rtype: pandas.DataFrame
    """
    # Build the JS interpreter once and reuse it; only the anti-spider "v"
    # cookie value has to be recomputed per request.
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    url = "http://q.10jqka.com.cn/gn/index/field/addtime/order/desc/page/1/ajax/1/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    total_page = soup.find("span", attrs={"class": "page_info"}).text.split("/")[1]
    frames = []
    for page in tqdm(range(1, int(total_page) + 1), leave=False):
        url = f"http://q.10jqka.com.cn/gn/index/field/addtime/order/desc/page/{page}/ajax/1/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",
        }
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")
        url_list = []
        for item in (
            soup.find("table", attrs={"class": "m-table m-pager-table"})
            .find("tbody")
            .find_all("tr")
        ):
            url_list.append(item.find_all("td")[1].find("a")["href"])
        temp_df = pd.read_html(r.text)[0]
        temp_df["网址"] = url_list
        frames.append(temp_df)
    # DataFrame.append was removed in pandas 2.0; build the frame in one pass.
    big_df = pd.concat(frames, ignore_index=True)
    big_df = big_df[["日期", "概念名称", "成分股数量", "网址"]]
    big_df["日期"] = pd.to_datetime(big_df["日期"]).dt.date
    big_df["成分股数量"] = pd.to_numeric(big_df["成分股数量"])
    big_df["代码"] = big_df["网址"].str.split("/", expand=True).iloc[:, 6]
    return big_df
def stock_us_daily(symbol: str = "AAPL", adjust: str = "") -> pd.DataFrame:
    """
    Sina Finance daily U.S. stock data.
    http://finance.sina.com.cn/stock/usstock/sector.shtml
    Note: Sina's adjustment factor for CIEN is known to be wrong.
    :param symbol: ticker, e.g. "AAPL"; use get_us_stock_name to list symbols
    :type symbol: str
    :param adjust: "" raw prices; "qfq" forward-adjusted prices;
                   "qfq-factor" the adjustment factors themselves
    :type adjust: str
    :return: daily data for the requested adjustment mode
    :rtype: pandas.DataFrame
    :raises ValueError: if adjust is not one of "", "qfq", "qfq-factor"
    """
    if adjust not in ("", "qfq", "qfq-factor"):
        # Previously an unknown adjust silently returned None.
        raise ValueError('adjust must be "", "qfq" or "qfq-factor"')
    res = requests.get(f"https://finance.sina.com.cn/staticdata/us/{symbol}")
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(zh_js_decode)
    # Run the site's own JS decoder on the payload.
    dict_list = js_code.call(
        "d", res.text.split("=")[1].split(";")[0].replace('"', "")
    )
    data_df = pd.DataFrame(dict_list)
    data_df.index = pd.to_datetime(data_df["date"])
    del data_df["amount"]
    del data_df["date"]
    data_df = data_df.astype("float")
    if adjust == "":
        # Raw data requested: skip the extra factor request entirely.
        return data_df
    res = requests.get(us_sina_stock_hist_qfq_url.format(symbol))
    # NOTE(security): eval() on remote text. The payload is a JS object
    # literal from Sina (not guaranteed valid JSON) -- consider a safer
    # parser such as ast.literal_eval if the format allows it.
    qfq_factor_df = pd.DataFrame(eval(res.text.split("=")[1].split("\n")[0])["data"])
    qfq_factor_df.rename(
        columns={"c": "adjust", "d": "date", "f": "qfq_factor"}, inplace=True
    )
    qfq_factor_df.index = pd.to_datetime(qfq_factor_df["date"])
    del qfq_factor_df["date"]
    if adjust == "qfq-factor":
        return qfq_factor_df
    # adjust == "qfq": forward-fill the factors over the full history range.
    temp_date_range = pd.date_range("1900-01-01", qfq_factor_df.index[0].isoformat())
    temp_df = pd.DataFrame(range(len(temp_date_range)), temp_date_range)
    new_range = pd.merge(
        temp_df, qfq_factor_df, left_index=True, right_index=True, how="left"
    )
    # fillna(method=...) is deprecated; use ffill()/bfill() directly.
    new_range = new_range.ffill()
    new_range = new_range.iloc[:, [1, 2]]
    if len(new_range) == 1:
        new_range.index.values[0] = pd.to_datetime(str(data_df.index.date[0]))
    temp_df = pd.merge(
        data_df, new_range, left_index=True, right_index=True, how="left"
    )
    temp_df = temp_df.ffill().bfill().astype(float)
    # Apply price = raw * factor + adjust to every price column.
    for col in ("open", "high", "close", "low"):
        temp_df[col] = temp_df[col] * temp_df["qfq_factor"] + temp_df["adjust"]
    temp_df = temp_df.apply(lambda x: round(x, 4))
    temp_df = temp_df.astype("float")
    return temp_df.iloc[:, :-2]
def stock_zh_index_daily(symbol: str = "sh000922") -> pd.DataFrame:
    """
    Sina Finance daily history for an index; heavy scraping may get the IP banned.

    :param symbol: index code, e.g. "sz399998"
    :return: OHLCV data indexed by date, all columns as float
    :rtype: pandas.DataFrame
    """
    res = requests.get(
        zh_sina_index_stock_hist_url.format(symbol), params={"d": "2020_2_4"}
    )
    decoder = py_mini_racer.MiniRacer()
    decoder.eval(hk_js_decode)
    # Strip the JS assignment wrapper, then decode with the site's own routine.
    payload = res.text.split("=")[1].split(";")[0].replace('"', "")
    records = decoder.call("d", payload)
    data_df = pd.DataFrame(records)
    data_df.index = pd.to_datetime(data_df["date"])
    del data_df["date"]
    return data_df.astype("float")
def parse_item(self, response):
    """Extract the mp3 URL and title from an item page and yield an IshuyinItem."""
    self.log('This is an item page! %s' % response.url)
    match = re.search(r'#jquery_jplayer_1.*?this', response.text, re.S | re.I)
    url = ''
    if match:
        # Drop the jPlayer boilerplate; the middle lines compute the URL.
        snippet = match.group()
        js = ''.join(snippet.splitlines()[2:-1])
        url = py_mini_racer.MiniRacer().eval(js)
    title = response.css(".jp-title ul li ::text").extract_first()
    if not (title and url):
        logging.debug("Parse Item error - Title:%s URL:%s", title, url)
        raise DropItem("Missing mp3 %s" % response.url)
    item = IshuyinItem()
    item['title'] = title.strip().split('-')[0].strip()
    item['file_urls'] = [url]
    item['album'] = self.album
    item['picture'] = self.picture
    item['artist'] = self.artist
    yield item
def stock_rank_xzjp_ths() -> pd.DataFrame:
    """
    THS (10jqka) data center - technical screening - insurer stake raises (险资举牌).
    http://data.10jqka.com.cn/financial/xzjp/
    :return: one row per disclosure
    :rtype: pandas.DataFrame
    """
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    url = "http://data.10jqka.com.cn/ajax/xzjp/field/DECLAREDATE/order/desc/ajax/1/free/1/"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        total_page = soup.find("span", attrs={"class": "page_info"}).text.split("/")[1]
    except AttributeError:
        # Single-page result sets carry no pager element.
        total_page = 1
    frames = []
    for page in tqdm(range(1, int(total_page) + 1), leave=False):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",  # refresh anti-spider cookie per request
        }
        # BUG FIX: the page number was missing from the URL, so every
        # iteration re-fetched page 1 (compare the sibling THS rank scrapers).
        url = f"http://data.10jqka.com.cn/ajax/xzjp/field/DECLAREDATE/order/desc/page/{page}/ajax/1/free/1/"
        r = requests.get(url, headers=headers)
        frames.append(pd.read_html(r.text, converters={"股票代码": str})[0])
    # DataFrame.append was removed in pandas 2.0.
    big_df = pd.concat(frames, ignore_index=True)
    big_df.columns = [
        '序号',
        '举牌公告日',
        '股票代码',
        '股票简称',
        '现价',
        '涨跌幅',
        '举牌方',
        '增持数量',
        '交易均价',
        '增持数量占总股本比例',
        '变动后持股总数',
        '变动后持股比例',
        '历史数据',
    ]
    # The old zfill(6) on 涨跌幅 was a copy-paste slip (zfill belongs to stock
    # codes); pd.to_numeric yields the same values without it.
    big_df["增持数量占总股本比例"] = big_df["增持数量占总股本比例"].astype(str).str.strip("%")
    big_df["变动后持股比例"] = big_df["变动后持股比例"].astype(str).str.strip("%")
    big_df["涨跌幅"] = pd.to_numeric(big_df["涨跌幅"], errors='coerce')
    big_df["增持数量占总股本比例"] = pd.to_numeric(big_df["增持数量占总股本比例"])
    big_df["变动后持股比例"] = pd.to_numeric(big_df["变动后持股比例"])
    big_df["举牌公告日"] = pd.to_datetime(big_df["举牌公告日"]).dt.date
    big_df["股票代码"] = big_df["股票代码"].astype(str).str.zfill(6)
    big_df["现价"] = pd.to_numeric(big_df["现价"])
    big_df["交易均价"] = pd.to_numeric(big_df["交易均价"])
    del big_df['历史数据']
    return big_df
def stock_zh_a_cdr_daily(
    symbol: str = "sh689009",
    start_date: str = "19900101",
    end_date: str = "22201116",
) -> pd.DataFrame:
    """
    Sina Finance daily history for A-share CDRs; heavy scraping may get the IP banned.
    # TODO watch how adjustment is handled
    https://finance.sina.com.cn/realstock/company/sh689009/nc.shtml
    :param symbol: CDR code, e.g. "sh689009"
    :type symbol: str
    :param start_date: start date like "20201103"
    :type start_date: str
    :param end_date: end date like "20201103"
    :type end_date: str
    :return: OHLCV data indexed by date, prices rounded to 2 decimals
    :rtype: pandas.DataFrame
    """
    res = requests.get(zh_sina_a_stock_hist_url.format(symbol))
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(hk_js_decode)
    # Run the site's own JS decoder on the payload.
    dict_list = js_code.call(
        "d", res.text.split("=")[1].split(";")[0].replace('"', "")
    )
    data_df = pd.DataFrame(dict_list)
    data_df.index = pd.to_datetime(data_df["date"])
    del data_df["date"]
    data_df = data_df.astype("float")
    # BUG FIX: .copy() so the rounding below mutates an owned frame instead of
    # a slice view (avoids SettingWithCopyWarning / silent no-op writes).
    temp_df = data_df[start_date:end_date].copy()
    for col in ("open", "high", "low", "close"):
        temp_df[col] = temp_df[col].round(2)
    return temp_df
def test_cannot_parse(self):
    """An unterminated function definition must raise JSParseException."""
    context = py_mini_racer.MiniRacer()
    broken_source = "var f = function("
    with six.assertRaisesRegex(
        self, py_mini_racer.JSParseException, '.*Unexpected end of input.*'
    ):
        context.eval(broken_source)
def get_us_stock_name() -> pd.DataFrame:
    """
    U.S. stocks' English name, Chinese name and symbol.
    Use the returned symbol with the download functions.
    http://finance.sina.com.cn/stock/usstock/sector.shtml
    :return: frame with columns name, cname, symbol
    :rtype: pandas.DataFrame
    """
    # The hashing JS is page-independent: build the interpreter once instead
    # of re-creating it (and re-evaluating the script) for every page.
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(js_hash_text)
    page_count = get_us_page_count()
    frames = []
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page
        )
        dict_list = js_code.call("d", us_js_decode)  # run the site's hash routine
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(
            us_sina_stock_list_url.format(dict_list),
            params=us_sina_stock_dict_payload,
        )
        # Strip the JSONP wrapper before parsing.
        data_json = json.loads(res.text[res.text.find("({") + 1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    return big_df[["name", "cname", "symbol"]]
def stock_zh_index_daily(symbol: str = "sh000922") -> pd.DataFrame:
    """
    Sina Finance index daily history; heavy scraping may get the IP banned.
    https://finance.sina.com.cn/realstock/company/sh000909/nc.shtml
    :param symbol: index code, e.g. "sz399998"
    :type symbol: str
    :return: daily OHLCV data with a date column
    :rtype: pandas.DataFrame
    """
    res = requests.get(
        zh_sina_index_stock_hist_url.format(symbol), params={"d": "2020_2_4"}
    )
    decoder = py_mini_racer.MiniRacer()
    decoder.eval(hk_js_decode)
    # Strip the JS assignment wrapper, then decode with the site's own routine.
    payload = res.text.split("=")[1].split(";")[0].replace('"', "")
    temp_df = pd.DataFrame(decoder.call("d", payload))
    temp_df["date"] = pd.to_datetime(temp_df["date"]).dt.date
    for col in ("open", "close", "high", "low", "volume"):
        temp_df[col] = pd.to_numeric(temp_df[col])
    return temp_df
def bond_zh_hs_daily(symbol: str = "sh010107") -> pd.DataFrame:
    """
    Sina Finance daily history for SH/SZ bonds; heavy scraping may get the IP banned.
    http://vip.stock.finance.sina.com.cn/mkt/#hs_z
    :param symbol: bond code, e.g. "sh010107"
    :type symbol: str
    :return: daily K-line data for the bond
    :rtype: pandas.DataFrame
    """
    today = datetime.datetime.now().strftime("%Y_%m_%d")
    res = requests.get(zh_sina_bond_hs_hist_url.format(symbol, today))
    decoder = py_mini_racer.MiniRacer()
    decoder.eval(hk_js_decode)
    # Strip the JS assignment wrapper, then decode with the site's own routine.
    payload = res.text.split("=")[1].split(";")[0].replace('"', "")
    data_df = pd.DataFrame(decoder.call("d", payload))
    data_df["date"] = pd.to_datetime(data_df["date"]).dt.date
    for col in ("open", "high", "low", "close"):
        data_df[col] = pd.to_numeric(data_df[col])
    return data_df
def test_cannot_parse(self):
    """An unterminated function definition must raise JSParseException."""
    ctx = py_mini_racer.MiniRacer()
    broken_source = "var f = function("
    expected_msg = '.*Unknown JavaScript error during parse.*'
    with self.assertRaisesRegex(py_mini_racer.JSParseException, expected_msg):
        ctx.eval(broken_source)
def test_cannot_parse(self):
    """Evaluating an incomplete function definition raises JSParseException."""
    ctx = py_mini_racer.MiniRacer()
    with self.assertRaises(py_mini_racer.JSParseException):
        ctx.eval("var f = function(")
def stock_rank_lxxd_ths() -> pd.DataFrame:
    """
    THS (10jqka) data center - technical screening - consecutive decliners (连续下跌).
    http://data.10jqka.com.cn/rank/lxxd/
    :return: one row per stock in the ranking
    :rtype: pandas.DataFrame
    """
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    url = "http://data.10jqka.com.cn/rank/lxxd/field/lxts/order/desc/page/1/ajax/1/free/1/"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        total_page = soup.find("span", attrs={"class": "page_info"}).text.split("/")[1]
    except AttributeError:
        # Single-page result sets carry no pager element.
        total_page = 1
    frames = []
    for page in tqdm(range(1, int(total_page) + 1), leave=False):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",  # refresh anti-spider cookie per request
        }
        url = f"http://data.10jqka.com.cn/rank/lxxd/field/lxts/order/desc/page/{page}/ajax/1/free/1/"
        r = requests.get(url, headers=headers)
        frames.append(pd.read_html(r.text, converters={"股票代码": str})[0])
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    big_df.columns = [
        "序号",
        "股票代码",
        "股票简称",
        "收盘价",
        "最高价",
        "最低价",
        "连涨天数",
        "连续涨跌幅",
        "累计换手率",
        "所属行业",
    ]
    big_df["连续涨跌幅"] = big_df["连续涨跌幅"].str.strip("%")
    big_df["累计换手率"] = big_df["累计换手率"].str.strip("%")
    big_df["连续涨跌幅"] = pd.to_numeric(big_df["连续涨跌幅"])
    big_df["累计换手率"] = pd.to_numeric(big_df["累计换手率"])
    big_df["收盘价"] = pd.to_numeric(big_df["收盘价"])
    big_df["最高价"] = pd.to_numeric(big_df["最高价"])
    big_df["最低价"] = pd.to_numeric(big_df["最低价"])
    big_df["连涨天数"] = pd.to_numeric(big_df["连涨天数"])
    return big_df
def _init(self):
    """Create the JS interpreter and evaluate every configured source file."""
    self._intp = py_mini_racer.MiniRacer()
    self._time_cs = 0
    # Initial read of all JS sources into the fresh interpreter.
    for filename in self._js_filenames:
        with open(filename, "r") as source:
            self._intp.eval(source.read())
async def _get_us_stock_name_async(
        request_per_batch: int = 15) -> pd.DataFrame:
    """
    Fetch the Sina U.S. stock name list concurrently.

    Pages are requested in batches of up to ``request_per_batch``; pages whose
    request fails with a ConnectionError are retried in a later batch, with a
    longer pause to appease Sina's anti-spider.
    :param request_per_batch: maximum number of in-flight requests per batch
    :return: frame with columns name, cname, symbol
    :rtype: pandas.DataFrame
    """
    count_per_page = 20
    page_count = get_us_page_count(count_per_page)
    # The hashing JS is page-independent: build the interpreter once.
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(js_hash_text)
    frames = []
    start = time.time()
    with tqdm(total=page_count) as pbar:
        all_pages = range(1, page_count + 1)
        finished_pages = []
        while len(finished_pages) < page_count:
            to_req_pages = [p for p in all_pages if p not in finished_pages]
            batch_size = min(request_per_batch, len(to_req_pages))
            tasks = {}
            for page in to_req_pages[:batch_size]:
                us_js_decode = f"US_CategoryService.getList?page={page}&num={count_per_page}&sort=&asc=0&market=&id="
                dict_list = js_code.call("d", us_js_decode)  # site's hash routine
                us_sina_stock_dict_payload.update({"page": "{}".format(page)})
                tasks[page] = asyncio.create_task(
                    request(us_sina_stock_list_url.format(dict_list),
                            us_sina_stock_dict_payload))
            n_failed_req = 0
            for page_no, task in tasks.items():
                try:
                    # BUG FIX: await inside the try block. The old code did a
                    # bare ``await task`` loop first, so a failed request
                    # raised outside the handler and aborted the function.
                    res = await task
                    data_json = json.loads(res[res.find("({") + 1:res.rfind(");")])
                    frames.append(pd.DataFrame(data_json["data"]))
                    finished_pages.append(page_no)
                    pbar.update(1)
                except requests.exceptions.ConnectionError as ident:
                    n_failed_req += 1
                    print(
                        f'{ident} page_no={page_no}, sleep for longer time and try in next batch'
                    )
            # Wait longer when Sina's anti-spider rejected something.
            interval_time = 10 if n_failed_req >= 1 else 3
            # BUG FIX: time.sleep() blocked the event loop inside a coroutine.
            await asyncio.sleep(interval_time)
    end = time.time()
    print('Cost time:', end - start)
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    return big_df[["name", "cname", "symbol"]]
def test_exception_thrown(self):
    """A JS function that throws must surface as JSEvalException."""
    ctx = py_mini_racer.MiniRacer()
    ctx.eval("var f = function() {throw 'error'};")
    with self.assertRaises(py_mini_racer.JSEvalException):
        ctx.eval("f()")
def stock_rank_cxd_ths(symbol: str = "创月新低") -> pd.DataFrame:
    """
    THS (10jqka) data center - technical screening - new lows (创新低).
    http://data.10jqka.com.cn/rank/cxd/
    :param symbol: choice of {"创月新低", "半年新低", "一年新低", "历史新低"}
    :type symbol: str
    :return: one row per stock hitting the selected new-low horizon
    :rtype: pandas.DataFrame
    """
    symbol_map = {
        "创月新低": "4",
        "半年新低": "3",
        "一年新低": "2",
        "历史新低": "1",
    }
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    url = f"http://data.10jqka.com.cn/rank/cxd/board/{symbol_map[symbol]}/field/stockcode/order/asc/page/1/ajax/1/free/1/"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        total_page = soup.find("span", attrs={"class": "page_info"}).text.split("/")[1]
    except AttributeError:
        # Single-page result sets carry no pager element.
        total_page = 1
    frames = []
    for page in tqdm(range(1, int(total_page) + 1), leave=False):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",  # refresh anti-spider cookie per request
        }
        url = f"http://data.10jqka.com.cn/rank/cxd/board/{symbol_map[symbol]}/field/stockcode/order/asc/page/{page}/ajax/1/free/1/"
        r = requests.get(url, headers=headers)
        frames.append(pd.read_html(r.text)[0])
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    big_df.columns = [
        "序号", "股票代码", "股票简称", "涨跌幅", "换手率", "最新价", "前期低点", "前期低点日期"
    ]
    big_df["股票代码"] = big_df["股票代码"].astype(str).str.zfill(6)
    big_df["涨跌幅"] = big_df["涨跌幅"].str.strip("%")
    big_df["换手率"] = big_df["换手率"].str.strip("%")
    big_df["前期低点日期"] = pd.to_datetime(big_df["前期低点日期"]).dt.date
    big_df["涨跌幅"] = pd.to_numeric(big_df["涨跌幅"])
    big_df["换手率"] = pd.to_numeric(big_df["换手率"])
    big_df["最新价"] = pd.to_numeric(big_df["最新价"])
    big_df["前期低点"] = pd.to_numeric(big_df["前期低点"])
    return big_df
def stock_dividents_cninfo(symbol: str = "600009") -> pd.DataFrame:
    """
    CNINFO per-stock historical dividends.
    http://webapi.cninfo.com.cn/#/company?companyid=600009
    :param symbol: stock code
    :type symbol: str
    :return: historical dividend records
    :rtype: pandas.DataFrame
    """
    # The API requires an "mcode" token derived from the current timestamp
    # by the site's own JS.
    ctx = py_mini_racer.MiniRacer()
    ctx.eval(js_str)
    mcode = ctx.call("mcode", str(int(time.time())))
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Content-Length": "0",
        "Host": "webapi.cninfo.com.cn",
        "mcode": mcode,
        "Origin": "http://webapi.cninfo.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://webapi.cninfo.com.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    url = "http://webapi.cninfo.com.cn/api/sysapi/p_sysapi1139"
    r = requests.post(url, params={"scode": symbol}, headers=headers)
    temp_df = pd.DataFrame(r.json()["records"])
    temp_df.columns = [
        "实施方案公告日期",
        "送股比例",
        "转增比例",
        "派息比例",
        "股权登记日",
        "除权日",
        "派息日",
        "股份到账日",
        "实施方案分红说明",
        "分红类型",
        "报告时间",
    ]
    temp_df["实施方案公告日期"] = pd.to_datetime(temp_df["实施方案公告日期"]).dt.date
    for ratio_col in ("送股比例", "转增比例", "派息比例"):
        temp_df[ratio_col] = pd.to_numeric(temp_df[ratio_col], errors="coerce")
    for date_col in ("股权登记日", "除权日", "派息日"):
        temp_df[date_col] = pd.to_datetime(temp_df[date_col], errors="coerce").dt.date
    return temp_df
def solveChallenge(self, text):
    """Reassemble the obfuscated challenge JS, evaluate it, and decode the result."""
    parts = re.split("{|}|;", text)
    # The replace() wrapper is stripped by the split above; restore it.
    replace_fn = "return message.replace(/./g, function(char, position) {"
    script = "".join(
        [parts[1] + "{", parts[2] + ";", replace_fn, parts[9] + ";})};", parts[0]]
    )
    solution = py_mini_racer.MiniRacer().eval(script)
    return self._shiftBits(solution)
def test_null_byte(self):
    """A string containing a NUL byte must round-trip through eval."""
    context = py_mini_racer.MiniRacer()
    expected = "\x00 my string!"
    # Return a JS string literal that embeds the null byte.
    script = "var str = \"" + expected + "\"; str;"
    self.assertEqual(context.eval(script), expected)
def solveChallenge(self, text):
    """Normalize the challenge text, rebuild the JS, evaluate it, and decode it."""
    # Strip tabs and any non-ASCII noise before splitting the script apart.
    cleaned = text.replace('\t', '', -1).encode('ascii', 'ignore').decode('utf-8')
    parts = re.split("{|}|;", cleaned)
    # The replace() wrapper is stripped by the split above; restore it.
    replace_fn = "return message.replace(/./g, function(char, position) {"
    pieces = [parts[1] + "{", parts[2] + ";", replace_fn, parts[7] + ";})};", parts[0]]
    solution = py_mini_racer.MiniRacer().eval("".join(pieces))
    return self._shiftBits(solution)
def find_dlbutton(html):
    """
    Locate the inline script that computes the "dlbutton" download URL and
    print the evaluated URL expression.

    :param html: raw page HTML
    """
    fullpage = BeautifulSoup(html, "html.parser")
    # (The old unused ``fullpage.find("dlbutton")`` lookup was removed.)
    for e in fullpage.find_all("script", type="text/javascript"):
        if "dlbutton" in e.text:
            for statement in e.text.split(";"):
                if "dlbutton" in statement:
                    # Right-hand side of the assignment is a JS expression.
                    expr = statement.split("=")[1]
                    ctx = py_mini_racer.MiniRacer()
                    print(ctx.eval(expr))
def covid_tracker():
    """Scrape covidtracker.5lab.co, run its bootstrap script and export case data."""
    ctx = py_mini_racer.MiniRacer()
    r = session.get('https://covidtracker.5lab.co/')
    if r.status_code == 200:
        # The case data lives inside the page's first inline script.
        raw_script = r.html.xpath('/html/body/script[1]/text()')
        data_exe = ctx.execute(util.m_func(raw_script))
        Export(data_exe['state']['cases'], 'covid-tracker')
        print('Extract Covid-Tracker Successful')
def execute_javascript(function, source, args):
    """
    Define ``function`` with body ``source`` in a fresh JS context and call it
    with the given string arguments.

    :param function: JS identifier to define and invoke
    :param source: JS statements forming the function body
    :param args: iterable of strings passed as the call arguments
    :return: the value returned by the JS call
    """
    interpreter = py_mini_racer.MiniRacer()
    interpreter.eval('%s = function () { %s }' % (function, source))

    def stringify(s):
        # BUG FIX: escape backslashes before quotes -- an argument containing
        # ``\`` or ``\"`` used to corrupt the generated JS string literal.
        return '"%s"' % s.replace('\\', '\\\\').replace('"', '\\"')

    eval_args = ','.join(stringify(a) for a in args)
    return interpreter.eval('%s(%s)' % (function, eval_args),
                            timeout=EvalClient.JS_TIMEOUT,
                            max_memory=EvalClient.JS_MEMORY)
def predict(self, X):
    """Evaluate the exported scoring script against the feature vector X."""
    ctx = py_mini_racer.MiniRacer()
    ctx.eval(get_file_content(self.script_path))
    # Build the JS call with each feature formatted for embedding.
    formatted_args = ",".join(map(utils.format_arg, X))
    return ctx.execute(f"score([{formatted_args}])")
def stock_board_concept_cons_ths(symbol: str = "阿里巴巴概念") -> pd.DataFrame:
    """
    THS (10jqka) concept-board constituent stocks.
    http://q.10jqka.com.cn/gn/detail/code/301558/
    :param symbol: board name, e.g. "阿里巴巴概念"
    :type symbol: str
    :return: constituent stocks of the board
    :rtype: pandas.DataFrame
    """
    # Resolve the board name to its numeric code via the name listing.
    stock_board_ths_map_df = stock_board_concept_name_ths()
    symbol = (
        stock_board_ths_map_df[stock_board_ths_map_df["概念名称"] == symbol]["网址"]
        .values[0]
        .split("/")[-2]
    )
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    url = f"http://q.10jqka.com.cn/gn/detail/field/264648/order/desc/page/1/ajax/1/code/{symbol}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        page_num = int(soup.find_all("a", attrs={"class": "changePage"})[-1]["page"])
    except IndexError:
        # Single-page boards render no pager links.
        page_num = 1
    frames = []
    for page in tqdm(range(1, page_num + 1), leave=False):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",  # refresh anti-spider cookie per request
        }
        url = f"http://q.10jqka.com.cn/gn/detail/field/264648/order/desc/page/{page}/ajax/1/code/{symbol}"
        r = requests.get(url, headers=headers)
        frames.append(pd.read_html(r.text)[0])
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    big_df.rename(
        {
            "涨跌幅(%)": "涨跌幅",
            "涨速(%)": "涨速",
            "换手(%)": "换手",
            "振幅(%)": "振幅",
        },
        inplace=True,
        axis=1,
    )
    del big_df["加自选"]
    big_df["代码"] = big_df["代码"].astype(str).str.zfill(6)
    return big_df
def stock_board_industry_cons_ths(symbol: str = "半导体及元件") -> pd.DataFrame:
    """
    THS (10jqka) industry-board constituent stocks.
    http://q.10jqka.com.cn/thshy/detail/code/881121/
    :param symbol: board name, e.g. "半导体及元件"
    :type symbol: str
    :return: constituent stocks of the board
    :rtype: pandas.DataFrame
    """
    # Resolve the board name to its numeric code via the name listing.
    stock_board_ths_map_df = stock_board_industry_name_ths()
    symbol = (
        stock_board_ths_map_df[stock_board_ths_map_df["name"] == symbol]["url"]
        .values[0]
        .split("/")[-2]
    )
    js_code = py_mini_racer.MiniRacer()
    js_code.eval(_get_file_content_ths("ths.js"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Cookie": f"v={js_code.call('v')}",
    }
    url = f"http://q.10jqka.com.cn/thshy/detail/field/199112/order/desc/page/1/ajax/1/code/{symbol}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        page_num = int(soup.find_all("a", attrs={"class": "changePage"})[-1]["page"])
    except IndexError:
        # Single-page boards render no pager links.
        page_num = 1
    frames = []
    for page in tqdm(range(1, page_num + 1)):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
            "Cookie": f"v={js_code.call('v')}",  # refresh anti-spider cookie per request
        }
        url = f"http://q.10jqka.com.cn/thshy/detail/field/199112/order/desc/page/{page}/ajax/1/code/{symbol}"
        r = requests.get(url, headers=headers)
        frames.append(pd.read_html(r.text)[0])
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    big_df = pd.concat(frames, ignore_index=True)
    big_df.rename(
        {
            "涨跌幅(%)": "涨跌幅",
            "涨速(%)": "涨速",
            "换手(%)": "换手",
            "振幅(%)": "振幅",
        },
        inplace=True,
        axis=1,
    )
    del big_df["加自选"]
    big_df["代码"] = big_df["代码"].astype(str).str.zfill(6)
    return big_df
def prepare_js(self):
    """Build the JS runtime: load required libraries, then the site's script."""
    logging.info('prepare js runtime')
    script_path = './{}'.format(self.etree.getroot().attrib.get('script', 'script'))
    js_tag = self.etree.find(script_path)
    self.js = py_mini_racer.MiniRacer()
    # Load each declared dependency before the main script.
    for item in js_tag.findall('./require/item'):
        libcode = self.url.get(item.attrib['url'])
        logging.info('load script require: {}'.format(item.attrib['url']))
        self.js.eval(libcode)
    logging.info('load script')
    self.js.eval(js_tag.find('./code').text)
    self.js.eval('SiteD = {}')  # TODO: import the SiteD global variable