def record(self, entity, start, end, size, timestamps):
    for page in range(1, 5):
        resp = requests.get(self.category_stocks_url.format(page, entity.code))
        try:
            if resp.text is None or resp.text == "null":
                break
            category_jsons = demjson3.decode(resp.text)
            the_list = []
            for category in category_jsons:
                stock_code = category["code"]
                stock_id = china_stock_code_to_id(stock_code)
                block_id = entity.id
                the_list.append(
                    {
                        "id": "{}_{}".format(block_id, stock_id),
                        "entity_id": block_id,
                        "entity_type": "block",
                        "exchange": entity.exchange,
                        "code": entity.code,
                        "name": entity.name,
                        "timestamp": now_pd_timestamp(),
                        "stock_id": stock_id,
                        "stock_code": stock_code,
                        "stock_name": category["name"],
                    }
                )
            if the_list:
                df = pd.DataFrame.from_records(the_list)
                df_to_db(data_schema=self.data_schema, df=df, provider=self.provider, force_update=True)
            self.logger.info("finish recording BlockStock:{},{}".format(entity.category, entity.name))
        except Exception as e:
            self.logger.error("error: {}, resp.text: {}".format(e, resp.text))
        self.sleep()
def tokenizer_dict(text, text_cmd='', substring='', current_cmd={}):
    tokens = {'final': set(), 'new': set()}
    if len(text) < 6:
        return tokens
    if text[:1] + text[-1:] not in ['{}', '[]']:
        return tokens

    dct = None
    try:
        # JSON
        dct = json.loads(text)
    except Exception:
        pass
    if dct is None:
        try:
            # Python dict
            dct = ast.literal_eval(text)
        except Exception:
            pass
    if dct is None:
        try:
            # JavaScript object
            dct = demjson3.decode(text)
        except Exception:
            pass

    if dct is not None:
        dct_tokens = dict_keys_values(dct)
        values = list_str(dct_tokens['values'])
        tokens = {
            'final': set(list_str(dct_tokens['keys']) + values),
            'new': set(values),
        }
    return tokens
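# Illustrative sketch (not part of the original module): the three-stage fallback in
# tokenizer_dict exists because json.loads only accepts strict JSON, ast.literal_eval
# only accepts Python literals, and demjson3.decode also tolerates JavaScript-style
# objects with unquoted keys and single-quoted strings. The sample input is hypothetical.
import ast
import json

import demjson3

js_object = "{count: 3, name: 'demo'}"       # JavaScript-style, not valid JSON
try:
    json.loads(js_object)
except json.JSONDecodeError:
    pass                                     # rejected by the strict JSON parser
try:
    ast.literal_eval(js_object)
except (ValueError, SyntaxError):
    pass                                     # rejected as a Python literal
decoded = demjson3.decode(js_object)         # non-strict decode succeeds
assert decoded == {"count": 3, "name": "demo"}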
def run(self):
    # Fetch the Shanghai (SSE) ETF list
    url = "http://query.sse.com.cn/commonQuery.do?sqlId=COMMON_SSE_ZQPZ_ETFLB_L_NEW"
    response = requests.get(url, headers=DEFAULT_SH_ETF_LIST_HEADER)
    response_dict = demjson3.decode(response.text)
    df = pd.DataFrame(response_dict.get("result", []))
    self.persist_etf_list(df, exchange="sh")
    self.logger.info("沪市 ETF 列表抓取完成...")

    # Fetch Shanghai ETF constituent stocks
    self.download_sh_etf_component(df)
    self.logger.info("沪市 ETF 成分股抓取完成...")

    # Fetch the Shenzhen (SZSE) ETF list
    url = "http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1945"
    response = requests.get(url)
    df = pd.read_excel(io.BytesIO(response.content), dtype=str)
    self.persist_etf_list(df, exchange="sz")
    self.logger.info("深市 ETF 列表抓取完成...")

    # Fetch Shenzhen ETF constituent stocks
    self.download_sz_etf_component(df)
    self.logger.info("深市 ETF 成分股抓取完成...")
def get_news(entity_id, ps=200, index=1):
    sec_id = to_em_sec_id(entity_id=entity_id)
    url = (
        f"https://np-listapi.eastmoney.com/comm/wap/getListInfo?cb=callback&client=wap&type=1"
        f"&mTypeAndCode={sec_id}&pageSize={ps}&pageIndex={index}"
        f"&callback=jQuery1830017478247906740352_{now_timestamp() - 1}&_={now_timestamp()}"
    )
    resp = requests.get(url)
    # Sample item returned by the API:
    # {
    #     "Art_ShowTime": "2022-02-11 14:29:25",
    #     "Art_Image": "",
    #     "Art_MediaName": "每日经济新闻",
    #     "Art_Code": "202202112274017262",
    #     "Art_Title": "潍柴动力:巴拉德和锡里斯不纳入合并财务报表范围",
    #     "Art_SortStart": "1644560965017262",
    #     "Art_VideoCount": 0,
    #     "Art_OriginUrl": "http://finance.eastmoney.com/news/1354,202202112274017262.html",
    #     "Art_Url": "http://finance.eastmoney.com/a/202202112274017262.html",
    # }
    if resp.status_code == 200:
        # strip the JSONP callback wrapper before decoding
        json_text = resp.text[resp.text.index("(") + 1 : resp.text.rindex(")")]
        json_result = demjson3.decode(json_text)["data"]["list"]
        if json_result:
            json_result = [
                {
                    "id": f'{entity_id}_{item["Art_ShowTime"]}',
                    "entity_id": entity_id,
                    "timestamp": to_pd_timestamp(item["Art_ShowTime"]),
                    "news_title": item["Art_Title"],
                }
                for item in json_result
            ]
            next_data = get_news(entity_id=entity_id, ps=ps, index=index + 1)
            if next_data:
                return json_result + next_data
            else:
                return json_result
def populate_sh_etf_type(df: pd.DataFrame):
    """
    Fill in the TYPE corresponding to each Shanghai (SSE) ETF code in the list data.

    :param df: ETF list data
    :return: the list data with each ETF's TYPE attached
    """
    query_url = (
        "http://query.sse.com.cn/infodisplay/queryETFNewAllInfo.do?"
        "isPagination=false&type={}&pageHelp.pageSize=25"
    )
    type_df = pd.DataFrame()
    for etf_class in [1, 2]:
        url = query_url.format(etf_class)
        response = requests.get(url, headers=DEFAULT_SH_ETF_LIST_HEADER)
        response_dict = demjson3.decode(response.text)
        response_df = pd.DataFrame(response_dict.get("result", []))
        response_df = response_df[["fundid1", "etftype"]]
        type_df = pd.concat([type_df, response_df])

    result_df = df.copy()
    result_df = result_df.sort_values(by="FUND_ID").reset_index(drop=True)
    type_df = type_df.sort_values(by="fundid1").reset_index(drop=True)
    result_df["ETF_TYPE"] = type_df["etftype"]
    return result_df
def __init__(self, mid: int = 0, page: str = None, fromImage=False, offset: int = None, limit: int = None):
    self.offset = offset
    self.limit = limit
    self.artistId = mid

    if page is not None:
        payload = None
        # detect if image count != 0
        if not fromImage:
            payload = demjson3.decode(page)
            if payload["error"]:
                raise PixivException(
                    payload["message"], errorCode=PixivException.OTHER_MEMBER_ERROR, htmlPage=page
                )
            if payload["body"] is None:
                raise PixivException(
                    "Missing body content, possibly the artist id does not exist.",
                    errorCode=PixivException.USER_ID_NOT_EXISTS,
                    htmlPage=page,
                )
            self.ParseImages(payload["body"])
            self.ParseMangaList(payload["body"])
            self.ParseNovelList(payload["body"])
        else:
            payload = self.parseJs(page)

        self.isLastPage = True
        self.haveImages = True

        # parse artist info
        self.ParseInfo(payload, fromImage)
def record(self, entity, start, end, size, timestamps):
    # This URL does not support pagination; if we need more than the default size,
    # we can only request the maximum number of records at once.
    if start is None or size > self.default_size:
        size = 8000

    param = {
        "security_item": entity,
        "level": self.level.value,
        "size": size,
    }
    security_item = param["security_item"]
    size = param["size"]
    url = ChinaETFDayKdataRecorder.url.format(security_item.exchange, security_item.code, size)
    response = requests.get(url)
    response_json = demjson3.decode(response.text)

    if response_json is None or len(response_json) == 0:
        return []

    df = pd.DataFrame(response_json)
    df.rename(columns={"day": "timestamp"}, inplace=True)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["name"] = security_item.name
    df["provider"] = "sina"
    df["level"] = param["level"]
    return df.to_dict(orient="records")
def fetch_cumulative_net_value(self, security_item, start, end) -> pd.DataFrame:
    query_url = (
        "http://api.fund.eastmoney.com/f10/lsjz?"
        "fundCode={}&pageIndex={}&pageSize=200&startDate={}&endDate={}"
    )
    page = 1
    df = pd.DataFrame()
    while True:
        url = query_url.format(security_item.code, page, to_time_str(start), to_time_str(end))
        response = requests.get(url, headers=EASTMONEY_ETF_NET_VALUE_HEADER)
        response_json = demjson3.decode(response.text)
        response_df = pd.DataFrame(response_json["Data"]["LSJZList"])

        # last page reached
        if response_df.empty:
            break

        response_df["FSRQ"] = pd.to_datetime(response_df["FSRQ"])
        response_df["JZZZL"] = pd.to_numeric(response_df["JZZZL"], errors="coerce")
        response_df["LJJZ"] = pd.to_numeric(response_df["LJJZ"], errors="coerce")
        response_df = response_df.fillna(0)
        response_df.set_index("FSRQ", inplace=True, drop=True)

        df = pd.concat([df, response_df])
        page += 1
        self.sleep()
    return df
def record(self, entity, start, end, size, timestamps):
    json_results = []
    for timestamp in timestamps:
        timestamp_str = to_time_str(timestamp)
        url = self.url.format(timestamp_str)
        response = requests.get(url=url, headers=DEFAULT_SH_SUMMARY_HEADER)
        # strip the JSONP wrapper before decoding
        results = demjson3.decode(response.text[response.text.index("(") + 1 : response.text.index(")")])["result"]
        result = [result for result in results if result["productType"] == "1"]
        if result and len(result) == 1:
            result_json = result[0]
            # Some fields are missing in older data; default them to 0.0.
            json_results.append(
                {
                    "provider": "exchange",
                    "timestamp": timestamp,
                    "name": "上证指数",
                    "pe": to_float(result_json["profitRate"], 0.0),
                    "total_value": to_float(result_json["marketValue1"] + "亿", 0.0),
                    "total_tradable_vaule": to_float(result_json["negotiableValue1"] + "亿", 0.0),
                    "volume": to_float(result_json["trdVol1"] + "万", 0.0),
                    "turnover": to_float(result_json["trdAmt1"] + "亿", 0.0),
                    "turnover_rate": to_float(result_json["exchangeRate"], 0.0),
                }
            )
        if len(json_results) > 30:
            return json_results
    return json_results
def get_exchange_data(interface, session=None) -> list:
    """
    Fetches 5-minute data for any PJM interface for the current day.
    Extracts load and timestamp data from the HTML source, then joins them together.
    """
    base_url = "http://www.pjm.com/Charts/InterfaceChart.aspx?open="
    url = base_url + exchange_mapping[interface]

    s = session or requests.Session()
    req = s.get(url)
    soup = BeautifulSoup(req.content, "html.parser")

    scripts = soup.find(
        "script",
        {
            "type": "text/javascript",
            "src": "/assets/js/Highcharts/HighCharts/highcharts.js",
        },
    )
    exchange_script = scripts.find_next_sibling("script")

    load_pattern = r"var load = (\[(.*)\])"
    load = re.search(load_pattern, str(exchange_script)).group(1)
    load_vals = demjson.decode(load)[0]
    # Occasionally load_vals contains a null at the end of the list which must be caught.
    actual_load = [float(val) for val in load_vals if val is not None]

    time_pattern = r"var timeArray = (\[(.*)\])"
    time_array = re.search(time_pattern, str(exchange_script)).group(1)
    time_vals = demjson.decode(time_array)

    flows = zip(actual_load, time_vals)

    arr_date = arrow.now("America/New_York").floor("day")

    converted_flows = []
    for flow in flows:
        arr_time = arrow.get(flow[1], "h:mm A")
        arr_dt = arr_date.replace(hour=arr_time.hour, minute=arr_time.minute).datetime
        converted_flow = (flow[0], arr_dt)
        converted_flows.append(converted_flow)

    return converted_flows
def extract_data(session=None) -> tuple:
    """
    Makes a request to the PJM data url.
    Finds the timestamp of the current data and converts it into a useful form.
    Finds generation data inside a script tag.
    :return: tuple of generation data and datetime.
    """
    s = session or requests.Session()
    req = s.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')

    try:
        time_div = soup.find("div", id="asOfDate").text
    except AttributeError:
        raise LookupError('No data is available for US-PJM.')

    time_pattern = re.compile(r"""(\d{1,2}    # Hour can be 1/2 digits.
                                   :          # Separator.
                                   \d{2})\s   # Minutes must be 2 digits with a space after.
                                   (a.m.|p.m.)  # Either am or pm allowed.""", re.X)

    latest_time = re.search(time_pattern, time_div)
    time_data = latest_time.group(1).split(":")
    am_or_pm = latest_time.group(2)

    hour = int(time_data[0])
    minute = int(time_data[1])

    # Time format used by PJM is slightly unusual and needs to be converted so arrow can use it.
    if am_or_pm == "p.m." and hour != 12:
        # Time needs to be in 24hr format
        hour += 12
    elif am_or_pm == "a.m." and hour == 12:
        # Midnight is 12 a.m.
        hour = 0

    arr_dt = arrow.now('America/New_York').replace(hour=hour, minute=minute)
    future_check = arrow.now('America/New_York')

    if arr_dt > future_check:
        # Generation mix lags 1-2hrs behind present.
        # This check prevents data near midnight being given the wrong date.
        arr_dt = arr_dt.shift(days=-1)

    dt = arr_dt.floor('minute').datetime

    generation_mix_div = soup.find("div", id="rtschartallfuelspjmGenFuelM_container")
    generation_mix_script = generation_mix_div.next_sibling

    pattern = r'series: \[(.*)\]'
    script_data = re.search(pattern, str(generation_mix_script)).group(1)

    # demjson is required because the script data is javascript, not valid json.
    raw_data = demjson.decode(script_data)
    data = raw_data["data"]

    return data, dt
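# Minimal sketch of the scrape-then-decode pattern used by the two PJM helpers above:
# a JavaScript array is pulled out of inline <script> text with a regex and decoded
# with demjson (demjson3 is the maintained fork with the same decode API), which
# accepts JS literals such as single-quoted strings that json.loads rejects.
# The script_text below is a made-up stand-in for the real page content.
import re

import demjson

script_text = "var load = [512.3, 498.7, null]; var timeArray = ['1:00 AM', '1:05 AM', '1:10 AM'];"

load = re.search(r"var load = (\[.*?\])", script_text).group(1)
load_vals = demjson.decode(load)                  # -> [512.3, 498.7, None]
actual_load = [float(v) for v in load_vals if v is not None]

times = re.search(r"var timeArray = (\[.*?\])", script_text).group(1)
time_vals = demjson.decode(times)                 # -> ['1:00 AM', '1:05 AM', '1:10 AM']

print(list(zip(actual_load, time_vals)))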
def __init__(self, artist_id, page, tzInfo=None, dateFormat=None):
    self.posts = list()
    self.dateFormat = dateFormat
    self._tzInfo = tzInfo

    if page is not None:
        post_json = demjson3.decode(page)
        self.parse_artist(post_json["data"])
def get(self, request, *args, **kwargs):
    data_json = os.path.join(settings.BASE_DIR, 'tyadmin_api/menu.json')
    with open(data_json, encoding='utf-8') as fr:
        content = fr.read()
    import demjson3
    content = demjson3.decode(content)
    print(json.dumps(content, ensure_ascii=False))
    return JsonResponse({
        "data": content
    })
def parseJs(self, page):
    parsed = BeautifulSoup(page, features="html5lib")
    jss = parsed.find('meta', attrs={'id': 'meta-preload-data'})

    # cleanup
    parsed.decompose()
    del parsed

    if jss is None or len(jss["content"]) == 0:
        return None  # Possibly error page

    payload = demjson3.decode(jss["content"])
    return payload
def parseArtistIds(cls, page):
    ids = list()
    js = demjson3.decode(page)

    if "error" in js and js["error"]:
        raise PixivException("Error when requesting Fanbox", 9999, page)

    if "body" in js and js["body"] is not None:
        js_body = js["body"]
        if "supportingPlans" in js["body"]:
            js_body = js_body["supportingPlans"]
        for creator in js_body:
            ids.append(creator["user"]["userId"])

    return ids
def __init__(self, post_id, artist, page, tzInfo=None, dateFormat=None):
    self.imageUrls = list()
    self.imageResizedUrls = list()
    self.imageId = int(post_id)
    self._tzInfo = tzInfo
    self.dateFormat = dateFormat

    if page is not None:
        post_json = demjson3.decode(page)
        if artist is None:
            artist_id = post_json["data"]["item"]["user"]["id"]
            self.artist = SketchArtist(artist_id, page, tzInfo, dateFormat)
        else:
            self.artist = artist
        self.parse_post(post_json["data"]["item"])
def download_sh_etf_component(self, df: pd.DataFrame):
    """
    ETF_CLASS => 1: single-market ETF  2: cross-market ETF  3: cross-border ETF
                 5: bond ETF  6: gold ETF

    :param df: ETF list data
    :return: None
    """
    query_url = (
        "http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?"
        "isPagination=false&type={}&etfClass={}"
    )

    etf_df = df[(df["ETF_CLASS"] == "1") | (df["ETF_CLASS"] == "2")]
    etf_df = self.populate_sh_etf_type(etf_df)

    for _, etf in etf_df.iterrows():
        url = query_url.format(etf["ETF_TYPE"], etf["ETF_CLASS"])
        response = requests.get(url, headers=DEFAULT_SH_ETF_LIST_HEADER)
        response_dict = demjson3.decode(response.text)
        response_df = pd.DataFrame(response_dict.get("result", []))

        etf_code = etf["FUND_ID"]
        etf_id = f"etf_sh_{etf_code}"

        response_df = response_df[["instrumentId", "instrumentName"]].copy()
        response_df.rename(
            columns={"instrumentId": "stock_code", "instrumentName": "stock_name"},
            inplace=True,
        )
        response_df["entity_id"] = etf_id
        response_df["entity_type"] = "etf"
        response_df["exchange"] = "sh"
        response_df["code"] = etf_code
        response_df["name"] = etf["FUND_NAME"]
        response_df["timestamp"] = now_pd_timestamp()
        response_df["stock_id"] = response_df["stock_code"].apply(lambda code: china_stock_code_to_id(code))
        response_df["id"] = response_df["stock_id"].apply(lambda x: f"{etf_id}_{x}")

        df_to_db(data_schema=self.data_schema, df=response_df, provider=self.provider)
        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')

        self.sleep()
def parse_posts(self, page):
    post_json = demjson3.decode(page)
    links_root = post_json["_links"]
    if "next" in links_root:
        self.next_page = links_root["next"]["href"]
    else:
        self.next_page = None

    for item in post_json["data"]["items"]:
        post_id = item["id"]
        post = SketchPost(post_id, None, None, self._tzInfo, self.dateFormat)
        post.parse_post(item)
        post.artist = self
        self.posts.append(post)
async def _current_data(self) -> Dict[str, Any]:
    """
    Retrieve the data from the printer.
    Raises SyncThruAPINotSupported if the host does not support the SyncThru
    JSON API and the connection mode requires it.
    """
    data = {"status": {"hrDeviceStatus": SyncthruState.OFFLINE.value}}

    if self.connection_mode in [ConnectionMode.AUTO, ConnectionMode.API]:
        url = "{}{}".format(self.url, ENDPOINT_API)
        try:
            async with self._session.get(url) as response:
                res = demjson3.decode(
                    await response.text(), strict=False
                )  # type: Dict[str, Any]
                # if we get something back from this endpoint,
                # we directly return it
                return res
        except (aiohttp.ClientError, asyncio.TimeoutError):
            pass
        except demjson3.JSONDecodeError:
            # If no JSON data is provided but we want to only connect
            # in this mode, raise an Exception
            if self.connection_mode != ConnectionMode.AUTO:
                raise SyncThruAPINotSupported(
                    "Invalid host, does not support SyncThru JSON API."
                )

    if self.connection_mode in [ConnectionMode.AUTO, ConnectionMode.HTML]:
        any_connection_successful = False
        for endpoint_url, parsers in ENDPOINT_HTML_PARSERS.items():
            html_url = "{}{}".format(self.url, endpoint_url)
            try:
                async with self._session.get(html_url) as response:
                    html_res = await response.text()
                    any_connection_successful = True
                    for parser in parsers:
                        parser(data).feed(html_res)
            except (aiohttp.ClientError, asyncio.TimeoutError):
                pass

        # if any HTML endpoint was reachable, set the device status to unknown
        # instead of offline
        if (
            any_connection_successful
            and data["status"]["hrDeviceStatus"] == SyncthruState.OFFLINE.value
        ):
            data["status"]["hrDeviceStatus"] = SyncthruState.UNKNOWN.value

    return data
def parse_resp(resp: Response, key=None):
    if resp.status_code != 200:
        raise Exception(f"code:{resp.status_code},msg:{resp.content}")
    # {
    #     "re": true,
    #     "message": "",
    #     "result": {}
    # }
    result = resp.text
    js_text = result[result.index("(") + 1 : result.index(")")]
    ret = demjson3.decode(js_text)
    logger.info(f"ret:{ret}")
    data = ret.get("data")
    if data and key:
        result_value = data.get(key)
    else:
        result_value = data

    return ret["state"], result_value
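# Illustrative sketch (not from the original code) of the JSONP-unwrapping idiom used
# in parse_resp above (and in get_news / the summary record function earlier): the
# server wraps its JSON in callback(...), so the text between the first "(" and the
# last ")" is sliced out before decoding. FakeResponse and its payload are hypothetical
# test fixtures.
import demjson3


class FakeResponse:
    status_code = 200
    text = 'callback({"state": true, "data": {"price": 10.5}})'


resp = FakeResponse()
js_text = resp.text[resp.text.index("(") + 1 : resp.text.rindex(")")]
payload = demjson3.decode(js_text)
print(payload["state"], payload["data"]["price"])   # -> True 10.5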
def parsePosts(self, page) -> List[FanboxPost]:
    js = demjson3.decode(page)

    if "error" in js and js["error"]:
        raise PixivException(
            f"Error when requesting Fanbox artist: {self.artistId}", 9999, page
        )

    if js["body"] is not None:
        js_body = js["body"]
        posts = list()

        if "creator" in js_body:
            self.artistName = js_body["creator"]["user"]["name"]

        if "post" in js_body:
            # new api
            post_root = js_body["post"]
        else:
            # https://www.pixiv.net/ajax/fanbox/post?postId={0}
            # or old api
            post_root = js_body

        for jsPost in post_root["items"]:
            post_id = int(jsPost["id"])
            post = FanboxPost(post_id, self, jsPost, tzInfo=self._tzInfo)
            posts.append(post)
            # sanity check
            assert self.artistId == int(jsPost["user"]["userId"]), "Different user id from constructor!"

        self.nextUrl = post_root["nextUrl"]
        if self.nextUrl is not None and len(self.nextUrl) > 0:
            self.hasNextPage = True

        return posts
def load_json_from_file(self, file_name):
    with open(file_name, 'r') as f:
        self.options = Dict(
            demjson.decode(f.read().encode("ascii", "ignore")))
    return self.options
def load_json(self, options_string):
    self.options = Dict(
        demjson.decode(options_string.encode("ascii", "ignore")))
    return self.options
def add_dataset(self, dataset_plot_cfg):
    self.options.data.datasets.append(
        Dict(demjson.decode(dataset_plot_cfg.encode("ascii", "ignore"))))
def get_page_info(self, page, **kwargs) -> Profile:
    result = {}
    desc = None
    try:
        about_url = f'/{page}/about/'
        logger.debug(f"Requesting page from: {about_url}")
        resp = self.get(about_url)
        desc = resp.html.find("meta[name='description']", first=True)
        result["about"] = resp.html.find('#pages_msite_body_contents,div.aboutme', first=True).text
        cover_photo = resp.html.find("#msite-pages-header-contents i.coverPhoto", first=True)
        if cover_photo:
            match = re.search(r"url\('(.+)'\)", cover_photo.attrs["style"])
            if match:
                result["cover_photo"] = utils.decode_css_url(match.groups()[0])
        profile_photo = resp.html.find("#msite-pages-header-contents img", first=True)
        if profile_photo:
            result["profile_photo"] = profile_photo.attrs["src"]
    except Exception as e:
        logger.error(e)
    try:
        url = f'/{page}/'
        logger.debug(f"Requesting page from: {url}")
        resp = self.get(url)
        desc = resp.html.find("meta[name='description']", first=True)
        ld_json = None
        try:
            ld_json = resp.html.find("script[type='application/ld+json']", first=True).text
        except:
            logger.error("No ld+json element")
            url = f'/{page}/community'
            logger.debug(f"Requesting page from: {url}")
            try:
                community_resp = self.get(url)
                ld_json = community_resp.html.find("script[type='application/ld+json']", first=True).text
            except:
                logger.error("No ld+json element")
        if ld_json:
            meta = demjson.decode(ld_json)
            result.update(meta["author"])
            result["type"] = result.pop("@type")
            for interaction in meta.get("interactionStatistic", []):
                if interaction["interactionType"] == "http://schema.org/FollowAction":
                    result["followers"] = interaction["userInteractionCount"]
        try:
            result["about"] = resp.html.find('#pages_msite_body_contents>div>div:nth-child(2)', first=True).text
        except Exception as e:
            logger.error(e)
            result = self.get_profile(page)
        for elem in resp.html.find("div[data-sigil*='profile-intro-card-log']"):
            text = elem.text.split("\n")[0]
            if " Followers" in text:
                result["followers"] = utils.convert_numeric_abbr(text.replace(" Followers", ""))
            if text.startswith("Price Range"):
                result["Price Range"] = text.split(" · ")[-1]
            link = elem.find("a[href]", first=True)
            if link:
                link = link.attrs["href"]
                if "active_ads" in link:
                    result["active_ads_link"] = link
                if "maps.google.com" in link:
                    result["map_link"] = parse_qs(urlparse(link).query).get("u")[0]
                    result["address"] = text
                if link.startswith("tel:"):
                    result["phone"] = link.replace("tel:", "")
                if link.startswith("mailto:"):
                    result["email"] = link.replace("mailto:", "")
        result["rating"] = resp.html.find("div[data-nt='FB:TEXT4']")[1].text
    except Exception as e:
        logger.error(e)
    if desc:
        logger.debug(desc.attrs["content"])
        match = re.search(r'\..+?(\d[\d,.]+).+·', desc.attrs["content"])
        if match:
            result["likes"] = utils.parse_int(match.groups()[0])
        bits = desc.attrs["content"].split("·")
        if len(bits) == 3:
            result["people_talking_about_this"] = utils.parse_int(bits[1])
            result["checkins"] = utils.parse_int(bits[2])
    result["reviews"] = self.get_page_reviews(page, **kwargs)
    return result