def get_changes(self) -> pd.DataFrame:
    """Build the S&P 500 constituent-change history from Wikipedia.

    Downloads the changes table from ``self.WIKISP500_CHANGES_URL`` and
    emits one row per (date, type, symbol) event.  Add events are snapped
    to the trading date itself (shift 0) and remove events to the previous
    trading date (shift -1) via ``get_trading_date_by_shift``.

    Returns
    -------
    pd.DataFrame
        Columns: DATE_FIELD_NAME, CHANGE_TYPE_FIELD, SYMBOL_FIELD_NAME.
    """
    logger.info(f"get sp500 history changes......")
    # NOTE: may update the index of the table
    raw_df = pd.read_html(self.WIKISP500_CHANGES_URL)[-1].iloc[:, [0, 1, 3]]
    raw_df.columns = [self.DATE_FIELD_NAME, self.ADD, self.REMOVE]
    raw_df[self.DATE_FIELD_NAME] = pd.to_datetime(raw_df[self.DATE_FIELD_NAME])
    # Additions take effect on the trading day; removals on the prior one.
    shift_by_type = {self.ADD: 0, self.REMOVE: -1}
    frames = []
    for change_type, shift in shift_by_type.items():
        part = raw_df.copy()
        part[self.CHANGE_TYPE_FIELD] = change_type
        part[self.SYMBOL_FIELD_NAME] = part[change_type]
        part.dropna(subset=[self.SYMBOL_FIELD_NAME], inplace=True)
        # Bind `shift` as a default so each lambda keeps its own value.
        part[self.DATE_FIELD_NAME] = part[self.DATE_FIELD_NAME].apply(
            lambda x, s=shift: get_trading_date_by_shift(self.calendar_list, x, s)
        )
        frames.append(part[[self.DATE_FIELD_NAME, self.CHANGE_TYPE_FIELD, self.SYMBOL_FIELD_NAME]])
    logger.info(f"end of get sp500 history changes.")
    return pd.concat(frames, sort=False)
def get_changes_with_history_companies(
        self, history_companies: pd.DataFrame) -> pd.DataFrame:
    """Derive add/remove change events by diffing daily constituent snapshots.

    Iterates the snapshot dates from newest to oldest.  For each adjacent
    pair of snapshots, symbols present only in the newer snapshot are
    recorded as ``add`` events dated one trading day after the older
    snapshot, and symbols present only in the older snapshot are recorded
    as ``remove`` events dated on the older snapshot's trading day.

    Parameters
    ----------
    history_companies : pd.DataFrame
        symbol        date
        SH600000  2020-11-11

        dtypes:
            symbol: str
            date: pd.Timestamp

    Return
    --------
        pd.DataFrame:
            symbol      date        type
            SH600000  2019-11-11    add
            SH600000  2020-11-10    remove
        dtypes:
            symbol: str
            date: pd.Timestamp
            type: str, value from ["add", "remove"]
    """
    logger.info("parse changes from history companies......")
    columns = [self.DATE_FIELD_NAME, self.SYMBOL_FIELD_NAME, self.CHANGE_TYPE_FIELD]
    frames = []
    newer_codes = []
    dates_newest_first = sorted(
        history_companies[self.DATE_FIELD_NAME].unique(), reverse=True)
    for trading_date in tqdm(dates_newest_first):
        day_mask = history_companies[self.DATE_FIELD_NAME] == trading_date
        current_codes = history_companies[day_mask][self.SYMBOL_FIELD_NAME].tolist()
        if newer_codes:
            # In the newer snapshot but not this one -> added after this date;
            # in this snapshot but not the newer one -> removed after this date.
            added = list(set(newer_codes) - set(current_codes))
            removed = list(set(current_codes) - set(newer_codes))
            for shift, change_type, codes in ((1, self.ADD, added),
                                              (0, self.REMOVE, removed)):
                for code in codes:
                    frames.append(
                        pd.DataFrame(
                            [[
                                get_trading_date_by_shift(
                                    self.calendar_list, trading_date, shift),
                                code,
                                change_type,
                            ]],
                            columns=columns,
                        ))
        newer_codes = current_codes
    df = pd.concat(frames)
    logger.info("end of parse changes from history companies.")
    return df
def _read_change_from_url(self, url: str) -> pd.DataFrame:
    """Parse one CSIndex change announcement into add/remove records.

    Announcements whose title does not start with "关于" or does not
    mention "沪深300" are skipped (an empty frame is returned).  The add
    date is parsed from the announcement body — shifted one trading day
    forward for after-market notices — and the remove date is the trading
    day before it.  Data is taken from the attached/linked excel file when
    one exists, falling back to the table embedded in the body text.

    Parameters
    ----------
    url : str
        change url

    Returns
    -------
        pd.DataFrame:
            symbol      date        type
            SH600000  2019-11-11    add
            SH600000  2020-11-10    remove
        dtypes:
            symbol: str
            date: pd.Timestamp
            type: str, value from ["add", "remove"]
    """
    resp = retry_request(url).json()["data"]
    title = resp["title"]
    # Only CSI300 adjustment notices are relevant; skip everything else.
    if not title.startswith("关于") or "沪深300" not in title:
        return pd.DataFrame()
    logger.info(
        f"load index data from https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
    )
    body = resp["content"]
    full_dates = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", body)
    if len(full_dates) >= 2:
        add_date = pd.Timestamp("-".join(full_dates[0]))
    else:
        # No full date available: fall back to year/month and snap to a
        # trading day on the calendar.
        year_month = re.findall(r"(\d{4}).*?年.*?(\d+).*?月", body)[0]
        add_date = get_trading_date_by_shift(
            self.calendar_list, pd.Timestamp("-".join(year_month)), shift=0)
    if "盘后" in body or "市后" in body:
        # After-market notice: the change takes effect the next trading day.
        add_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=1)
    remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1)

    # Resolve the excel location: prefer the enclosure list, otherwise look
    # for an xls link inside the body and absolutize it if necessary.
    excel_url = None
    if resp.get("enclosureList", []):
        excel_url = resp["enclosureList"][0]["fileUrl"]
    else:
        href_matches = re.findall('.*href="(.*?xls.*?)".*', body)
        if href_matches:
            excel_url = href_matches[0]
            if not excel_url.startswith("http"):
                excel_url = excel_url if excel_url.startswith("/") else "/" + excel_url
                excel_url = f"http://www.csindex.com.cn{excel_url}"

    if not excel_url:
        logger.info(f"get {add_date} changes from url content, title={title}")
        return self._parse_table(body, add_date, remove_date)
    logger.info(
        f"get {add_date} changes from excel, title={title}, excel_url={excel_url}"
    )
    try:
        return self._parse_excel(excel_url, add_date, remove_date)
    except ValueError:
        logger.warning(
            f"error downloading file: {excel_url}, will parse the table from the content"
        )
        return self._parse_table(body, add_date, remove_date)
def _read_change_from_url(self, url: str) -> pd.DataFrame:
    """read change from url

    The parameter url is from the _get_change_notices_url method.
    Determine the stock add_date/remove_date based on the title.
    The response contains three cases:
        1.Only excel_url(extract data from excel_url)
        2.Both the excel_url and the body text(try to extract data from excel_url first, and then try to extract data from body text)
        3.Only body text(extract data from body text)

    Parameters
    ----------
    url : str
        change url

    Returns
    -------
        pd.DataFrame:
            symbol      date        type
            SH600000  2019-11-11    add
            SH600000  2020-11-10    remove
        dtypes:
            symbol: str
            date: pd.Timestamp
            type: str, value from ["add", "remove"]
    """
    resp = retry_request(url).json()["data"]
    title = resp["title"]
    # Skip announcements that are not adjustment notices ("关于" = "About/Regarding").
    if not title.startswith("关于"):
        return pd.DataFrame()
    # Skip announcements not about the CSI300 index ("沪深300").
    if "沪深300" not in title:
        return pd.DataFrame()
    logger.info(
        f"load index data from https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
    )
    _text = resp["content"]
    # Extract full YYYY年MM月DD日 dates from the body text.
    date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
    if len(date_list) >= 2:
        # First full date found is taken as the effective add date.
        add_date = pd.Timestamp("-".join(date_list[0]))
    else:
        # No full date: fall back to year/month only and snap to a trading day.
        _date = pd.Timestamp("-".join(
            re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
        add_date = get_trading_date_by_shift(self.calendar_list, _date, shift=0)
    # "盘后"/"市后" (after market close): change takes effect the next trading day.
    if "盘后" in _text or "市后" in _text:
        add_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=1)
    # Removals are dated the trading day before the additions take effect.
    remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1)
    # Locate the excel attachment: prefer the enclosure list, else scan the
    # body for an xls link and absolutize it against www.csindex.com.cn.
    excel_url = None
    if resp.get("enclosureList", []):
        excel_url = resp["enclosureList"][0]["fileUrl"]
    else:
        excel_url_list = re.findall('.*href="(.*?xls.*?)".*', _text)
        if excel_url_list:
            excel_url = excel_url_list[0]
            if not excel_url.startswith("http"):
                excel_url = excel_url if excel_url.startswith(
                    "/") else "/" + excel_url
                excel_url = f"http://www.csindex.com.cn{excel_url}"
    if excel_url:
        try:
            logger.info(
                f"get {add_date} changes from the excel, title={title}, excel_url={excel_url}"
            )
            df = self._parse_excel(excel_url, add_date, remove_date)
        except ValueError:
            # Excel download/parse failed; fall back to the embedded table.
            logger.info(
                f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
            )
            df = self._parse_table(_text, add_date, remove_date)
    else:
        # Case 3: no excel at all — parse the table from the body text.
        logger.info(
            f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
        )
        df = self._parse_table(_text, add_date, remove_date)
    return df
def _read_change_from_url(self, url: str) -> pd.DataFrame:
    """read change from url

    Parameters
    ----------
    url : str
        change url

    Returns
    -------
        pd.DataFrame:
            symbol      date        type
            SH600000  2019-11-11    add
            SH600000  2020-11-10    remove
        dtypes:
            symbol: str
            date: pd.Timestamp
            type: str, value from ["add", "remove"]

    NOTE(review): despite the ``-> pd.DataFrame`` annotation, this can
    return ``None`` when the excel path fails AND no 4-column HTML table at
    ``self.html_table_index`` is found — callers should be checked.
    """
    resp = retry_request(url)
    _text = resp.text
    # Extract full YYYY年MM月DD日 dates from the page text.
    date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
    if len(date_list) >= 2:
        add_date = pd.Timestamp("-".join(date_list[0]))
    else:
        # No full date: fall back to year/month only and snap to a trading day.
        _date = pd.Timestamp("-".join(
            re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
        add_date = get_trading_date_by_shift(self.calendar_list, _date, shift=0)
    # Removals are dated the trading day before the additions take effect.
    remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1)
    logger.info(f"get {add_date} changes")
    try:
        # Preferred path: download the linked xls, cache it, and read the
        # "调入" (add) / "调出" (remove) sheets for this index code.
        excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
        content = retry_request(f"http://www.csindex.com.cn{excel_url}",
                                exclude_status=[404]).content
        _io = BytesIO(content)
        # sheet_name=None -> dict of {sheet name: DataFrame}.
        df_map = pd.read_excel(_io, sheet_name=None)
        # Cache the raw excel next to the other change files.
        with self.cache_dir.joinpath(
            f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.{excel_url.split('.')[-1]}"
        ).open("wb") as fp:
            fp.write(content)
        tmp = []
        for _s_name, _type, _date in [("调入", self.ADD, add_date),
                                      ("调出", self.REMOVE, remove_date)]:
            _df = df_map[_s_name]
            # Keep only rows for this index ("指数代码" = index code) and the
            # security-code column ("证券代码").
            _df = _df.loc[_df["指数代码"] == self.index_code, ["证券代码"]]
            _df = _df.applymap(self.normalize_symbol)
            _df.columns = [self.SYMBOL_FIELD_NAME]
            _df["type"] = _type
            _df[self.DATE_FIELD_NAME] = _date
            tmp.append(_df)
        df = pd.concat(tmp)
    except Exception as e:
        # NOTE(review): deliberately broad — any failure in the excel path
        # (no link, download error, unexpected sheet layout) falls back to
        # scraping the HTML tables on the page; `e` is swallowed unlogged.
        df = None
        _tmp_count = 0
        for _df in pd.read_html(resp.content):
            # Only 4-column tables are change tables; take the
            # (html_table_index+1)-th such table.
            if _df.shape[-1] != 4:
                continue
            _tmp_count += 1
            if self.html_table_index + 1 > _tmp_count:
                continue
            tmp = []
            # Presumably column 0 holds removed symbols and column 2 added
            # ones, with two header rows skipped — TODO confirm layout.
            for _s, _type, _date in [
                (_df.iloc[2:, 0], self.REMOVE, remove_date),
                (_df.iloc[2:, 2], self.ADD, add_date),
            ]:
                _tmp_df = pd.DataFrame()
                _tmp_df[self.SYMBOL_FIELD_NAME] = _s.map(self.normalize_symbol)
                _tmp_df["type"] = _type
                _tmp_df[self.DATE_FIELD_NAME] = _date
                tmp.append(_tmp_df)
            df = pd.concat(tmp)
            # Cache the parsed result as CSV.
            df.to_csv(
                str(
                    self.cache_dir.joinpath(
                        f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.csv"
                    ).resolve()))
            break
    return df