def report_find_all(report: Report, query: dict, fs_tp: Tuple[str], separate: bool) -> Tuple[int, Dict[str, Dict]]:
    """Search the report's pages and extract financial statement tables.

    Pages matching ``query`` are visited in the order returned by
    ``report.find_all``; the search stops at the first page on which at
    least one statement table is found.

    Parameters
    ----------
    report: Report
        Report to search
    query: dict
        Search conditions, forwarded as keyword arguments to
        ``report.find_all``
    fs_tp: tuple of str
        Financial statement types to look for
    separate: bool
        Whether to look for separate financial statements

    Returns
    -------
    tuple of (int, dict or None)
        Number of statement types whose ``'table'`` entry is not None, and
        the result of ``search_fs_table`` for the last page inspected
        (None when no page was inspected at all)
    """
    matches = report.find_all(**query)
    found = 0
    tables_by_type = None

    for group in matches:
        for page in matches[group]:
            # Non-breaking spaces are replaced by plain spaces before parsing.
            markup = page.html.replace(u'\xa0', ' ')
            candidates = BeautifulSoup(markup, 'html.parser').find_all(
                'table', border='1')
            tables_by_type = search_fs_table(
                tables=candidates, fs_tp=fs_tp, separate=separate)
            found = sum(
                1 for tp in tables_by_type
                if tables_by_type[tp]['table'] is not None)
            if found > 0:
                # First page with any table ends the whole search.
                return found, tables_by_type

    # Nothing matched: report the state left by the last inspected page.
    return found, tables_by_type
def __init__(self, resp):
    """Populate paging metadata and the report list from a response.

    Parameters
    ----------
    resp: dict
        Response mapping; must provide the keys 'page_no', 'page_count',
        'total_count', 'total_page' and 'list'. Every element of 'list'
        is unpacked as keyword arguments into ``Report``.
    """
    # Paging fields are copied over verbatim.
    self._page_no = resp['page_no']
    self._page_count = resp['page_count']
    self._total_count = resp['total_count']
    self._total_page = resp['total_page']

    # Each raw entry becomes a Report instance.
    entries = resp['list']
    self._report_list = [Report(**entry) for entry in entries]
def _build_label_frame(df: DataFrame):
    """Create the initial label DataFrame for one financial statement table.

    Returns None when ``df`` has no 'label_ko' column, which the caller
    treats as a malformed table.
    """
    concept_column = find_all_columns(df, r'concept_id')
    ko_column = find_all_columns(df, r'label_ko')
    if len(ko_column) == 0:
        return None
    ko_column = ko_column[0]
    date_columns = find_all_columns(df, r'\d{8}')
    has_concept = len(concept_column) == 1

    # Column layout: optional ('default', 'concept_id') followed by every
    # date column of the statement.
    label_columns = []
    if has_concept:
        label_columns.append(('default', 'concept_id'))
    label_columns.extend(date_columns)

    label_frame = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(label_columns))
    if has_concept:
        label_frame[label_columns[0]] = [
            extract_account_title(x) for x in list(df[concept_column[0]])
        ]
    # Every existing date column starts out labelled with the Korean labels.
    ko_labels = list(df[ko_column])
    for column in date_columns:
        label_frame[column] = ko_labels
    return label_frame


def merge_fs(fs_df: Dict[str, DataFrame],
             label_df: Dict[str, DataFrame],
             report: Report,
             fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
             lang: str = 'ko',
             separate: bool = False) -> Tuple[Dict[str, DataFrame], Dict[str, DataFrame]]:
    """Merge the financial statements found in a report into existing ones.

    The statements are extracted from the report with ``analyze_html``.
    For every statement type, date columns that exist only in the newly
    extracted table are appended to the existing DataFrame, together with
    the labels that were matched for them.

    ``fs_df`` and ``label_df`` are modified in place and also returned.

    Parameters
    ----------
    fs_df: dict of {str: DataFrame}
        Financial statements collected so far
    label_df: dict of {str: DataFrame}
        Labels of the values extracted so far; when None, a new dict with
        one None entry per type in ``fs_tp`` is created
    report: Report
        Report to extract additional data from
    fs_tp: tuple of str, optional
        'bs' balance sheet, 'is' income statement,
        'cis' comprehensive income statement, 'cf' cash flow statement
    lang: str, optional
        'ko' Korean, 'en' English
    separate: bool, optional
        Whether to use separate financial statements

    Returns
    -------
    tuple of dict of {str: DataFrame}
        Financial statements and the extracted labels

    Raises
    ------
    RuntimeError
        If any exception occurs while fetching or analyzing the report;
        the original exception is attached as ``__cause__``.
    """
    # Compiled once instead of once per statement type.
    date_regex = re.compile(r'\d{8}')
    try:
        # Extract the financial statements from the report's web pages.
        nfs_df = analyze_html(report=report, fs_tp=fs_tp, lang=lang,
                              separate=separate)

        if label_df is None:
            label_df = {tp: None for tp in fs_tp}

        for tp in fs_df:
            if tp not in fs_tp:
                continue

            df = fs_df[tp]     # statement being extended
            ndf = nfs_df[tp]   # statement extracted from this report

            # Nothing was found in this report for this type.
            if ndf is None:
                continue

            # No statement yet: seed it with a copy of the new one.
            if df is None:
                fs_df[tp] = ndf.copy(deep=True)
                df = fs_df[tp]

            # Initialize the labels on first use.
            if label_df.get(tp) is None:
                label_frame = _build_label_frame(df)
                if label_frame is None:
                    # No 'label_ko' column means the table is broken.
                    fs_df[tp] = None
                    continue
                label_df[tp] = label_frame

            df_columns = set(df.columns.tolist())
            ndf_columns = set(ndf.columns.tolist())
            overlap = df_columns.intersection(ndf_columns)

            # Date columns present only in the new statement, newest first.
            diff = [
                x for x in (ndf_columns - overlap)
                if date_regex.search(x[0])
            ]
            diff.sort(key=lambda x: date_regex.findall(x[0])[0],
                      reverse=True)

            # Same data on both sides: nothing to merge.
            if not diff:
                continue

            for column in diff:
                ndata = [None] * len(df)
                nlabels = [''] * len(df)
                if overlap:
                    ndata, nlabels = compare_df_and_ndf_value(
                        column, df, ndf, ndata, nlabels)

                for compare_func in additional_comparison_function:
                    ndata, nlabels = compare_func(
                        column, df, ndf, label_df[tp], ndata, nlabels)

                label_df[tp][column] = nlabels
                fs_df[tp][column] = ndata
        return fs_df, label_df
    except Exception as err:
        msg = 'An error occurred while fetching or analyzing {}.'.format(
            report.to_dict())
        # Chain explicitly so the root cause stays attached to the error.
        raise RuntimeError(msg) from err