def search_fs_table(tables: List, fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
                    separate: bool = False) -> Dict[str, dict]:
    """Locate the financial-statement tables among the tables of a page.

    Parameters
    ----------
    tables: list of ResultSet
        Every table found inside the page.
    fs_tp: tuple of str
        Statement types to look for: 'bs' statement of financial position,
        'is' income statement, 'cis' statement of comprehensive income,
        'cf' statement of cash flows.
    separate: bool
        True to look for the separate statements; False to require the
        consolidated keyword in the title.

    Returns
    -------
    dict of {str : dict}
        One entry per requested statement type, each holding the 'title',
        'header' and 'table' returned by ``seek_table``.
    """
    # Title keywords per statement type.
    # They are searched in this order (do not reorder).
    title_queries = {
        'bs': str_insert_whitespace('재무상태표') + ' OR ' + str_insert_whitespace('대차대조표'),
        'is': str_insert_whitespace('손익계산서'),
        'cis': str_insert_whitespace('포괄손익계산서'),
        'cf': str_insert_whitespace('현금흐름표'),
    }

    found = OrderedDict()
    for fs_key, title_query in title_queries.items():
        if fs_key not in fs_tp:
            continue

        # The consolidated keyword is required for consolidated statements
        # and excluded for separate ones.
        consolidated_kw = str_insert_whitespace('연결')
        if separate:
            include_query = title_query
            exclude_query = consolidated_kw
        else:
            include_query = title_query + ' AND ' + consolidated_kw
            exclude_query = None

        # The income statement must not pick up the comprehensive income statement.
        if fs_key == 'is':
            comprehensive_kw = str_insert_whitespace('포괄')
            if exclude_query:
                exclude_query = exclude_query + ' OR ' + comprehensive_kw
            else:
                exclude_query = comprehensive_kw

        exclude_regex = str_to_regex(exclude_query) if exclude_query else exclude_query
        include_regex = str_to_regex(include_query)

        title, header, tb = seek_table(tables=tables, includes=include_regex,
                                       excludes=exclude_regex)
        found[fs_key] = {'title': title, 'header': header, 'table': tb}
    return found
def find_all_columns(df: DataFrame, query: str) -> pd.Index:
    """Search the columns of a DataFrame with a query string.

    Parameters
    ----------
    df: DataFrame
        DataFrame whose columns are searched. Each column is iterated level
        by level, so multi-level (tuple) columns are expected.
    query: str
        Search expression, compiled with ``str_to_regex``.

    Returns
    -------
    pd.MultiIndex or list
        MultiIndex of the matching columns, each listed once, in column
        order. An empty list when ``df`` is None or nothing matches.
    """
    # Nothing to search: return before compiling the query.
    if df is None:
        return []

    regex = str_to_regex(query)
    matched = []
    for column in df.columns.tolist():
        for level in column:
            if isinstance(level, str):
                # A string level is tried as-is and then with its characters
                # space-separated (kept for backward compatibility).
                candidates = (level, ' '.join(level))
            else:
                try:
                    candidates = (' '.join(level),)
                except TypeError:
                    # Level is not text (e.g. a number or NaN): cannot match.
                    continue
            if any(regex.search(text) for text in candidates):
                matched.append(column)
                # One hit is enough; avoids listing the same column twice
                # when several of its levels match.
                break

    if matched:
        return pd.MultiIndex.from_tuples(matched)
    return matched
def convert_tbody_to_dataframe(columns: list, fs_table: dict):
    """Convert the tbody of an HTML financial-statement table into a DataFrame.

    Parameters
    ----------
    columns: list
        One column definition per physical table column. Each is turned into
        a key with ``tuple()``; definitions that repeat are merged into a
        single DataFrame column.
    fs_table: dict
        Must provide 'table' (its ``tbody`` is walked row by row) and
        'header' (passed to ``extract_unit_from_header``).

    Returns
    -------
    DataFrame
        One row per ``<tr>`` of the tbody. Cells missing from a short row
        are stored as None.
    """
    # Map every distinct column key to the cell positions that feed it.
    column_matrix = OrderedDict()
    for idx, column in enumerate(columns):
        column_matrix.setdefault(tuple(column), []).append(idx)
    deduplicated = list(column_matrix)

    df_columns = pd.MultiIndex.from_tuples(deduplicated)
    df = pd.DataFrame(columns=df_columns)
    ordered_keys = df_columns.tolist()

    tbody = fs_table['table'].tbody
    regex = str_to_regex('label_ko OR comment')
    str_unit = extract_unit_from_header(fs_table['header'])
    unit = str_unit_to_number_unit(str_unit)
    # A unit written inside a row label, e.g. "(단위:원)".
    unit_regex = re.compile(r'\(단위\s*?:\s*([a-zA-Zㄱ-힣])\)')
    # Whitespace and runs of '=' are stripped from every cell.
    noise_regex = re.compile(r'\s+|=+')

    for idx, tr in enumerate(tbody.find_all('tr')):
        extracted = [noise_regex.sub('', td.text) for td in tr.find_all('td')]
        row = {key: 0 for key in deduplicated}
        for key, index_list in column_matrix.items():
            for index in index_list:
                if len(extracted) <= index:
                    # The row has fewer cells than the table has columns.
                    row[key] = None
                elif isinstance(key[1], str):
                    row[key] = extracted[index]
                elif regex.search(' '.join(key[1])):
                    row[key] = extracted[index]
                else:
                    # Numeric column: cells sharing the same key are summed.
                    row[key] += str_to_float(extracted[index], unit)
            if isinstance(row[key], float):
                if abs(row[key]) < 1e-10:
                    # Effectively zero: shown as an empty cell.
                    row[key] = ''
                else:
                    row[key] = row[key] * unit

        ordered_list = [row.get(column, None) for column in ordered_keys]

        # The first column holds the row label. It is None for a row without
        # enough cells, and searching None would raise TypeError.
        row_label = ordered_list[0]
        row_unit = unit_regex.search(row_label) if isinstance(row_label, str) else None
        if row_unit:
            # The row states its own unit: undo the table unit, apply the row's.
            row_unit = str_unit_to_number_unit(row_unit.group(1))
            for jdx, value in enumerate(ordered_list):
                # Text cells and missing cells have nothing to rescale.
                if value is None or isinstance(value, str):
                    continue
                ordered_list[jdx] = value / unit * row_unit

        df.loc[idx] = ordered_list
    return df
def to_DataFrame(self, cls=None, lang='ko', start_dt=None, end_dt=None,
                 label=None, show_abstract=False, show_class=True, show_depth=10,
                 show_concept=True, separator=True):
    """Convert this table into a pandas DataFrame.

    Note: this sets the global ``pd.options.display.float_format`` option.

    Parameters
    ----------
    cls: dict, optional
        Classification. When None it is obtained from
        ``self.cls_filter(start_dt, end_dt, label)``.
    lang: str, optional
        'ko' for Korean or 'en' for English.
    start_dt: str, optional
        Search start date.
    end_dt: str, optional
        Search end date.
    label: str, optional
        Word that the column label must contain.
    show_abstract: bool, optional
        Whether to show abstract items.
    show_class: bool, optional
        Whether to show the class columns.
    show_depth: int, optional
        Upper bound on the class depth shown.
    show_concept: bool, optional
        Whether to show the concept_id column.
    separator: bool, optional
        Whether floats are displayed with thousands separators.

    Returns
    -------
    DataFrame
        Financial statement DataFrame.
    """
    if cls is None:
        cls = self.cls_filter(start_dt, end_dt, label)
    cls = cls_merge_type(cls)

    # Cap the label depth at show_depth.
    depth = get_max_depth(self.labels, show_abstract=show_abstract)
    if not depth < show_depth:
        depth = show_depth

    # The reporting currency is read from table 'd999004'.
    currency_table = self.parent.get_table_by_code('d999004')
    unit = get_value_from_dataset(
        currency_table.cls, currency_table.dataset,
        'dart-gcd_EntityReportingCurrencyISOCode')
    definition = self.definition + ' (Unit: {})'.format(unit[0])

    columns = generate_df_columns(definition, cls, depth, lang,
                                  show_concept=show_concept, show_class=show_class)

    pd.options.display.float_format = ('{:,}' if separator else '{:}').format

    df = pd.DataFrame(columns=columns)
    rows = generate_df_rows(self.labels, cls, self.dataset, depth, lang=lang,
                            show_abstract=show_abstract, show_concept=show_concept,
                            show_class=show_class)
    for position, values in enumerate(flatten(rows)):
        df.loc[position] = values

    # Drop columns holding at most one value, except those whose second
    # level matches the descriptive names below.
    regex_pass = str_to_regex(
        'concept_id OR label_ko OR label_en OR class')
    drop_columns = [
        key for key, count in df.count().items()
        if not regex_pass.search(' '.join(key[1])) and count <= 1
    ]
    df = df.drop(drop_columns, axis=1)
    return df
def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Derive the DataFrame column definitions from the thead of a table.

    When the table has no ``<thead>``, the first ``<tr>`` is extracted from
    its tbody (mutating ``fs_table['table']``) and used as the header.

    Parameters
    ----------
    fs_tp: str
        Statement type: 'bs', 'is', 'cis' or 'cf'.
    fs_table: dict
        Must provide 'table' (the parsed table) and 'header' (passed to
        ``extract_date_from_header`` and ``extract_unit_from_header``).
    separate: bool
        Selects the separate (True) or consolidated (False) label.
    lang: str
        'ko' or 'en'; language of that label.

    Returns
    -------
    list of list
        One ``[first, second]`` pair per physical table column. ``first`` is
        the top header cell (statement title or date string); ``second`` is
        'label_ko', 'comment', or a tuple of sub-labels.
    """
    def column_ko_to_en(ko):
        """Translate a known Korean header word; anything else passes through."""
        return {'과목': 'label_ko', '주석': 'comment'}.get(ko) or ko

    thead = fs_table['table'].thead
    if thead is None:
        # No <thead>: promote the first body row to a synthetic header.
        first_row = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(first_row)
        for td in thead.tr.find_all('td'):
            td.name = 'th'

    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    date_info = extract_date_from_header(fs_table['header'])

    # Regular expression for the title columns
    regex = str_to_regex('과목 OR 주석')

    fs_string = {
        'bs': 'Statement of financial position',
        'is': 'Income statement',
        'cis': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }
    str_unit = get_currency_str(extract_unit_from_header(fs_table['header']))
    if str_unit:
        fs_string = {name: title + '(Unit: {})'.format(str_unit)
                     for name, title in fs_string.items()}

    label = {
        'ko': {True: '별도재무제표', False: '연결재무제표'},
        'en': {True: 'Separate', False: 'Consolidated'}
    }

    header_rows = thead.find_all('tr')
    # Maximum number of columns
    col_length = sum(th_colspan_list)
    # Maximum number of rows (always at least two)
    row_length = len(header_rows)
    if row_length == 1:
        row_length += 1

    # Matrix used to resolve rowspan / colspan
    columns_matrix = [[None] * col_length for _ in range(row_length)]
    for idx, tr in enumerate(header_rows):
        # Start at the first slot not already filled by a rowspan from above.
        start_idx = next(
            (pos for pos, cell in enumerate(columns_matrix[idx]) if cell is None), 0)

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    text = '과목'
                elif regex.search(text) is None:
                    # Non-title cells of the first row take the next header date.
                    if len(date_info) > 0:
                        date_list = date_info.pop(0)
                    else:
                        import warnings
                        date = '-'.join([day.strftime('%Y%m%d') for day in date_list])
                        warnings_text = "Date data length does not match table header."\
                                        + "So last date was set using last data({}). ".format(date)
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = '-'.join([day.strftime('%Y%m%d') for day in date_list])

            is_title = regex.search(text)
            if is_title:
                # Title cells always occupy two rows.
                row_span = 2

            for mdx in range(row_span):
                for ndx in range(col_span):
                    # The top row of a title cell carries the statement name.
                    cell_text = fs_string[fs_tp] if (mdx == 0 and is_title) else text
                    columns_matrix[idx + mdx][start_idx + ndx] = cell_text
            # NOTE(review): relies on ndx left over from the loop above, so it
            # assumes rowspan >= 1 and colspan >= 1.
            start_idx = start_idx + ndx + 1

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []
    for stacked in zip(*columns_matrix):
        head = stacked[0]
        sec_item = []
        for depth, item in enumerate(stacked[1:], start=1):
            if depth == 1 and (item is None or regex.search(item) is None):
                sec_item.append(label[lang][separate])

            if item is None or str_compare(head, item):
                continue

            if regex_3month.search(item):
                # Extract the date info from the top header cell
                period = [datetime.strptime(part, '%Y%m%d') for part in head.split('-')]
                # Calculate start_dt: first day of the month, three months
                # before the end date
                end_dt = period[1]
                start_dt = (end_dt - relativedelta(months=3)).replace(day=1)
                head = '-'.join([day.strftime('%Y%m%d') for day in [start_dt, end_dt]])
            elif regex_total.search(item):
                continue
            else:
                sec_item.append(column_ko_to_en(item))

        if sec_item[0] in ['label_ko', 'comment']:
            second = sec_item[0]
        else:
            second = tuple(sec_item)
        columns.append([head, second])
    return columns