def search_fs_table(tables: List, fs_tp: Tuple[str, ...] = ('bs', 'is', 'cis', 'cf'),
                    separate: bool = False) -> Dict[str, dict]:
    """Search the tables of a page for financial statement tables.

    Parameters
    ----------
    tables: list of ResultSet
        All tables found inside the page.
    fs_tp: tuple of str
        Statement types to search for: 'bs' statement of financial position,
        'is' income statement, 'cis' statement of comprehensive income,
        'cf' statement of cash flows.
    separate: bool
        Whether to search for separate (non-consolidated) statements.

    Returns
    -------
    dict of {str : dict}
        One entry per requested statement type, in search order. Each value
        holds the 'title', 'header' and 'table' returned by ``seek_table``.
    """
    # Statements are searched in this fixed order (do not reorder); a tuple of
    # pairs makes the order explicit instead of relying on dict ordering.
    queries = (
        ('bs', str_insert_whitespace('재무상태표') + ' OR ' + str_insert_whitespace('대차대조표')),
        ('is', str_insert_whitespace('손익계산서')),
        ('cis', str_insert_whitespace('포괄손익계산서')),
        ('cf', str_insert_whitespace('현금흐름표')),
    )
    consolidated = str_insert_whitespace('연결')
    comprehensive = str_insert_whitespace('포괄')

    fs_table = OrderedDict()
    for key, query in queries:
        if key not in fs_tp:
            continue

        exclude_terms = []
        if separate:
            # Separate statements: titles containing the consolidated keyword
            # are rejected.
            exclude_terms.append(consolidated)
        else:
            # Consolidated statements: the keyword must appear in the title.
            query = query + ' AND ' + consolidated

        if key == 'is':
            # The income statement query is a substring of the comprehensive
            # income statement query, so the latter is excluded explicitly.
            exclude_terms.append(comprehensive)

        excludes = str_to_regex(' OR '.join(exclude_terms)) if exclude_terms else None
        regex = str_to_regex(query)
        title, header, tb = seek_table(tables=tables, includes=regex, excludes=excludes)
        fs_table[key] = {'title': title, 'header': header, 'table': tb}
    return fs_table
def get_currency_str(unit: str) -> Union[str, None]:
    """Look up a currency string for the given unit text.

    The text is first searched for any key of ``CURRENCY``; on a hit the value
    mapped to the matched key is returned. Otherwise the text is searched for
    any value of ``CURRENCY`` and the matched text itself is returned.

    Parameters
    ----------
    unit: str
        Text to search.

    Returns
    -------
    str or None
        The resolved string, or None when neither search matches.
    """
    key_match = str_to_regex(' OR '.join(CURRENCY)).search(unit)
    if key_match:
        return CURRENCY[key_match.group(0)]

    value_match = str_to_regex(' OR '.join(CURRENCY.values())).search(unit)
    return value_match.group(0) if value_match else None
def find_all_columns(df: DataFrame, query: str) -> list:
    """Find the columns of a DataFrame whose labels match a query.

    Parameters
    ----------
    df: DataFrame
        DataFrame to search; ``None`` yields an empty result.
    query: str
        Search expression, compiled with ``str_to_regex``.

    Returns
    -------
    list
        Matching column labels in column order, each listed at most once.
    """
    if df is None:
        return []
    regex = str_to_regex(query)

    results = []
    for column in df.columns.tolist():
        # MultiIndex labels are tuples; treat a flat label as a single level
        # rather than iterating over its characters.
        levels = column if isinstance(column, tuple) else (column,)
        for item in levels:
            if isinstance(item, str):
                text = item
            elif isinstance(item, (tuple, list)):
                # Nested level (e.g. a tuple of strings): search the joined text.
                text = ' '.join(str(part) for part in item)
            else:
                # Non-text level values cannot be searched.
                continue
            if regex.search(text):
                results.append(column)
                # One hit is enough; avoids listing the same column twice.
                break
    return results
def convert_tbody_to_dataframe(columns: list, fs_table: dict):
    """Convert the tbody of an HTML table into a DataFrame.

    Parameters
    ----------
    columns: list
        One label per cell position. Each label is turned into a tuple and
        used as a MultiIndex entry; positions sharing a label are merged into
        a single DataFrame column.
    fs_table: dict
        Needs 'table' (an object exposing ``tbody``) and 'header' (passed to
        ``extract_unit_from_header``).

    Returns
    -------
    DataFrame
        One row per ``tr`` of the tbody.
    """
    # Map every distinct label to the cell positions that carry it.
    column_matrix = OrderedDict()
    for idx, column in enumerate(columns):
        column_matrix.setdefault(tuple(column), []).append(idx)
    deduplicated = list(column_matrix)

    df_columns = pd.MultiIndex.from_tuples(deduplicated)
    df = pd.DataFrame(columns=df_columns)
    ordered_columns = df_columns.tolist()

    tbody = fs_table['table'].tbody
    regex = str_to_regex('label_ko OR comment')
    str_unit = extract_unit_from_header(fs_table['header'])
    unit = str_unit_to_number_unit(str_unit)
    # NOTE(review): the capture group accepts exactly one character, so
    # multi-character unit names inside "(단위: ...)" never match -- confirm
    # whether that is intended.
    unit_regex = re.compile(r'\(단위\s*?:\s*([a-zA-Zㄱ-힣])\)')

    for idx, tr in enumerate(tbody.find_all('tr')):
        # Cell texts with whitespace and '=' runs removed.
        extracted = [re.sub(r'\s+|=+', '', td.text) for td in tr.find_all('td')]

        row = {key: 0 for key in deduplicated}
        for key, index_list in column_matrix.items():
            for index in index_list:
                if len(extracted) <= index:
                    # The row has fewer cells than the header.
                    row[key] = None
                elif isinstance(key[1], str) or regex.search(' '.join(key[1])):
                    # Text columns keep the raw cell text.
                    row[key] = extracted[index]
                else:
                    # Numeric columns: cells sharing a label are summed.
                    row[key] += str_to_float(extracted[index], unit)
            if isinstance(row[key], float):
                if abs(row[key]) < 1e-10:
                    # Values that are effectively zero are stored as ''.
                    row[key] = ''
                else:
                    row[key] = row[key] * unit

        ordered_list = [row.get(column, None) for column in ordered_columns]

        # A row may declare its own unit in the first cell. Only search when
        # that cell is text (it is None for a row without enough cells).
        first_cell = ordered_list[0]
        row_unit = unit_regex.search(first_cell) if isinstance(first_cell, str) else None
        if row_unit:
            row_unit = str_unit_to_number_unit(row_unit.group(1))
            for jdx, value in enumerate(ordered_list):
                # Text and missing cells carry no number to rescale.
                if value is None or isinstance(value, str):
                    continue
                ordered_list[jdx] = value / unit * row_unit

        df.loc[idx] = ordered_list
    return df
def cls_label_check(cls, query):
    """Check whether the labels of a classification match a query.

    Parameters
    ----------
    cls: cls
        Classification; ``cls['label']`` maps each qname to a dict with
        'ko' and 'en' label strings.
    query: str
        Search expression; ``None`` matches everything.

    Returns
    -------
    bool
        True when the query is None or matches the concatenated labels,
        False otherwise.
    """
    if query is None:
        return True

    pattern = str_to_regex(query)
    labels = cls['label']
    # All 'ko' and 'en' labels are concatenated, without separators, into one
    # string that is searched once.
    combined = ''.join(labels[qname]['ko'] + labels[qname]['en'] for qname in labels)
    return bool(pattern.search(combined))
def test_str_to_regex():
    """The query '삼성 OR ( 하이 AND 닉스)' is expected not to match '하이닉'."""
    pattern = str_to_regex(query='삼성 OR ( 하이 AND 닉스)')
    assert pattern.search('하이닉') is None
def test_str_to_regex_2():
    """The query '삼성 OR 하이' is expected to match '삼성' in '삼성이닉스'."""
    pattern = str_to_regex(query='삼성 OR 하이')
    matched = pattern.search('삼성이닉스')
    assert matched.group(0) == '삼성'
def determinant(value):
    """Decide whether ``value`` passes the include and exclude filters.

    ``includes`` and ``excludes`` are taken from the enclosing scope; a falsy
    filter is treated as satisfied. The result is falsy when the include
    filter does not match or the exclude filter does.
    """
    if includes:
        included = str_to_regex(includes).search(value)
    else:
        included = True

    if excludes:
        not_excluded = not str_to_regex(excludes).search(value)
    else:
        not_excluded = True

    return included and not_excluded
def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Extract DataFrame column labels from the thead of a table.

    Parameters
    ----------
    fs_tp: str
        Statement type; one of 'bs', 'is', 'cis', 'cf'.
    fs_table: dict
        Needs 'table' (an object exposing ``thead`` / ``tbody``) and 'header'
        (passed to ``extract_date_from_header`` and
        ``extract_unit_from_header``).
    separate: bool
        Selects the separate (True) or consolidated (False) label.
    lang: str
        Language of that label: 'ko' or 'en'.

    Returns
    -------
    list of list
        One ``[top, second]`` pair per table column. ``top`` is the statement
        title or a date string; ``second`` is 'label_ko', 'comment', or a
        tuple of sub-labels.
    """
    def column_ko_to_en(ko):
        ko_to_en = {'과목': 'label_ko', '주석': 'comment'}
        en = ko_to_en.get(ko)
        return en if en else ko

    thead = fs_table['table'].thead
    if thead is None:
        # No thead: promote the first tbody row to a header row.
        tt = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(tt)
        for td in thead.tr.find_all('td'):
            td.name = 'th'

    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    date_info = extract_date_from_header(fs_table['header'])

    # Regular expression for the title columns.
    regex = str_to_regex('과목 OR 주석')

    fs_string = {
        'bs': 'Statement of financial position',
        'is': 'Income statement',
        'cis': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }

    str_unit = extract_unit_from_header(fs_table['header'])
    str_unit = str_to_regex('원 OR USD').search(str_unit)
    if str_unit:
        str_unit = str_unit.group(0)
        str_unit = 'KRW' if str_compare('원', str_unit) else 'USD'
        for key in fs_string:
            fs_string[key] = fs_string[key] + '(Unit: {})'.format(str_unit)

    label = {
        'ko': {
            True: '별도재무제표',
            False: '연결재무제표'
        },
        'en': {
            True: 'Separate',
            False: 'Consolidated'
        }
    }

    header_rows = thead.find_all('tr')
    # Number of columns.
    col_length = sum(th_colspan_list)
    # Number of rows; a single-row header still gets a second row.
    row_length = len(header_rows)
    row_length = row_length + 1 if row_length == 1 else row_length

    # Matrix used to expand row-span / col-span cells.
    columns_matrix = [[None for y in range(col_length)] for x in range(row_length)]
    for idx, tr in enumerate(header_rows):
        # Start at the first cell not already filled by a row-span from above.
        start_idx = 0
        for ele_idx, element in enumerate(columns_matrix[idx]):
            if element is None:
                start_idx = ele_idx
                break

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    # The first header cell is always the account title.
                    text = '과목'
                elif regex.search(text) is None:
                    # Every other non-title cell takes the next header date.
                    if len(date_info) > 0:
                        date_list = date_info.pop(0)
                    else:
                        import warnings
                        fallback = '-'.join(
                            [date.strftime('%Y%m%d') for date in date_list])
                        warnings_text = "Date data length does not match table header."\
                                        + "So last date was set using last data({}). ".format(fallback)
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = '-'.join([date.strftime('%Y%m%d') for date in date_list])

            if regex.search(text):
                # Title columns always occupy two rows.
                row_span = 2
            # Never write past the last header row.
            row_span = min(row_span, row_length - idx)

            for mdx in range(row_span):
                for ndx in range(col_span):
                    new_text = text
                    if mdx == 0 and regex.search(text):
                        new_text = fs_string[fs_tp]
                    columns_matrix[idx + mdx][start_idx + ndx] = new_text
            # Advance by the cell width; does not rely on the inner loop
            # variable, which is stale when a span is 0.
            start_idx = start_idx + col_span

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []
    for jdx in range(len(columns_matrix[0])):
        column = []
        sec_item = []
        for idx in range(len(columns_matrix)):
            item = columns_matrix[idx][jdx]
            if idx == 0:
                column.append(item)
                continue
            if idx == 1 and (item is None or regex.search(item) is None):
                # Non-title columns get the separate/consolidated label.
                sec_item.append(label[lang][separate])

            if item is None:
                pass
            elif str_compare(column[0], item):
                # Cell repeated by a row-span; nothing new to add.
                continue
            elif regex_3month.search(item):
                # Narrow the date range of a "3 months" column.
                period = [datetime.strptime(date_str, '%Y%m%d')
                          for date_str in column[0].split('-')]
                end_dt = period[-1]
                # Three calendar months ending with end_dt's month: go back
                # two months and snap to the first day. (Going back three
                # months would give four calendar months, e.g. 20171201 for
                # an end date of 20180331.)
                start_dt = (end_dt - relativedelta(months=2)).replace(day=1)
                column[0] = '-'.join(
                    [date.strftime('%Y%m%d') for date in [start_dt, end_dt]])
            elif regex_total.search(item):
                pass
            else:
                sec_item.append(column_ko_to_en(item))

        if sec_item[0] in ['label_ko', 'comment']:
            column.append(sec_item[0])
        else:
            column.append(tuple(sec_item))
        columns.append(column)
    return columns