Python str_to_regex示例，dart_fss.utils.str_to_regex Python示例

示例#1

0

显示文件

def search_fs_table(tables: List, fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
                    separate: bool = False) -> Dict[str, dict]:
    """
    Search a page's tables for the requested financial statements.

    Parameters
    ----------
    tables: list of ResultSet
        Every table found inside the page.
    fs_tp: tuple of str
        Statement types to look for: 'bs' statement of financial position,
        'is' income statement, 'cis' statement of comprehensive income,
        'cf' statement of cash flows.
    separate: bool
        True to search for separate statements, False for consolidated ones.

    Returns
    -------
    dict of {str : dict }
        One entry per requested statement type, in search order, each holding
        the 'title', 'header' and 'table' values returned by ``seek_table``.
    """
    # Title keywords per statement type. The statements are searched in
    # exactly this order -- do not reorder the entries.
    title_queries = (
        ('bs', str_insert_whitespace('재무상태표') + ' OR ' + str_insert_whitespace('대차대조표')),
        ('is', str_insert_whitespace('손익계산서')),
        ('cis', str_insert_whitespace('포괄손익계산서')),
        ('cf', str_insert_whitespace('현금흐름표')),
    )

    found = OrderedDict()
    for fs_key, include_src in title_queries:
        if fs_key not in fs_tp:
            continue

        if separate:
            # Separate statements: the title must NOT mention "연결".
            exclude_src = str_insert_whitespace('연결')
        else:
            # Consolidated statements: the title must also mention "연결".
            include_src = include_src + ' AND ' + str_insert_whitespace('연결')
            exclude_src = None

        if fs_key == 'is':
            # The income-statement query would also match the comprehensive
            # income statement title, so titles containing "포괄" are excluded.
            comprehensive = str_insert_whitespace('포괄')
            exclude_src = exclude_src + ' OR ' + comprehensive if exclude_src else comprehensive

        exclude_regex = str_to_regex(exclude_src) if exclude_src else exclude_src
        include_regex = str_to_regex(include_src)

        title, header, tb = seek_table(tables=tables, includes=include_regex, excludes=exclude_regex)
        found[fs_key] = {'title': title, 'header': header, 'table': tb}
    return found

示例#2

0

显示文件

def find_all_columns(df: DataFrame, query: str) -> pd.Index:
    """
    Find the columns of a DataFrame whose labels match a search query.

    A column matches when at least one element of its label matches the
    regular expression built from ``query``. String elements are searched
    as they are; non-string elements (e.g. a tuple of strings) are joined
    with single spaces before being searched. Each matching column is
    reported once, in the DataFrame's column order.

    Parameters
    ----------
    df: DataFrame
        DataFrame whose columns are searched. May be None.
    query: str
        Search expression, converted to a regex by ``str_to_regex``.

    Returns
    -------
    pandas.MultiIndex or list
        MultiIndex built from the matching column labels, or an empty list
        when ``df`` is None or nothing matches.
    """
    regex = str_to_regex(query)
    if df is None:
        return []

    def _matches(item) -> bool:
        # A plain string is searched directly. It must not fall through to the
        # join below, which would split it into space-separated characters.
        if isinstance(item, str):
            return regex.search(item) is not None
        return regex.search(' '.join(item)) is not None

    # any() stops at the first matching element, so a column whose label
    # matches in several places is still listed only once.
    results = [column for column in df.columns.tolist()
               if any(_matches(item) for item in column)]
    if results:
        results = pd.MultiIndex.from_tuples(results)
    return results

示例#3

0

显示文件

def convert_tbody_to_dataframe(columns: list, fs_table: dict):
    """Convert the ``tbody`` of an HTML statement table into a DataFrame.

    Parameters
    ----------
    columns: list
        One label per physical table column; each label is converted to a
        tuple. Physical columns that share a label are merged into a single
        DataFrame column.
    fs_table: dict
        Must provide 'table' (an object exposing ``tbody``) and 'header'
        (passed to ``extract_unit_from_header``).

    Returns
    -------
    DataFrame
        One row per ``tr`` of the table body, with a MultiIndex built from
        the distinct column labels.
    """
    # Map every distinct label to the positions of the physical columns
    # that carry it, keeping first-seen order.
    column_matrix = OrderedDict()
    for idx, column in enumerate(columns):
        column_matrix.setdefault(tuple(column), []).append(idx)
    deduplicated = list(column_matrix)

    df_columns = pd.MultiIndex.from_tuples(deduplicated)
    df = pd.DataFrame(columns=df_columns)
    ordered_keys = df_columns.tolist()

    tbody = fs_table['table'].tbody
    regex = str_to_regex('label_ko OR comment')
    str_unit = extract_unit_from_header(fs_table['header'])
    unit = str_unit_to_number_unit(str_unit)
    # Row-level unit marker such as "(단위:원)".
    # NOTE(review): the capture group accepts exactly one character, so
    # multi-character units never match -- confirm whether `+` was intended.
    unit_regex = re.compile(r'\(단위\s*?:\s*([a-zA-Zㄱ-힣])\)')

    for idx, tr in enumerate(tbody.find_all('tr')):
        # Cell texts with all whitespace and '=' runs removed
        extracted = [
            re.sub(r'\s+|=+', '', td.text) for td in tr.find_all('td')
        ]
        row = {key: 0 for key in deduplicated}
        for key, index_list in column_matrix.items():
            for index in index_list:
                if len(extracted) <= index:
                    # The row has fewer cells than the header has columns
                    row[key] = None
                elif isinstance(key[1], str):
                    # Second label element is a plain string: keep the raw text
                    row[key] = extracted[index]
                elif regex.search(' '.join(key[1])):
                    # label_ko / comment columns also keep the raw text
                    value = extracted[index]
                    row[key] = value
                else:
                    # Numeric column: physical columns sharing a label are summed
                    value = str_to_float(extracted[index], unit)
                    row[key] += value

            if isinstance(row[key], float):
                if abs(row[key]) < 1e-10:
                    # Values that are effectively zero are stored as blanks
                    row[key] = ''
                else:
                    row[key] = row[key] * unit

        ordered_list = [row.get(column, None) for column in ordered_keys]

        # A row may declare its own unit in its first cell. Only a string
        # cell can carry that marker; a missing (None) or numeric first cell
        # must not be handed to the regex, which would raise TypeError.
        first_cell = ordered_list[0]
        row_unit = unit_regex.search(first_cell) if isinstance(first_cell, str) else None
        if row_unit:
            row_unit = str_unit_to_number_unit(row_unit.group(1))
            for jdx, value in enumerate(ordered_list):
                # Text cells and missing cells (short rows) are left as they
                # are; only numeric cells are re-expressed in the row's unit.
                if value is None or isinstance(value, str):
                    continue
                ordered_list[jdx] = value / unit * row_unit

        df.loc[idx] = ordered_list
    return df

示例#4

0

显示文件

文件： table.py 项目： codacy-badger/dart-fss-1

    def to_DataFrame(self,
                     cls=None,
                     lang='ko',
                     start_dt=None,
                     end_dt=None,
                     label=None,
                     show_abstract=False,
                     show_class=True,
                     show_depth=10,
                     show_concept=True,
                     separator=True):
        """ Convert this table into a pandas DataFrame.

        Note that calling this method sets the global pandas option
        ``display.float_format`` according to ``separator``.

        Parameters
        ----------
        cls: dict, optional
            classification; when None it is obtained from
            ``self.cls_filter(start_dt, end_dt, label)``
        lang: str, optional
            'ko' for Korean or 'en' for English
        start_dt: str, optional
            search start date
        end_dt: str, optional
            search end date
        label: str, optional
            word that the column label must contain
        show_abstract: bool, optional
            whether to show abstract items
        show_class: bool, optional
            whether to show the class columns
        show_depth: int, optional
            maximum class depth to show
        show_concept: bool, optional
            whether to show the concept_id
        separator: bool, optional
            whether numbers are displayed with thousands separators

        Returns
        -------
        DataFrame
            financial statement DataFrame
        """
        if cls is None:
            cls = self.cls_filter(start_dt, end_dt, label)
        cls = cls_merge_type(cls)

        # Cap the label depth at show_depth
        depth = get_max_depth(self.labels, show_abstract=show_abstract)
        if not depth < show_depth:
            depth = show_depth

        # The reporting currency is read from table 'd999004'
        currency_table = self.parent.get_table_by_code('d999004')
        unit = get_value_from_dataset(
            currency_table.cls, currency_table.dataset,
            'dart-gcd_EntityReportingCurrencyISOCode')
        unit_suffix = ' (Unit: {})'.format(unit[0])
        definition = self.definition + unit_suffix

        columns = generate_df_columns(definition,
                                      cls,
                                      depth,
                                      lang,
                                      show_concept=show_concept,
                                      show_class=show_class)

        # Global pandas display option: with or without thousands separators
        number_template = '{:,}' if separator else '{:}'
        pd.options.display.float_format = number_template.format

        df = pd.DataFrame(columns=columns)
        rows = generate_df_rows(self.labels,
                                cls,
                                self.dataset,
                                depth,
                                lang=lang,
                                show_abstract=show_abstract,
                                show_concept=show_concept,
                                show_class=show_class)
        for position, record in enumerate(flatten(rows)):
            df.loc[position] = record

        # Descriptive columns are always kept; any other column holding at
        # most one non-null value is dropped.
        regex_pass = str_to_regex(
            'concept_id OR label_ko OR label_en OR class')
        drop_columns = [
            key for key, count in df.count().items()
            if not regex_pass.search(' '.join(key[1])) and count <= 1
        ]
        return df.drop(drop_columns, axis=1)

示例#5

0

显示文件

def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Extract the DataFrame column labels from a statement table's header rows.

    Parameters
    ----------
    fs_tp: str
        Statement type; used as a key into ``fs_string``
        ('bs', 'is', 'cis' or 'cf').
    fs_table: dict
        Must provide 'table' (object exposing ``thead`` / ``tbody``) and
        'header' (passed to the date and unit extraction helpers).
    separate: bool
        Selects the statement label: True -> separate, False -> consolidated.
    lang: str
        Language of that statement label, 'ko' or 'en'.

    Returns
    -------
    list of list
        One ``[first, second]`` pair per physical table column. ``second`` is
        the string 'label_ko' or 'comment', or otherwise a tuple of strings.

    Notes
    -----
    When the table has no ``thead``, the first ``tr`` of its ``tbody`` is
    extracted (removed from the table) and used as the header row.
    """
    def column_ko_to_en(ko):
        """Map '과목' to 'label_ko' and '주석' to 'comment'; return other text unchanged."""
        ko_to_en = {
            '과목': 'label_ko',
            '주석': 'comment'
        }
        en = ko_to_en.get(ko)
        return en if en else ko

    thead = fs_table['table'].thead

    if thead is None:
        # No <thead>: move the first body row into a synthetic <thead> and
        # rename its <td> cells to <th> so the code below can treat it alike.
        tt = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(tt)
        for td in thead.tr.find_all('td'):
            td.name = 'th'
    # colspan of every cell in the first header row (1 when not specified)
    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    date_info = extract_date_from_header(fs_table['header'])
    # Regular Expression for title
    regex = str_to_regex('과목 OR 주석')

    # Top-level label used for the 과목/주석 columns, per statement type
    fs_string = {
        'bs': 'Statement of financial position',
        'is': 'Income statement',
        'cis': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }

    # Append the currency unit to every title when one could be determined
    str_unit = extract_unit_from_header(fs_table['header'])
    str_unit = get_currency_str(str_unit)
    if str_unit:
        for key in fs_string:
            fs_string[key] = fs_string[key] + '(Unit: {})'.format(str_unit)

    # Statement label indexed by language and then by the `separate` flag
    label = {
        'ko': {
            True: '별도재무제표',
            False: '연결재무제표'
        },
        'en': {
            True: 'Separate',
            False: 'Consolidated'
        }
    }

    # Total number of columns (sum of the first row's colspans)
    col_length = sum(th_colspan_list)
    # Total number of header rows; a single header row is padded to two
    row_length = len(thead.find_all('tr'))
    row_length = row_length + 1 if row_length == 1 else row_length
    # Matrix used to resolve rowspan / colspan into one text per grid cell
    columns_matrix = [[None for _y in range(col_length)] for _x in range(row_length)]
    for idx, tr in enumerate(thead.find_all('tr')):
        # Start writing at the first cell of this row that an earlier row's
        # rowspan has not already filled (stays 0 if no cell is empty)
        start_idx = 0
        for ele_idx, element in enumerate(columns_matrix[idx]):
            if element is None:
                start_idx = ele_idx
                break

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            # Placeholder date, reset for every header cell
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    # The very first header cell is always treated as '과목'
                    text = '과목'
                elif regex.search(text) is None:
                    # Any other first-row cell that is not 과목/주석 is a period
                    # column: its text is replaced by the next date range
                    # taken (and removed) from date_info.
                    if len(date_info) > 0:
                        date_list = date_info.pop(0)
                    else:
                        # NOTE(review): the message speaks of the "last date",
                        # but date_list here is the 1900-01-01 placeholder
                        # assigned above for this cell -- confirm the intent.
                        import warnings
                        date = '-'.join([date.strftime('%Y%m%d') for date in date_list])
                        warnings_text = "Date data length does not match table header."\
                                + "So last date was set using last data({}). ".format(date)
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = '-'.join([date.strftime('%Y%m%d') for date in date_list])

            # 과목/주석 cells always occupy two matrix rows, whatever their rowspan
            if regex.search(text):
                row_span = 2

            for mdx in range(row_span):
                for ndx in range(col_span):
                    new_text = text
                    # The upper row of a 과목/주석 cell holds the statement title
                    if mdx == 0 and regex.search(text):
                        new_text = fs_string[fs_tp]
                    columns_matrix[idx + mdx][start_idx + ndx] = new_text
            # ndx is the last value left by the inner loop (col_span - 1), so
            # this advances by col_span; it relies on both spans being >= 1.
            start_idx = start_idx + ndx + 1

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []

    # Collapse each matrix column into a [first, second] label pair
    for jdx in range(len(columns_matrix[0])):
        column = []
        sec_item = []
        for idx in range(len(columns_matrix)):
            item = columns_matrix[idx][jdx]
            if idx == 0:
                # The first header row supplies the first label
                column.append(item)
                continue
            elif idx == 1 and (item is None or regex.search(item) is None):
                # Columns other than 과목/주석 start their second label with the
                # separate/consolidated statement label
                sec_item.append(label[lang][separate])
            else:
                pass

            if item is None:
                pass
            elif str_compare(column[0], item):
                # Presumably the same text as the first label (cell repeated
                # by a rowspan): skip it -- verify against str_compare
                continue
            elif regex_3month.search(item):
                # extract date info
                # NOTE(review): needs column[0] to hold at least two
                # '-'-separated dates; date_info[1] fails otherwise
                date_info = [datetime.strptime(date_str, '%Y%m%d') for date_str in column[0].split('-')]

                # calculating start_dt
                # (three months before the end date, moved to day 1 of that month)
                delta = relativedelta(months=3)
                start_dt = date_info[1] - delta
                start_dt = start_dt.replace(day=1)

                end_dt = date_info[1]
                column[0] = '-'.join([date.strftime('%Y%m%d') for date in [start_dt, end_dt]])
            elif regex_total.search(item):
                # '누적' / '금액' sub-headers add nothing to the label
                pass
            else:
                sec_item.append(column_ko_to_en(item))
        # label_ko / comment columns get a plain string as second label,
        # every other column gets the tuple of collected items
        if sec_item[0] in ['label_ko', 'comment']:
            column.append(sec_item[0])
        else:
            column.append(tuple(sec_item))
        columns.append(column)
    return columns