Пример #1
0
    def extract_related_reports(self):
        """Collect the reports listed as related to this report.

        The report page is fetched through ``self._get_report()`` when
        ``self.html`` is still ``None``. Every ``<option>`` found under the
        ``<select id="family">`` element is then inspected: entries whose
        ``value`` compares equal to ``'null'`` are ignored, and the entry
        whose receipt number equals ``self.rcp_no`` is only used to fill in
        ``self.info['rpt_nm']`` when that is not set yet.

        Returns
        -------
        list of RelatedReport
            Related reports sorted by ``rcp_no`` in descending order. The
            same list is stored on ``self._related_reports``.

        """
        if self.html is None:
            self._get_report()
        options = self.html.find('select', id='family').find_all('option')

        related = []
        for option in options:
            target = option.attrs.get('value')
            if str_compare(target, 'null'):
                # Entry without a target document.
                continue
            title = re.sub(r'\s+', ' ', option.text).strip()
            # The receipt number is whatever follows the first '=' in the value.
            receipt_no = target.split('=')[1]
            if not str_compare(self.rcp_no, receipt_no):
                related.append(
                    RelatedReport(rcp_no=receipt_no, rpt_nm=title, parent=self))
                continue
            # This entry is the current report itself: only borrow its title.
            if self.info.get('rpt_nm') is None:
                self.info['rpt_nm'] = title

        related.sort(key=lambda rpt: rpt.rcp_no, reverse=True)
        self._related_reports = related
        return self._related_reports
Пример #2
0
def get_value_from_dataset(classification, dataset, concept_id):
    """Extract the value of one concept from a dataset, per classification.

    Parameters
    ----------
    classification : dict or list of dict
        A single classification or a list of them. Each one must provide a
        ``'cls_id'`` key, which is used to index ``dataset``.
    dataset : mapping
        Maps a ``cls_id`` to an iterable of entries exposing ``concept.id``
        and ``value``.
    concept_id : str
        Concept id to look up (compared with ``str_compare``).

    Returns
    -------
    list
        One element per distinct title (``get_title(cls, 'en')``) in
        first-seen order. An element is a ``float`` when the matched value
        parses as a number, the raw value when it does not, and ``nan`` when
        no entry matched. When a title occurs more than once, a later
        non-missing value replaces the earlier one.
    """
    def str_to_float(val):
        try:
            return float(val)
        except ValueError:
            return val

    def is_missing(val):
        # math.isnan() raises TypeError for non-numeric input, and
        # str_to_float() hands back the raw value when parsing fails, so
        # only real floats are ever tested for NaN.
        return isinstance(val, float) and math.isnan(val)

    if isinstance(classification, dict):
        classification = [classification]

    results = []
    added_title = []
    for cls in classification:
        value = float('nan')
        for data in dataset[cls['cls_id']]:
            if str_compare(data.concept.id, concept_id):
                value = str_to_float(data.value)
                break
        title = get_title(cls, 'en')
        if title in added_title:
            # Same title seen before: keep the old value unless this
            # classification actually produced one.
            if not is_missing(value):
                results[added_title.index(title)] = value
        else:
            results.append(value)
            added_title.append(title)
    return results
Пример #3
0
    def extract_attached_reports(self):
        """Collect the reports attached to this report.

        The report page is fetched through ``self._get_report()`` when
        ``self.html`` is still ``None``. Every ``<option>`` found under the
        ``<p class="f_none">`` element is inspected; entries whose ``value``
        compares equal to ``'null'`` are ignored. For the remaining entries
        the ``value`` is parsed as a query string and its ``rcpNo`` and
        ``dcmNo`` parameters identify the attached document.

        Returns
        -------
        list of AttachedReport
            Attached reports sorted by ``rcp_no`` in descending order. The
            same list is stored on ``self._attached_reports``.

        """
        if self.html is None:
            self._get_report()
        options = self.html.find('p', class_='f_none').find_all('option')

        found = []
        for option in options:
            title = re.sub(r'\s+', ' ', option.text).strip()
            target = option.attrs.get('value')
            if str_compare(target, 'null'):
                # Entry without a target document.
                continue
            query = parse_qs(target)
            found.append(AttachedReport(
                rcp_no=query.get('rcpNo')[0],
                dcm_no=query.get('dcmNo')[0],
                rpt_nm=title,
                parent=self,
            ))

        found.sort(key=lambda rpt: rpt.rcp_no, reverse=True)
        self._attached_reports = found
        return self._attached_reports
Пример #4
0
    def run_test(self):
        """Extract the financial statements and verify every expected value.

        Statements are extracted once via ``self.corp.extract_fs``. For each
        entry of ``self.test_set`` the statement ``fs[fs_tp]`` is searched:
        the row whose label (spaces removed) matches ``item`` in the column
        found for ``column`` supplies the actual value, read from the column
        found for ``date``. When several rows match, the last one wins; when
        none matches, the actual value is ``None``. A mismatch with
        ``expected`` fails the test through ``pytest.fail``.
        """
        fs = self.corp.extract_fs(bgn_de=self.bgn_de,
                                  separate=self.separate,
                                  report_tp=self.report_tp)
        for case in self.test_set:
            fs_tp = case['fs_tp']
            date = case['date']
            column = case['column']
            item = case['item']
            expected = case['expected']

            frame = fs[fs_tp]
            # Only the first matching column is used for each query.
            date_column = find_all_columns(df=frame, query=date)[0]
            label_column = find_all_columns(df=frame, query=column)[0]

            actual = None
            for row in range(len(frame)):
                label = frame[label_column].iloc[row].replace(' ', '')
                if str_compare(label, item):
                    actual = frame[date_column].iloc[row]

            if actual != expected:
                template = ("Test failed: corp_code='{}', "
                            "corp_name='{}', fs_tp='{}', "
                            "start_dt='{}', report_tp='{}', "
                            "date='{}', column='{}',"
                            "item='{}', actual='{}', expected='{}'")
                pytest.fail(template.format(
                    self.corp.corp_code, self.corp.corp_name, fs_tp,
                    self.bgn_de, fs.info['report_tp'],
                    date, column, item, actual, expected))
Пример #5
0
def get_value_from_dataset(
    classification,
    dataset,
    concept_id,
    label_ko=None,
    lang='ko',
):
    """Extract the value of one concept from a dataset, per classification.

    Parameters
    ----------
    classification : dict or list of dict
        A single classification or a list of them. Each one must provide a
        ``'cls_id'`` key, which is used to index ``dataset``.
    dataset : mapping
        Maps a ``cls_id`` to an iterable of entries exposing ``concept.id``,
        ``value`` and ``decimals``.
    concept_id : str
        Concept id to look up (compared with ``str_compare``).
    label_ko : str, optional
        Korean label of the concept. When it contains a ``(단위:...)`` part
        from which a currency unit can be derived, numeric values are
        rescaled by ``10 ** decimals`` and by that unit.
    lang : str, optional
        Language passed to ``get_title`` when grouping results by title.

    Returns
    -------
    list
        One element per distinct title in first-seen order. An element is a
        ``float`` when the matched value parses as a number, the raw value
        when it does not, and ``nan`` when no entry matched. When a title
        occurs more than once, a later non-missing value replaces the
        earlier one.
    """
    def str_to_float(val):
        try:
            return float(val)
        except ValueError:
            return val

    def is_missing(val):
        # math.isnan() raises TypeError for non-numeric input, and
        # str_to_float() hands back the raw value when parsing fails, so
        # only real floats are ever tested for NaN.
        return isinstance(val, float) and math.isnan(val)

    def to_exponent(raw):
        # A missing, non-numeric, infinite or NaN ``decimals`` counts as 0.
        try:
            exponent = float(raw)
        except (TypeError, ValueError):
            return 0
        if math.isinf(exponent) or math.isnan(exponent):
            return 0
        return exponent

    if isinstance(classification, dict):
        classification = [classification]

    # Workaround for wrong earnings-per-share figures inside XBRL data:
    # derive the numeric currency unit announced in the Korean label.
    currency_unit = None
    if label_ko is not None:
        unit = re.search(r'\(단위:(.*)\)', label_ko)
        if unit is not None:
            currency = get_currency_str(unit.group(0))
            if currency is not None:
                currency_unit = str_unit_to_number_unit(currency)

    results = []
    added_title = []
    for cls in classification:
        value = float('nan')
        for data in dataset[cls['cls_id']]:
            if str_compare(data.concept.id, concept_id):
                value = str_to_float(data.value)
                # Rescale only real numbers; a value that could not be
                # parsed is passed through untouched instead of raising.
                if currency_unit is not None and isinstance(value, float):
                    value = value * pow(10, to_exponent(data.decimals))
                    value = value * currency_unit
                break

        title = get_title(cls, lang)
        if title in added_title:
            # Same title seen before: keep the old value unless this
            # classification actually produced one.
            if not is_missing(value):
                results[added_title.index(title)] = value
        else:
            results.append(value)
            added_title.append(title)
    return results
Пример #6
0
    def get_table_by_code(self, code: str) -> Union[Table, None]:
        """Return the first table whose code matches ``code``.

        Parameters
        ----------
        code: str
            Table code to look for (compared with ``str_compare``).

        Returns
        -------
        Table or None
            The first entry of ``self.tables`` with a matching code, or
            ``None`` when there is no match.
        """
        matching = (candidate for candidate in self.tables
                    if str_compare(candidate.code, code))
        return next(matching, None)
Пример #7
0
def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Build DataFrame column labels from the header of a statement table.

    Parameters
    ----------
    fs_tp: str
        Statement type; must be one of the ``fs_string`` keys below
        ('bs', 'is', 'cis', 'cf').
    fs_table: dict
        Must provide ``'table'`` (an object exposing ``thead`` and ``tbody``)
        and ``'header'`` (passed to ``extract_date_from_header`` and
        ``extract_unit_from_header``).
    separate: bool, optional
        Selects the separate (True) or consolidated (False) label that is
        put into the second element of value columns.
    lang: str, optional
        'ko' or 'en'; language of that separate/consolidated label.

    Returns
    -------
    list of list
        One two-element list per table column. The first element is the text
        of header row 0: the statement title for the '과목'/'주석' columns,
        otherwise a date string. The second element is ``'label_ko'`` or
        ``'comment'`` for those title columns, otherwise a tuple.

    Notes
    -----
    When the table has no ``<thead>``, the first ``<tr>`` of its ``<tbody>``
    is extracted from the table and used as the header, so ``fs_table`` is
    modified in that case.
    """
    def column_ko_to_en(ko):
        # Map the two known Korean header names to column keys; any other
        # text is returned unchanged.
        ko_to_en = {
            '과목': 'label_ko',
            '주석': 'comment'
        }
        en = ko_to_en.get(ko)
        return en if en else ko

    thead = fs_table['table'].thead

    if thead is None:
        # No <thead>: promote the first body row to a header row and turn
        # its <td> cells into <th> cells.
        tt = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(tt)
        for td in thead.tr.find_all('td'):
            td.name = 'th'
    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    date_info = extract_date_from_header(fs_table['header'])
    # Regular expression for the title columns
    # (presumably matches either '과목' or '주석' -- confirm in str_to_regex).
    regex = str_to_regex('과목 OR 주석')

    fs_string = {
        'bs': 'Statement of financial position',
        'is': 'Income statement',
        'cis': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }

    # Append the currency unit to every statement title when one is found.
    str_unit = extract_unit_from_header(fs_table['header'])
    str_unit = get_currency_str(str_unit)
    if str_unit:
        for key in fs_string:
            fs_string[key] = fs_string[key] + '(Unit: {})'.format(str_unit)

    label = {
        'ko': {
            True: '별도재무제표',
            False: '연결재무제표'
        },
        'en': {
            True: 'Separate',
            False: 'Consolidated'
        }
    }

    # Total number of columns (sum of the colspans of the first header row)
    col_length = sum(th_colspan_list)
    # Number of header rows; a single-row header is padded to two rows
    row_length = len(thead.find_all('tr'))
    row_length = row_length + 1 if row_length == 1 else row_length
    # Matrix used to resolve rowspan / colspan
    columns_matrix = [[None for _y in range(col_length)] for _x in range(row_length)]
    for idx, tr in enumerate(thead.find_all('tr')):
        # Start at the first cell of this row that an earlier rowspan has
        # not filled yet.
        start_idx = 0
        for ele_idx, element in enumerate(columns_matrix[idx]):
            if element is None:
                start_idx = ele_idx
                break

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            # Placeholder date, used when the header supplies no date.
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    # The first cell of the first row is always the label column.
                    text = '과목'
                elif regex.search(text) is None:
                    # Value column: its text is replaced by the next date
                    # (or date range) taken from the table header.
                    if len(date_info) > 0:
                        date_list = date_info.pop(0)
                    else:
                        # More value columns than dates: date_list still
                        # holds the 1900-01-01 placeholder at this point.
                        import warnings
                        date = '-'.join([date.strftime('%Y%m%d') for date in date_list])
                        warnings_text = "Date data length does not match table header."\
                                + "So last date was set using last data({}). ".format(date)
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = '-'.join([date.strftime('%Y%m%d') for date in date_list])

            # Title columns are always treated as spanning two rows.
            if regex.search(text):
                row_span = 2

            for mdx in range(row_span):
                for ndx in range(col_span):
                    new_text = text
                    if mdx == 0 and regex.search(text):
                        # Row 0 of a title column carries the statement title.
                        new_text = fs_string[fs_tp]
                    columns_matrix[idx + mdx][start_idx + ndx] = new_text
            # ndx is the last column offset written, so this advances the
            # cursor by col_span.
            start_idx = start_idx + ndx + 1

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []

    # Collapse every matrix column into a [first, second] column label.
    for jdx in range(len(columns_matrix[0])):
        column = []
        sec_item = []
        for idx in range(len(columns_matrix)):
            item = columns_matrix[idx][jdx]
            if idx == 0:
                column.append(item)
                continue
            elif idx == 1 and (item is None or regex.search(item) is None):
                # Value column: start the second element with the
                # separate / consolidated label.
                sec_item.append(label[lang][separate])
            else:
                pass

            if item is None:
                pass
            elif str_compare(column[0], item):
                # Same text as row 0 (cell filled in by a rowspan): skip.
                continue
            elif regex_3month.search(item):
                # extract date info
                date_info = [datetime.strptime(date_str, '%Y%m%d') for date_str in column[0].split('-')]

                # calculating start_dt
                delta = relativedelta(months=3)
                start_dt = date_info[1] - delta
                start_dt = start_dt.replace(day=1)

                end_dt = date_info[1]
                column[0] = '-'.join([date.strftime('%Y%m%d') for date in [start_dt, end_dt]])
            elif regex_total.search(item):
                # Sub-headers matched by regex_total add nothing to the label.
                pass
            else:
                sec_item.append(column_ko_to_en(item))
        if sec_item[0] in ['label_ko', 'comment']:
            column.append(sec_item[0])
        else:
            column.append(tuple(sec_item))
        columns.append(column)
    return columns
Пример #8
0
def extract(corp_code: str,
            bgn_de: str,
            end_de: str = None,
            fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
            separate: bool = False,
            report_tp: str = 'annual',
            lang: str = 'ko',
            separator: bool = True) -> FinancialStatement:
    """
    Search and extract financial statements.

    Parameters
    ----------
    corp_code: str
        Unique code of the disclosing company (8 digits)
    bgn_de: str
        Search start date (YYYYMMDD)
    end_de: str, optional
        Search end date (YYYYMMDD)
    fs_tp: tuple of str, optional
        'bs' statement of financial position, 'is' income statement,
        'cis' statement of comprehensive income, 'cf' statement of cash flows
    separate: bool, optional
        Whether to use the separate financial statements
    report_tp: str, optional
        'annual' yearly, 'half' adds semiannual reports,
        'quarter' adds semiannual and quarterly reports
    lang: str, optional
        'ko' Korean, 'en' English
    separator: bool, optional
        Whether to show the thousands separator

    Returns
    -------
    FinancialStatement
        Result of the financial statement search

    Raises
    ------
    NotFoundConsolidated
        If consolidated statements are requested and the first analyzed
        report contains none.
    Exception
        Any error raised while searching or analyzing is re-raised. When it
        happened after a report had been taken for processing, a message
        naming that report is appended to the exception's ``args``.

    """
    if is_notebook():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    import dart_fss as dart

    def analyze(target):
        # Analyze one report with the options of this call.
        return analyze_report(report=target,
                              fs_tp=fs_tp,
                              separate=separate,
                              lang=lang,
                              separator=separator)

    def drain(filings, desc):
        # Yield the filings front to back, removing each one from the
        # container as it is handed out, with a progress bar.
        for _ in tqdm(range(len(filings)), desc=desc, unit='report'):
            yield filings.pop(0)

    # Stays None until a report is taken, so the error handler can tell
    # whether there is a report to mention.
    report = None
    statements = None
    label_df = None

    # Spinner disable; the finally clause below re-enables it on every path,
    # including a failing search.
    dart.utils.spinner.spinner_enable = False
    try:
        reports = search_annual_report(corp_code=corp_code, bgn_de=bgn_de, end_de=end_de, separate=separate)

        for report in drain(reports, 'Annual reports'):
            if statements is None:
                statements = analyze(report)
                if separate is False and all([statements[tp] is None for tp in statements]):
                    raise NotFoundConsolidated('Could not find consolidated financial statements')
                # initialize label dictionary
                label_df = init_label(statements, fs_tp=fs_tp)
            else:
                statements, label_df = merge_fs(statements, analyze(report), fs_tp=fs_tp, label_df=label_df)

        if str_compare(report_tp, 'half') or str_compare(report_tp, 'quarter'):
            half = search_filings(corp_code=corp_code, bgn_de=bgn_de, end_de=end_de,
                                  pblntf_detail_ty='A002', page_count=100, last_reprt_at='Y')
            for report in drain(half, 'Semiannual reports'):
                statements, label_df = merge_fs(statements, analyze(report), fs_tp=fs_tp, label_df=label_df)

        if str_compare(report_tp, 'quarter'):
            quarter = search_filings(corp_code=corp_code, bgn_de=bgn_de, end_de=end_de,
                                     pblntf_detail_ty='A003', page_count=100, last_reprt_at='Y')
            for report in drain(quarter, 'Quarterly report'):
                statements, label_df = merge_fs(statements, analyze(report), fs_tp=fs_tp, label_df=label_df)

        statements = drop_empty_columns(statements)
        label_df = drop_empty_columns(label_df)

        statements = sorting_columns(statements)
        label_df = sorting_columns(label_df)

        info = {
            'corp_code': corp_code,
            'bgn_de': bgn_de,
            'end_de': end_de,
            'separate': separate,
            'report_tp': report_tp,
            'lang': lang,
            'separator': separator
        }
        return FinancialStatement(statements, label_df, info)
    except Exception as e:
        # Only mention a report when one was actually being processed;
        # otherwise the original error is passed on unchanged.
        if report is not None:
            msg = 'An error occurred while fetching or analyzing {}.'.format(report.to_dict())
            e.args = (*e.args, msg, )
        raise
    finally:
        # Spinner enable
        dart.utils.spinner.spinner_enable = True
Пример #9
0
def extract(corp_code: str,
            bgn_de: str,
            end_de: str = None,
            fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
            separate: bool = False,
            report_tp: str = 'annual',
            lang: str = 'ko',
            separator: bool = True) -> FinancialStatement:
    """
    Search and extract financial statements.

    Parameters
    ----------
    corp_code: str
        Unique code of the disclosing company (8 digits)
    bgn_de: str
        Search start date (YYYYMMDD)
    end_de: str, optional
        Search end date (YYYYMMDD)
    fs_tp: tuple of str, optional
        'bs' statement of financial position, 'is' income statement,
        'cis' statement of comprehensive income, 'cf' statement of cash flows
    separate: bool, optional
        Whether to use the separate financial statements
    report_tp: str, optional
        'annual' yearly, 'half' adds semiannual reports,
        'quarter' adds semiannual and quarterly reports
    lang: str, optional
        'ko' Korean, 'en' English
    separator: bool, optional
        Whether to show the thousands separator (only forwarded to the XBRL
        analysis)

    Returns
    -------
    FinancialStatement
        Result of the financial statement search

    Raises
    ------
    RuntimeError
        If neither the annual-report search nor the audit-report fallback
        yields any report.
    NotFoundConsolidated
        If consolidated statements are requested but not available.

    Notes
    -----
    Errors other than ``NoDataReceived`` raised by the searches propagate
    unchanged; they are no longer replaced by the "no annual report"
    ``RuntimeError``.

    """
    if is_notebook():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    def search(detail_ty):
        # Search final reports of the given detailed disclosure type.
        return search_filings(corp_code=corp_code,
                              bgn_de=bgn_de,
                              end_de=end_de,
                              pblntf_detail_ty=detail_ty,
                              page_count=100,
                              last_reprt_at='Y')

    def merge_all(current, labels, filings, desc):
        # Merge every filing into the statements gathered so far.
        for report in tqdm(filings, desc=desc, unit='report'):
            current, labels = merge_fs(current,
                                       labels,
                                       report,
                                       fs_tp=fs_tp,
                                       separate=separate,
                                       lang=lang)
        return current, labels

    try:
        # Annual reports (final versions)
        reports = search('A001')
    except NoDataReceived:
        # Fall back to audit reports
        try:
            reports = search('F001' if separate else 'F002')
        except NoDataReceived as err:
            raise RuntimeError('Could not find an annual report') from err

    if len(reports) == 0:
        raise RuntimeError('Could not find an annual report')

    # Result of the financial statement search
    statements = None
    next_index = 0
    for idx, latest_report in enumerate(reports):
        # For the most recent report, use its XBRL file when there is one
        latest_xbrl = latest_report.xbrl
        if latest_xbrl is not None:
            if separate is False and not latest_xbrl.exist_consolidated():
                raise NotFoundConsolidated(
                    'Could not find consolidated financial statements')

            # Initialize the statements from the XBRL data
            analyzed_results = analyze_xbrl(latest_report,
                                            fs_tp=fs_tp,
                                            separate=separate,
                                            lang=lang,
                                            show_abstract=False,
                                            show_class=True,
                                            show_depth=10,
                                            show_concept=True,
                                            separator=separator)
            statements = copy.deepcopy(analyzed_results)
        else:
            statements = analyze_html(latest_report,
                                      fs_tp=fs_tp,
                                      separate=separate,
                                      lang=lang)
        # A report that only records amendments carries no statements;
        # move on to the next report in that case
        if statements is not None:
            next_index = idx + 1
            break

    # NOTE(review): if no report yielded statements, ``statements`` is still
    # None here and iterating it below raises TypeError when ``separate`` is
    # False -- confirm the intended handling of that case.
    if separate is False and all(
            [statements[tp] is None for tp in statements]):
        raise NotFoundConsolidated(
            'Could not find consolidated financial statements')

    label_df = None
    statements, label_df = merge_all(statements, label_df,
                                     reports[next_index:], 'Annual reports')

    if str_compare(report_tp, 'half') or str_compare(report_tp, 'quarter'):
        statements, label_df = merge_all(statements, label_df,
                                         search('A002'), 'Semiannual reports')

    if str_compare(report_tp, 'quarter'):
        statements, label_df = merge_all(statements, label_df,
                                         search('A003'), 'Quarterly report')

    statements = drop_empty_columns(statements)
    label_df = drop_empty_columns(label_df)

    statements = sorting_columns(statements)
    label_df = sorting_columns(label_df)

    info = {
        'corp_code': corp_code,
        'bgn_de': bgn_de,
        'end_de': end_de,
        'separate': separate,
        'report_tp': report_tp,
        'lang': lang,
        'separator': separator
    }
    return FinancialStatement(statements, label_df, info)