예제 #1
0
class AriesParser():
    """Parse the plain text of a two-column financial statement.

    The text is preprocessed with AccountUtils, tokenized with Scanner, cut
    into lines at every TK_EOL token, and every line is then recognised by the
    exact sequence of its token types.
    """

    # Lines made only of these token types carry no data and are skipped.
    _IGNORED_SIGNATURES = frozenset([
        ('TK_EOL',),
        ('TK_SEPERATION', 'TK_EOL'),
    ])
    _TWO_ACCOUNTS = ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL')
    _THREE_ACCOUNTS = ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL')

    def __init__(self, text):
        """Remember the raw statement text and create the helper objects."""
        self.text = text
        # Not read anywhere inside this class.
        self.head_splitted_account = None
        self.account_utils = AccountUtils()
        self.string_utils = StringUtils()

    def parse(self):
        """Parse ``self.text``.

        Returns:
            A ``(column_name_list, row_list)`` tuple.

        Raises:
            ValueError: a line after the header has an unsupported token
                sequence.
            AssertionError: no header line or no data row was found.
        """
        text = self.__preprocess_text(self.text)
        lines = self.__scan_lines(text)
        return self.__parse_lines(lines)

    def __preprocess_text(self, text):
        """Run the AccountUtils clean-up passes over the raw text."""
        text = self.account_utils.concat_account(text)
        text = self.account_utils.remove_eten_separation(text)
        return text

    def __scan_lines(self, text):
        """Tokenize text and group the tokens into lines.

        A line is closed by (and includes) a TK_EOL token.  Tokens that follow
        the last TK_EOL are not returned.
        """
        scanner = Scanner(Source(text))
        scanner.scan()

        lines = []
        tokens_in_line = []
        for token in scanner.get_tokens():
            tokens_in_line.append(token)
            if token.get_token_type() == 'TK_EOL':
                lines.append(tokens_in_line)
                tokens_in_line = []
        return lines

    def __row_parsers(self):
        """Map the token-type signature of a data line to its row parser."""
        return {
            ('TK_ACCOUNT', 'TK_EOL'):
                self.__parse_account_line,
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_number_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
             'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_paren_number_number_line,
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER',
             'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_number_paren_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
             'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_paren_number_paren_number_line,
        }

    def __header_date_tokens(self, line, signature):
        """Return the two tokens of a candidate header line, or None.

        A two-account line uses both accounts; a three-account line uses the
        last two.
        """
        if signature == self._TWO_ACCOUNTS:
            return line[:2]
        if signature == self._THREE_ACCOUNTS:
            return line[1:3]
        return None

    def __parse_lines(self, lines):
        """Turn token lines into ``(column_name_list, row_list)``.

        Lines before the header are ignored unless they parse as the header.
        After the header every line must match a known row signature.

        Raises:
            ValueError: a line after the header has an unsupported signature.
            AssertionError: no header or no row was parsed.  Raised explicitly
                so the check still runs under ``python -O``.
        """
        row_parsers = self.__row_parsers()
        column_name_list = None
        row_list = []
        for line in lines:
            signature = tuple(token.get_token_type() for token in line)
            if signature in self._IGNORED_SIGNATURES:
                continue

            # Still looking for the header line.
            if column_name_list is None:
                date_tokens = self.__header_date_tokens(line, signature)
                if date_tokens is not None:
                    try:
                        column_name_list = self.__parse_column_name_list(date_tokens)
                    except Exception:
                        # Deliberate best effort: an account-only line that
                        # does not hold two parsable periods is simply not the
                        # header, so keep scanning.
                        pass
                continue

            # Header found: everything else must be a data row.
            if signature == self._THREE_ACCOUNTS:
                continue
            parser = row_parsers.get(signature)
            if parser is None:
                raise ValueError(
                    'Unexpected line of token types %r' % (list(signature),))
            row_list.append(parser(line))

        if column_name_list is None:
            raise AssertionError('We should parse column name list')
        if not row_list:
            raise AssertionError('We should parse some rows')
        return column_name_list, row_list

    def __parse_column_name_list(self, stmt_date_list):
        """Build the column names from the two statement-period tokens.

        The first column is the fixed account caption; each following column is
        the second element of the period returned by
        ``StringUtils.from_local_string_to_date_period``.

        Raises:
            AssertionError: stmt_date_list does not hold exactly two tokens.
        """
        if len(stmt_date_list) != 2:
            raise AssertionError('There shouble be 2 statement dates')
        column_name_list = [u'會計科目']
        for stmt_date_token in stmt_date_list:
            date_period = self.string_utils.from_local_string_to_date_period(
                stmt_date_token.get_value())
            column_name_list.append(date_period[1])
        return column_name_list

    def __number(self, token):
        """Return the normalized numeric value of a TK_NUMBER token."""
        return self.string_utils.normalize_number(token.get_value())

    # ['TK_ACCOUNT', 'TK_EOL']
    def __parse_account_line(self, line):
        """Row holding only the account name."""
        return [line[0].get_value()]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_number_number_line(self, line):
        """Row with two plain numbers."""
        return [
            line[0].get_value(),
            self.__number(line[1]),
            self.__number(line[2]),
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_paren_number_number_line(self, line):
        """Row whose first number is parenthesized and therefore negated."""
        return [
            line[0].get_value(),
            -self.__number(line[2]),
            self.__number(line[4]),
        ]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_number_paren_number_line(self, line):
        """Row whose second number is parenthesized and therefore negated."""
        return [
            line[0].get_value(),
            self.__number(line[1]),
            -self.__number(line[3]),
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_paren_number_paren_number_line(self, line):
        """Row whose two numbers are both parenthesized and therefore negated."""
        return [
            line[0].get_value(),
            -self.__number(line[2]),
            -self.__number(line[5]),
        ]
class XbrlIncomeStatementAssembler():
    """Assemble an IncomeStatementDao from an income statement HTML page."""

    def __init__(self):
        """Set the XPath of the statement table and create the string helper."""
        self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO for one page.

        Args:
            param: mapping with the keys 'content' (page HTML),
                'stock_symbol' and 'date'.

        Returns:
            IncomeStatementDao built from the column names, the rows and the
            given stock symbol and date.
        """
        raw_content = param['content']
        stock_symbol = param['stock_symbol']
        date = param['date']

        normalized = self.string_utils.normalize_string(raw_content)
        table = self.__traverse_to_relative_html_object(
            lxml.html.fromstring(normalized))
        column_name_list = self.__assemble_column_name_list(table)
        row_list = self.__assemble_row_list(table)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the header captions: account caption, then one period per column."""
        # The table must have exactly two header rows.
        head_rows = relative_html_object.xpath('./tr[@class="tblHead"]')
        assert len(head_rows) == 2, 'invalid tr_tags'

        # The second header row must hold the single statement title.
        title_texts = head_rows[1].xpath('./th/text()')
        assert len(title_texts) == 1, 'invalid statement_th_texts'
        assert unicode(title_texts[0]) == u'綜合損益表', 'invalid statement_th_texts[0]'

        # The first header row holds the account caption followed by the
        # period captions, which are converted by StringUtils.
        caption_texts = head_rows[0].xpath('./th/text()')
        return [caption_texts[0]] + [
            self.string_utils.from_local_string_to_date_period(caption)
            for caption in caption_texts[1:]
        ]

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row per ``tr`` after the first two."""
        row_list = []
        # The first two rows are skipped (statement name and column names).
        for tr_tag in relative_html_object.xpath('./tr')[2:]:
            row_list.append(self.__assemble_row(tr_tag))
        return row_list

    def __assemble_row(self, relative_html_object):
        """Return ``[account name, number, ...]`` for one table row."""
        cell_texts = relative_html_object.xpath('./td/text()')
        # First cell is the stripped account name; the rest are normalized
        # as numbers.
        return [cell_texts[0].strip()] + [
            self.string_utils.normalize_number(cell)
            for cell in cell_texts[1:]
        ]
예제 #3
0
class StringUtilsTest(unittest.TestCase):
    """Unit tests for the number, date and regex helpers of StringUtils."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _assert_cases(self, convert, cases):
        """Assert ``convert(raw) == expected`` for every ``(raw, expected)`` pair."""
        for raw, expected in cases:
            actual = convert(raw)
            self.assertEqual(actual, expected)

    def test_normalize_arabic_number(self):
        """Thousands separators, signs, parentheses and decimals."""
        self._assert_cases(self.string_utils.normalize_number, [
            ("33,825,315", 33825315),
            ("0", 0),
            ("-115,859,592", -115859592),
            ("(27,540)", -27540),
            ("2.85", 2.85),
            ("170,270,395.00", 170270395),
            ("(  10,117,111)", -10117111),
        ])

    def test_normalize_none_number(self):
        """Placeholders that carry no number normalize to None."""
        self._assert_cases(self.string_utils.normalize_number, [
            (u"-", None),
            (u"", None),
            (u"不適用", None),
        ])

    def test_normalize_chinese_number(self):
        """Chinese numerals are converted to integers."""
        self._assert_cases(self.string_utils.normalize_number, [
            (u"九十九", 99),
            (u"九十", 90),
            (u"三", 3),
        ])

    def test_normalize_percentage(self):
        """A percentage is converted to its fraction."""
        actual = self.string_utils.normalize_number(u"20.92%")
        expected = 0.2092
        self.assertAlmostEqual(actual, expected)

    def test_from_local_string_to_date(self):
        """Full dates, month-only and year-only strings."""
        self._assert_cases(self.string_utils.from_local_string_to_date, [
            (u"2013年12月31日", datetime.date(2013, 12, 31)),
            (u"2012年01月01日", datetime.date(2012, 1, 1)),
            ("1962/02/09", datetime.date(1962, 2, 9)),
            (u"2015/08/13", datetime.date(2015, 8, 13)),
            # Month-only and year-only inputs map to the last day of the period.
            (u"民國103年09月", datetime.date(2014, 9, 30)),
            (u"104", datetime.date(2015, 12, 31)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        """ROC-era years are shifted to the Gregorian calendar."""
        self._assert_cases(self.string_utils.from_local_string_to_date, [
            (u"99年09月30日", datetime.date(2010, 9, 30)),
            (u"102/05/07", datetime.date(2013, 5, 7)),
        ])

    def test_from_local_string_to_date_interval(self):
        """An explicit from/to string becomes a (start, end) pair."""
        actual = self.string_utils.from_local_string_to_date_period(u"2013年01月01日至2013年12月31日")
        expected = datetime.date(2013, 1, 1), datetime.date(2013, 12, 31)
        self.assertEqual(actual, expected)

    def test_roc_era_from_local_string_to_date_period(self):
        """ROC-era quarter, half-year and full-year captions."""
        self._assert_cases(self.string_utils.from_local_string_to_date_period, [
            (u"九十八年前三季", (datetime.date(2009, 1, 1), datetime.date(2009, 9, 30))),
            (u"九十八年第一季", (datetime.date(2009, 1, 1), datetime.date(2009, 3, 31))),
            (u"100年第一季", (datetime.date(2011, 1, 1), datetime.date(2011, 3, 31))),
            (u"100年上半年度", (datetime.date(2011, 1, 1), datetime.date(2011, 6, 30))),
            (u"99年上半年度", (datetime.date(2010, 1, 1), datetime.date(2010, 6, 30))),
            (u"100年前三季", (datetime.date(2011, 1, 1), datetime.date(2011, 9, 30))),
            (u"100年度", (datetime.date(2011, 1, 1), datetime.date(2011, 12, 31))),
        ])

    def test_from_date_to_roc_era_string(self):
        actual = self.string_utils.from_date_to_roc_era_string(datetime.date(2001, 1, 1))
        expected = "90"
        self.assertEqual(actual, expected)

    def test_from_date_to_2_digit_month_string(self):
        self._assert_cases(self.string_utils.from_date_to_2_digit_month_string, [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 10, 31), "10"),
        ])

    def test_from_date_to_2_digit_quarter_string(self):
        """First and last day of every quarter, zero-padded."""
        self._assert_cases(self.string_utils.from_date_to_2_digit_quarter_string, [
            # spring
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 3, 31), "01"),
            # summer
            (datetime.date(2001, 4, 1), "02"),
            (datetime.date(2001, 6, 30), "02"),
            # fall
            (datetime.date(2001, 7, 1), "03"),
            (datetime.date(2001, 9, 30), "03"),
            # winter
            (datetime.date(2001, 10, 1), "04"),
            (datetime.date(2001, 12, 31), "04"),
        ])

    def test_from_date_to_1_digit_quarter_string(self):
        """First and last day of every quarter, single digit."""
        self._assert_cases(self.string_utils.from_date_to_1_digit_quarter_string, [
            # spring
            (datetime.date(2001, 1, 1), "1"),
            (datetime.date(2001, 3, 31), "1"),
            # summer
            (datetime.date(2001, 4, 1), "2"),
            (datetime.date(2001, 6, 30), "2"),
            # fall
            (datetime.date(2001, 7, 1), "3"),
            (datetime.date(2001, 9, 30), "3"),
            # winter
            (datetime.date(2001, 10, 1), "4"),
            (datetime.date(2001, 12, 31), "4"),
        ])

    def test_is_match_seperation(self):
        """Separator lines consist only of '-', '=' and spaces."""
        pattern = u"^(-| |=)*$"
        self.assertTrue(self.string_utils.is_match(pattern, u"======      ======"))
        self.assertTrue(self.string_utils.is_match(pattern, u"------      ------"))
        self.assertFalse(self.string_utils.is_match(pattern, u"同時影響現金及非現金項目之投資活動:"))

    def test_match_account(self):
        """An account caption ending in a colon yields the caption itself."""
        # The backslash is doubled so the literal holds the regex class \s
        # without relying on an unrecognized string escape.
        pattern = u"^([^\\s]*):$"
        actual = self.string_utils.match(pattern, u"營業活動之現金流量:")
        expected = [u"營業活動之現金流量"]
        self.assertEqual(actual, expected)