Пример #1
0
class TaurusParser():
    """Parse an HTML report into column names, data rows and a release date.

    All lookups are relative to the first node matched by ``base_xpath``.
    The second ``<table>`` under that node (the first one is a description
    about IFRS) wraps one nested table per industry; the last ``<div>``
    carries the release date.

    Structural expectations are enforced with ``assert``, so a page that does
    not match the expected layout raises ``AssertionError``.
    """

    def __init__(self):
        self.base_xpath = '//html/body/center'
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Parse the raw page and return its tabular data.

        Args:
            content: HTML text of the page.

        Returns:
            A ``(column_name_list, row_list, release_date)`` tuple.

        Raises:
            AssertionError: if the page does not have the expected layout.
        """
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__parse_column_name_list(relative_html_object)
        row_list = self.__parse_row_list(relative_html_object)
        release_date = self.__parse_release_date(relative_html_object)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and build the lxml tree."""
        content = self.string_utils.normalize_string(content)
        # remove <br> so that a cell's text is not split around the tag
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __find_industry_tables(self, relative_html_object):
        """Return the nested tables (one per industry) of the second table."""
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 1, 'invalid table_tags'

        # skip first table of description about IFRS
        inner_table_tags = table_tags[1].xpath('./tr/td/table/tr/td/table')
        assert len(inner_table_tags) > 0, 'invalid inner_table_tags'
        return inner_table_tags

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``<th>`` texts of the second row of the first industry table."""
        inner_table_tags = self.__find_industry_tables(relative_html_object)

        tr_tags = inner_table_tags[0].xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'

        return tr_tags[1].xpath('./th/text()')

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every industry table, in page order."""
        all_tr_tags = []
        # every inner_table represents an industry
        for inner_table_tag in self.__find_industry_tables(relative_html_object):
            tr_tags = inner_table_tag.xpath('./tr')
            assert len(tr_tags) > 2, 'invalid tr_tags'
            # first two rows are headers
            # last row is u'合計'
            all_tr_tags += tr_tags[2:-1]
        return [self.__parse_row(tr_tag) for tr_tag in all_tr_tags]

    def __parse_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[item, item, number, ...]``.

        The first two cells are kept as text, the remaining cells go through
        ``string_utils.normalize_number`` and the trailing comment cell is
        dropped.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        # record contains extra entry about comment
        assert len(td_texts) == 11, 'invalid td_texts size, should be 11'

        normalize_number = self.string_utils.normalize_number
        # skip the last entry about comment
        return td_texts[:2] + [normalize_number(td_text) for td_text in td_texts[2:-1]]

    def __parse_release_date(self, relative_html_object):
        """Return the release date announced by the last ``<div>``."""
        div_tags = relative_html_object.xpath('./div')
        assert len(div_tags) > 0, 'invalid div_tags'

        # .text is None when the div has no text before its first child;
        # fail with the same error type as the other layout checks instead
        # of an AttributeError on None.strip()
        div_text = div_tags[-1].text
        assert div_text is not None, 'invalid div_tags, last div has no text'

        groups = self.string_utils.match(u'^出表日期:(.*)$', div_text.strip())
        assert len(groups) > 0, 'could not match ^出表日期:(.*)$'

        return self.string_utils.from_local_string_to_date(groups[0])
Пример #2
0
class StringUtilsTest(unittest.TestCase):
    """Unit tests for the number, date and regex helpers of StringUtils.

    Most tests are tables of ``(input, expected)`` pairs that are checked in
    order by ``_assert_each_equal``.
    """

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _assert_each_equal(self, func, cases):
        """Assert ``func(argument) == expected`` for every pair in ``cases``.

        The failure message names the offending input, because all cases of a
        table share the same assertion line.
        """
        for argument, expected in cases:
            actual = func(argument)
            self.assertEqual(
                actual, expected,
                '%r: got %r, expected %r' % (argument, actual, expected))

    def test_normalize_arabic_number(self):
        self._assert_each_equal(self.string_utils.normalize_number, [
            ("33,825,315", 33825315),
            ("0", 0),
            ("-115,859,592", -115859592),
            # parentheses denote a negative amount
            ("(27,540)", -27540),
            ("2.85", 2.85),
            ("170,270,395.00", 170270395),
            ("(  10,117,111)", -10117111),
        ])

    def test_normalize_none_number(self):
        self._assert_each_equal(self.string_utils.normalize_number, [
            (u"-", None),
            (u"", None),
            (u"不適用", None),
        ])

    def test_normalize_chinese_number(self):
        self._assert_each_equal(self.string_utils.normalize_number, [
            (u"九十九", 99),
            (u"九十", 90),
            (u"三", 3),
        ])

    def test_normalize_percentage(self):
        actual = self.string_utils.normalize_number(u"20.92%")
        expected = 0.2092
        self.assertAlmostEqual(actual, expected)

    def test_from_local_string_to_date(self):
        self._assert_each_equal(self.string_utils.from_local_string_to_date, [
            (u"2013年12月31日", datetime.date(2013, 12, 31)),
            (u"2012年01月01日", datetime.date(2012, 1, 1)),
            ("1962/02/09", datetime.date(1962, 2, 9)),
            (u"2015/08/13", datetime.date(2015, 8, 13)),
            # a year-and-month string is expected to give the month's last day
            (u"民國103年09月", datetime.date(2014, 9, 30)),
            # a bare year is expected to give the year's last day
            (u"104", datetime.date(2015, 12, 31)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self._assert_each_equal(self.string_utils.from_local_string_to_date, [
            (u"99年09月30日", datetime.date(2010, 9, 30)),
            (u"102/05/07", datetime.date(2013, 5, 7)),
        ])

    def test_from_local_string_to_date_interval(self):
        self._assert_each_equal(self.string_utils.from_local_string_to_date_period, [
            (u"2013年01月01日至2013年12月31日",
             (datetime.date(2013, 1, 1), datetime.date(2013, 12, 31))),
        ])

    def test_roc_era_from_local_string_to_date_period(self):
        self._assert_each_equal(self.string_utils.from_local_string_to_date_period, [
            (u"九十八年前三季", (datetime.date(2009, 1, 1), datetime.date(2009, 9, 30))),
            (u"九十八年第一季", (datetime.date(2009, 1, 1), datetime.date(2009, 3, 31))),
            (u"100年第一季", (datetime.date(2011, 1, 1), datetime.date(2011, 3, 31))),
            (u"100年上半年度", (datetime.date(2011, 1, 1), datetime.date(2011, 6, 30))),
            (u"99年上半年度", (datetime.date(2010, 1, 1), datetime.date(2010, 6, 30))),
            (u"100年前三季", (datetime.date(2011, 1, 1), datetime.date(2011, 9, 30))),
            (u"100年度", (datetime.date(2011, 1, 1), datetime.date(2011, 12, 31))),
        ])

    def test_from_date_to_roc_era_string(self):
        actual = self.string_utils.from_date_to_roc_era_string(datetime.date(2001, 1, 1))
        expected = "90"
        self.assertEqual(actual, expected)

    def test_from_date_to_2_digit_month_string(self):
        self._assert_each_equal(self.string_utils.from_date_to_2_digit_month_string, [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 10, 31), "10"),
        ])

    def test_from_date_to_2_digit_quarter_string(self):
        # first and last day of each quarter
        self._assert_each_equal(self.string_utils.from_date_to_2_digit_quarter_string, [
            # spring
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 3, 31), "01"),
            # summer
            (datetime.date(2001, 4, 1), "02"),
            (datetime.date(2001, 6, 30), "02"),
            # fall
            (datetime.date(2001, 7, 1), "03"),
            (datetime.date(2001, 9, 30), "03"),
            # winter
            (datetime.date(2001, 10, 1), "04"),
            (datetime.date(2001, 12, 31), "04"),
        ])

    def test_from_date_to_1_digit_quarter_string(self):
        # first and last day of each quarter
        self._assert_each_equal(self.string_utils.from_date_to_1_digit_quarter_string, [
            # spring
            (datetime.date(2001, 1, 1), "1"),
            (datetime.date(2001, 3, 31), "1"),
            # summer
            (datetime.date(2001, 4, 1), "2"),
            (datetime.date(2001, 6, 30), "2"),
            # fall
            (datetime.date(2001, 7, 1), "3"),
            (datetime.date(2001, 9, 30), "3"),
            # winter
            (datetime.date(2001, 10, 1), "4"),
            (datetime.date(2001, 12, 31), "4"),
        ])

    def test_is_match_seperation(self):
        pattern = u"^(-| |=)*$"
        self.assertTrue(self.string_utils.is_match(pattern, u"======      ======"))
        self.assertTrue(self.string_utils.is_match(pattern, u"------      ------"))
        self.assertFalse(self.string_utils.is_match(pattern, u"同時影響現金及非現金項目之投資活動:"))

    def test_match_account(self):
        # the backslash is doubled because "\s" is an invalid escape sequence
        # in a non-raw literal; the resulting pattern text is unchanged
        pattern = u"^([^\\s]*):$"
        actual = self.string_utils.match(pattern, u"營業活動之現金流量:")
        expected = [u"營業活動之現金流量"]
        self.assertEqual(actual, expected)
Пример #3
0
class StockSymbolAssembler():
    """Assemble a StockSymbolDao from the HTML of a stock symbol listing.

    All lookups are relative to the single node matched by ``base_xpath``.
    Structural expectations are enforced with ``assert``, so a page that does
    not match the expected layout raises ``AssertionError``.
    """

    def __init__(self):
        self.base_xpath = '//html/body'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build a StockSymbolDao from ``param['content']``.

        Args:
            param: mapping whose ``'content'`` entry is the HTML text.

        Returns:
            ``StockSymbolDao(column_name_list, row_list, release_date)``.

        Raises:
            AssertionError: if the page does not have the expected layout.
        """
        content = self.string_utils.normalize_string(param['content'])
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        release_date = self.__assemble_release_date(relative_html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return StockSymbolDao(column_name_list, row_list, release_date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the one node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_release_date(self, relative_html_object):
        """Return the date announced by the second headline of the first table."""
        # try to get release date
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 0, 'invalid table_tags'

        headline_tags = table_tags[0].xpath('./h2')
        # the date is read from the second headline, so at least two are required
        assert len(headline_tags) > 1, 'invalid headline_tags'

        headline_texts = headline_tags[1].xpath('./strong/center')
        assert len(headline_texts) > 0, 'invalid headline_texts'
        # .text is None when the node has no text before its first child
        headline_text = headline_texts[0].text
        assert headline_text is not None, 'invalid headline_texts, no text'

        groups = self.string_utils.match(u'^最近更新日期:(.*)$', headline_text.strip())
        assert len(groups) > 0, 'could not match ^最近更新日期:(.*)$'

        return self.string_utils.from_local_string_to_date(groups[0])

    def __assemble_column_name_list(self, relative_html_object):
        """Return the eight column names, splitting the combined first one."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        original_column_name_list = tr_tags[0].xpath('./td/text()')
        assert len(original_column_name_list) > 0, 'invalid original_column_name_list'

        # handle the first column name: '有價證券代號及名稱'
        combined_column_name = original_column_name_list[0].strip()
        assert combined_column_name == u'有價證券代號及名稱', 'should be 有價證券代號及名稱 in unicode'
        # the chinese character '及' means 'and' so we need to separate this column name
        seperated_column_name_list = combined_column_name.split(u'及')
        assert len(seperated_column_name_list) == 2

        column_name_list = seperated_column_name_list + original_column_name_list[1:]
        assert len(column_name_list) == 8, 'invalid column_name_list size, should be 8'
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return every data row of the listing table, in page order."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')[1:]

        rows = (self.__assemble_row(tr_tag) for tr_tag in tr_tags)
        # if there is only one cell '股票' in row, skip it
        return [row for row in rows if row]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into an eight-entry row.

        Returns ``None`` for a row that has a single cell.  Otherwise the
        first cell is split into two entries and the third cell is converted
        with ``string_utils.from_local_string_to_date``.
        """
        td_tags = relative_html_object.xpath('./td')

        # we could not handle empty string between td tag if we use xpath './td/text()'
        # so we need to check each td.text one by one.
        td_texts = self.__get_lxml_text_list(td_tags)

        # if there is only one cell '股票', return None
        if len(td_texts) == 1:
            return None

        # sanity check
        assert len(td_texts) == 7

        # handle the first cell: '有價證券代號及名稱'
        # it should be separated as stock symbol and stock name
        seperated_cell_list = td_texts[0].strip().split()
        assert len(seperated_cell_list) == 2

        # convert to datetime.date type
        listing_date = self.string_utils.from_local_string_to_date(td_texts[2])

        return seperated_cell_list + [td_texts[1], listing_date] + td_texts[3:]

    def __get_lxml_text_list(self, tag_list):
        """Return each tag's ``.text``, with ``''`` in place of ``None``."""
        return ['' if tag.text is None else tag.text for tag in tag_list]
Пример #4
0
class GeminiParser:
    """Parse an HTML report into column names, data rows and a release date.

    All lookups are relative to the first node matched by ``base_xpath``.
    The second ``<table>`` under that node (the first one is a description
    about IFRS) wraps one nested table per industry; the last ``<div>``
    carries the release date.

    Structural expectations are enforced with ``assert``, so a page that does
    not match the expected layout raises ``AssertionError``.
    """

    def __init__(self):
        self.base_xpath = "//html/body/center"
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Parse the raw page and return its tabular data.

        Args:
            content: HTML text of the page.

        Returns:
            A ``(column_name_list, row_list, release_date)`` tuple.

        Raises:
            AssertionError: if the page does not have the expected layout.
        """
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__parse_column_name_list(relative_html_object)
        row_list = self.__parse_row_list(relative_html_object)
        release_date = self.__parse_release_date(relative_html_object)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and build the lxml tree."""
        content = self.string_utils.normalize_string(content)
        # remove <br> so that a cell's text is not split around the tag
        content = content.replace(u"<br>", u"")
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, "invalid base_xpath"
        return relative_html_object_list[0]

    def __find_industry_tables(self, relative_html_object):
        """Return the nested tables (one per industry) of the second table."""
        table_tags = relative_html_object.xpath("./table")
        assert len(table_tags) > 1, "invalid table_tags"

        # skip first table of description about IFRS
        inner_table_tags = table_tags[1].xpath("./tr/td/table/tr/td/table")
        assert len(inner_table_tags) > 0, "invalid inner_table_tags"
        return inner_table_tags

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``<th>`` texts of the second row of the first industry table."""
        inner_table_tags = self.__find_industry_tables(relative_html_object)

        tr_tags = inner_table_tags[0].xpath("./tr")
        assert len(tr_tags) > 1, "invalid tr_tags"

        return tr_tags[1].xpath("./th/text()")

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every industry table, in page order."""
        all_tr_tags = []
        # every inner_table represents an industry
        for inner_table_tag in self.__find_industry_tables(relative_html_object):
            tr_tags = inner_table_tag.xpath("./tr")
            assert len(tr_tags) > 2, "invalid tr_tags"
            # first two rows are headers
            # last row is u'合計'
            all_tr_tags += tr_tags[2:-1]
        return [self.__parse_row(tr_tag) for tr_tag in all_tr_tags]

    def __parse_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[item, item, number, ...]``.

        The first two cells are kept as text; the remaining eight go through
        ``string_utils.normalize_number``.
        """
        td_texts = relative_html_object.xpath("./td/text()")
        assert len(td_texts) == 10, "invalid td_texts size, should be 10"

        normalize_number = self.string_utils.normalize_number
        return td_texts[:2] + [normalize_number(td_text) for td_text in td_texts[2:]]

    def __parse_release_date(self, relative_html_object):
        """Return the release date announced by the last ``<div>``."""
        div_tags = relative_html_object.xpath("./div")
        assert len(div_tags) > 0, "invalid div_tags"

        # .text is None when the div has no text before its first child;
        # fail with the same error type as the other layout checks instead
        # of an AttributeError on None.strip()
        div_text = div_tags[-1].text
        assert div_text is not None, "invalid div_tags, last div has no text"

        groups = self.string_utils.match(u"^出表日期:(.*)$", div_text.strip())
        assert len(groups) > 0, "could not match ^出表日期:(.*)$"

        return self.string_utils.from_local_string_to_date(groups[0])