Exemplo n.º 1
0
    def extract_half_infobox(sideid):
        """Yield ``(title, content)`` pairs from one ``<dl>`` of the "basic-info" box.

        Rows are addressed positionally: the n-th ``<dt>`` is the title and the
        n-th ``<dd>`` is the content of row n, both looked up under
        ``//*[@class="basic-info"]/dl[sideid]`` on the enclosing scope's
        ``root`` (presumably an lxml tree -- confirm against the caller).

        :param sideid: 1-based index of the ``<dl>`` column to walk.
        :returns: generator of ``(remove_links(cleanup_verb(title)), content)``.
        :raises ValueError: if a row does not have exactly one title and one
            content element (the both-missing case ends the iteration instead).
        """
        title_xpath = '//*[@class="basic-info"]/dl[%d]/dt[%%d]' % sideid
        content_xpath = '//*[@class="basic-info"]/dl[%d]/dd[%%d]' % sideid

        row = 1
        while True:
            title_nodes = root.xpath(title_xpath % row)
            content_nodes = root.xpath(content_xpath % row)
            n_titles = len(title_nodes)
            n_contents = len(content_nodes)

            # Running off the end of the column: no title and no content.
            if n_titles == 0 and n_contents == 0:
                return

            # Anything other than a clean 1:1 pairing is a structural error.
            if n_titles != 1 or n_contents != 1:
                raise ValueError(
                    'extractor2: should exit with (len(bititle)==0 and len(bicontent)==0), but '
                    'got (len(bititle)=%d, len(bicontent)=%d)' % (n_titles, n_contents))

            # A nested <dl> inside the content cell means the value is stored
            # in an expandable sub-list; read the first nested <dd> instead.
            has_expanded_value = len(content_nodes[0].xpath('.//dl')) > 0
            if has_expanded_value:
                content_nodes = content_nodes[0].xpath('.//dd')

            title_text = get_inner_text_with_hrefs(title_nodes[0])
            content_text = get_inner_text_with_hrefs(content_nodes[0])

            if has_expanded_value:
                # Drop the trailing two-character toggle label, if present.
                content_text = content_text.strip()
                if content_text[-2:] == u'收起':
                    content_text = content_text[:-2]

            yield remove_links(cleanup_verb(title_text)), content_text

            row += 1
Exemplo n.º 2
0
    def extract_half_infobox(sideid):
        """Generate ``(title, content)`` tuples for column ``sideid`` of "basic-info".

        For index 1, 2, 3, ... the function queries ``root`` (a name taken from
        the enclosing scope) for ``dl[sideid]/dt[index]`` and
        ``dl[sideid]/dd[index]`` below the element whose class is
        ``basic-info``, and stops at the first index where both are absent.

        :param sideid: 1-based position of the ``<dl>`` to read.
        :returns: generator yielding the cleaned title (links removed) and the
            content text as produced by ``get_inner_text_with_hrefs``.
        :raises ValueError: when the title/content counts for an index are
            neither (0, 0) nor (1, 1).
        """
        dt_template = '//*[@class="basic-info"]/dl[%d]/dt[%%d]' % sideid
        dd_template = '//*[@class="basic-info"]/dl[%d]/dd[%%d]' % sideid

        index = 0
        while True:
            index += 1
            dt_nodes = root.xpath(dt_template % index)
            dd_nodes = root.xpath(dd_template % index)
            counts = (len(dt_nodes), len(dd_nodes))

            if counts == (0, 0):
                # Nothing left in this column.
                return
            if counts != (1, 1):
                message = 'extractor2: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                          'got (len(bititle)=%d, len(bicontent)=%d)' % counts
                raise ValueError(message)

            # When the <dd> wraps a nested <dl>, the real value lives in the
            # nested <dd> elements; switch to those and remember to trim the
            # toggle label afterwards.
            strip_toggle_label = len(dd_nodes[0].xpath('.//dl')) > 0
            if strip_toggle_label:
                dd_nodes = dd_nodes[0].xpath('.//dd')

            raw_title = get_inner_text_with_hrefs(dt_nodes[0])
            value = get_inner_text_with_hrefs(dd_nodes[0])

            if strip_toggle_label:
                value = value.strip()
                if value[-2:] == u'收起':
                    value = value[:-2]

            yield remove_links(cleanup_verb(raw_title)), value
Exemplo n.º 3
0
    def extract_half_infobox(sideid):
        """Yield ``(title, content)`` pairs from one half of the ``baseInfoWrapDom`` box.

        Rows are addressed positionally under
        ``//*[@id="baseInfoWrapDom"]/div[sideid]/div[n]`` on ``root`` (taken
        from the enclosing scope): the title is the row's ``div/span`` and the
        content is its ``div/div``.  Iteration stops at the first row where
        both lookups return nothing.

        :param sideid: 1-based index of the half (column) to walk.
        :returns: generator of ``(remove_links(cleanup_verb(title)), content)``.
        :raises ValueError: if a row has an unexpected number of title/content
            elements, or if a multi-line row does not contain exactly one
            ``biOpenContent`` element.
        """
        bititle_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/span' % sideid
        bicontent_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/div' % sideid
        bicontent_multiline_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div[@class="biOpenItem"]/div[@class="biOpenItemCon"]/div[@class="biOpenContent"]' % sideid

        counter = 1
        while True:
            bititle_elements = root.xpath(bititle_xpath_template % counter)
            bicontent_elements = root.xpath(bicontent_xpath_template % counter)
            title_count = len(bititle_elements)
            content_count = len(bicontent_elements)

            # No title and no content: we have walked past the last row.
            if title_count == 0 and content_count == 0:
                break

            if title_count == 1 and content_count > 1:
                # Some rows hold multi-line content; the generic content XPath
                # then matches several divs, so use the dedicated one instead.
                bicontent_elements_multiline = root.xpath(bicontent_multiline_xpath_template % counter)
                if len(bicontent_elements_multiline) != 1:
                    # The two literals previously joined as "butgot"; the
                    # trailing space after "but" fixes the message.
                    error = 'extractor1: should find len(bicontent_elements_multiline)==1, but '\
                            'got %d' % len(bicontent_elements_multiline)
                    raise ValueError(error)

                bicontent_elements = bicontent_elements_multiline
            elif title_count != 1 or content_count != 1:
                # Any other mismatch is a page structure we do not understand.
                error = 'extractor1: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                        'got (len(bititle)=%d, len(bicontent)=%d)' % (title_count, content_count)
                raise ValueError(error)

            bititle = get_inner_text_with_hrefs(bititle_elements[0])
            bicontent = get_inner_text_with_hrefs(bicontent_elements[0])
            yield remove_links(cleanup_verb(bititle)), bicontent

            counter += 1
Exemplo n.º 4
0
    def extract_half_infobox(sideid):
        """Generate ``(title, content)`` tuples for half ``sideid`` of ``baseInfoWrapDom``.

        For counter 1, 2, 3, ... the function queries ``root`` (a name from the
        enclosing scope) for the row's title (``div/span``) and content
        (``div/div``) beneath ``//*[@id="baseInfoWrapDom"]/div[sideid]``.  The
        generator finishes at the first counter for which neither exists.

        :param sideid: 1-based index of the half (column) to read.
        :returns: generator yielding the cleaned title (links removed) and the
            content text produced by ``get_inner_text_with_hrefs``.
        :raises ValueError: when a row's element counts are inconsistent, or a
            multi-line row does not expose exactly one ``biOpenContent`` div.
        """
        bititle_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/span' % sideid
        bicontent_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/div' % sideid
        bicontent_multiline_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div[@class="biOpenItem"]/div[@class="biOpenItemCon"]/div[@class="biOpenContent"]' % sideid

        counter = 1
        while True:
            bititle_elements = root.xpath(bititle_xpath_template % counter)
            bicontent_elements = root.xpath(bicontent_xpath_template % counter)
            found = (len(bititle_elements), len(bicontent_elements))

            if found == (0, 0):
                # Both lookups are empty: the column is exhausted.
                break

            if found != (1, 1):
                if found[0] == 1 and found[1] > 1:
                    # Multi-line content: several divs match the generic
                    # content XPath, so fetch the single expanded-content div.
                    bicontent_elements_multiline = root.xpath(
                        bicontent_multiline_xpath_template % counter)
                    if len(bicontent_elements_multiline) != 1:
                        # A space now separates "but" and "got"; the original
                        # adjacent literals rendered as "butgot".
                        error = 'extractor1: should find len(bicontent_elements_multiline)==1, but '\
                                'got %d' % len(bicontent_elements_multiline)
                        raise ValueError(error)

                    bicontent_elements = bicontent_elements_multiline
                else:
                    # Unexpected page structure.
                    error = 'extractor1: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                            'got (len(bititle)=%d, len(bicontent)=%d)' % found
                    raise ValueError(error)

            bititle = get_inner_text_with_hrefs(bititle_elements[0])
            bicontent = get_inner_text_with_hrefs(bicontent_elements[0])
            yield remove_links(cleanup_verb(bititle)), bicontent

            counter += 1