def _fetch_data(self, entry_name, url):
        """Fetch bar statistics (visits, posts, groups) from *url*.

        Downloads the page, sniffs its encoding with UnicodeDammit so lxml
        parses it correctly, then scrapes the bar name and three counters.
        Two sets of CSS-class selectors are tried because the site serves
        two different page layouts.

        Args:
            entry_name: Name used for the lookup; may be replaced by the
                name scraped from the page itself.
            url: Page URL (unicode; encoded to UTF-8 for urllib2).

        Returns:
            (entry_name, data) on success, where ``data`` holds the integer
            counters plus the (possibly updated) entry name and the URL;
            (None, None) when the download fails with an HTTP/URL error.
        """
        def _count_from(elements):
            # First number found in the first matched element's text, else 0.
            if isinstance(elements, list) and elements:
                return cogtu_misc.get_first_number_from_text(
                    elements[0].text_content())
            return 0

        original_entry_name = entry_name
        try:
            with contextlib.closing(
                    urllib2.urlopen(url.encode('utf-8'))) as page_source:
                page_content = page_source.read()
            # Detect the real encoding before handing the bytes to lxml;
            # the site does not always declare it correctly.
            dammit = UnicodeDammit(page_content, is_html=True)
            parser = lxml.html.HTMLParser(encoding=dammit.original_encoding)
            doc = lxml.html.document_fromstring(page_content, parser=parser)

            # Each selector pair: new layout first, old "card" layout as
            # fallback.
            bar_name = (doc.xpath('//a[contains(@class, "star_title_h3")]')
                        or doc.xpath('//a[contains(@class, "card_title_fname")]'))
            if isinstance(bar_name, list) and bar_name:
                entry_name = bar_name[0].text_content().strip()

            num_visits = _count_from(
                doc.xpath('//span[contains(@class, "j_visit_num")]')
                or doc.xpath('//span[contains(@class, "card_menNum")]'))
            num_posts = _count_from(
                doc.xpath('//span[contains(@class, "j_post_num")]')
                or doc.xpath('//span[contains(@class, "card_infoNum")]'))
            num_groups = _count_from(
                doc.xpath("//a[contains(@class, 'star_nav_ico_group')]/span"))
        except urllib2.HTTPError:
            logging.info('urllib2.HTTPError. Skip.')
            return None, None
        except urllib2.URLError:
            logging.info('urllib2.URLError. Skip.')
            return None, None

        data = {
            'num_visits': int(num_visits),
            'num_posts': int(num_posts),
            'num_groups': int(num_groups),
            'entry_name': entry_name,
            'original_entry_name': original_entry_name,
            'url': url,
        }
        return entry_name, data
# ---- Example 2: separate, unrelated script fragment (hOCR processing) ----
        padding = padding * 4

# Text-file pattern: default to appending '.txt'; when the pattern ends in a
# three-character extension (e.g. '.png'), swap that extension for 'txt'.
tpattern = args.pattern + '.txt'
if args.pattern[-4] == '.':
    tpattern = args.pattern[:-3] + 'txt'

# Parse the input document.  With --unicodedammit, sniff the encoding first
# so lxml decodes the bytes correctly; otherwise let lxml parse directly.
if args.unicodedammit:
    from bs4 import UnicodeDammit
    raw_bytes = args.file.read()
    guessed = UnicodeDammit(raw_bytes, is_html=True)
    enc_parser = html.HTMLParser(encoding=guessed.original_encoding)
    doc = html.document_fromstring(raw_bytes, parser=enc_parser)
else:
    doc = html.parse(args.file)

pages = doc.xpath('//*[@class="ocr_page"]')
for page in pages:
    iname = get_prop(page, 'file')
    if not iname:
        iname = get_prop(page, 'image')
    if args.basename:
        iname = os.path.join(args.basename, os.path.basename(iname))
    if not os.path.exists(iname):
        print("not found:", iname)
        sys.exit(1)
    image = Image.open(iname)
    lines = page.xpath("//*[@class='%s']" % args.element)
    lcount = 1
    for line in lines:
        bbox = [int(x) for x in get_prop(line, 'bbox').split()]
        if padding is not None: