def _fetch_data(self, entry_name, url):
    """Fetch one bar page and scrape its summary statistics.

    Downloads *url*, sniffs the page encoding with UnicodeDammit, re-parses
    with lxml using that encoding, and extracts the bar's display name plus
    three counters (visits, posts, groups).  Two page layouts are supported:
    the "star_*" layout is tried first, then the older "card_*" layout.

    :param entry_name: fallback name for the bar; replaced by the scraped
        title when one is found.
    :param url: page URL (unicode; encoded to UTF-8 before the request).
    :returns: ``(entry_name, data)`` where *data* maps ``num_visits``,
        ``num_posts``, ``num_groups``, ``entry_name``,
        ``original_entry_name`` and ``url``; ``(None, None)`` when the
        request fails with an HTTP or URL error.
    """
    original_entry_name = entry_name
    data = dict()

    def _first_number(nodes):
        # Shared scrape pattern: take the first matched node's text and
        # pull the leading number out of it; 0 when nothing matched.
        if isinstance(nodes, list) and len(nodes) > 0:
            return cogtu_misc.get_first_number_from_text(
                nodes[0].text_content())
        return 0

    try:
        with contextlib.closing(
                urllib2.urlopen(url.encode('utf-8'))) as page_source:
            page_content = page_source.read()
        # Let UnicodeDammit detect the real encoding, then re-parse the raw
        # bytes with lxml configured for that encoding.
        doc = UnicodeDammit(page_content, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.original_encoding)
        doc = lxml.html.document_fromstring(page_content, parser=parser)

        # Bar title: new-style page first, then the older card layout.
        bar_name = doc.xpath('//a[contains(@class, "star_title_h3")]')
        if not bar_name:
            bar_name = doc.xpath('//a[contains(@class, "card_title_fname")]')
        if isinstance(bar_name, list) and len(bar_name) > 0:
            entry_name = bar_name[0].text_content().strip()

        # Counters, each with a new-layout selector and a card-layout
        # fallback; the groups counter exists only in the star layout.
        num_visits = doc.xpath('//span[contains(@class, "j_visit_num")]')
        if not num_visits:
            num_visits = doc.xpath('//span[contains(@class, "card_menNum")]')
        num_posts = doc.xpath('//span[contains(@class, "j_post_num")]')
        if not num_posts:
            num_posts = doc.xpath('//span[contains(@class, "card_infoNum")]')
        num_groups = doc.xpath(
            "//a[contains(@class, 'star_nav_ico_group')]/span")

        num_visits = _first_number(num_visits)
        num_posts = _first_number(num_posts)
        num_groups = _first_number(num_groups)
    except urllib2.HTTPError:
        logging.info('urllib2.HTTPError. Skip.')
        return None, None
    except urllib2.URLError:
        logging.info('urllib2.URLError. Skip.')
        return None, None

    data['num_visits'] = int(num_visits)
    data['num_posts'] = int(num_posts)
    data['num_groups'] = int(num_groups)
    data['entry_name'] = entry_name
    data['original_entry_name'] = original_entry_name
    data['url'] = url
    return entry_name, data
# NOTE(review): this chunk is truncated — the final `if padding is not None:`
# continues past the end of the visible source.
# Expand padding to one value per bbox edge (left, top, right, bottom).
padding = padding * 4
# Output-filename pattern for per-line text: default appends '.txt'; when the
# pattern already ends in a dot + 3-char extension, swap that extension for
# 'txt'.  NOTE(review): `args.pattern[-4] == '.'` assumes a 3-character
# extension — patterns like '.jpeg' would not be rewritten; confirm intended.
tpattern = args.pattern + '.txt'
if args.pattern[-4] == '.':
    tpattern = args.pattern[:-3] + 'txt'
if args.unicodedammit:
    # Optional path: sniff the hOCR file's encoding with UnicodeDammit and
    # re-parse the raw content with lxml configured for that encoding.
    from bs4 import UnicodeDammit
    content = args.file.read()
    doc = UnicodeDammit(content, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    doc = html.document_fromstring(content, parser=parser)
else:
    doc = html.parse(args.file)
# One element per scanned page in the hOCR document.
pages = doc.xpath('//*[@class="ocr_page"]')
for page in pages:
    # Page image path is stored in the page properties under 'file', with
    # 'image' as a fallback key.
    iname = get_prop(page, 'file')
    if not iname:
        iname = get_prop(page, 'image')
    if args.basename:
        # Re-root the image path under the user-supplied base directory.
        iname = os.path.join(args.basename, os.path.basename(iname))
    if not os.path.exists(iname):
        print("not found:", iname)
        sys.exit(1)
    image = Image.open(iname)
    # Select the requested element class (e.g. line-level hOCR elements)
    # within this page.
    lines = page.xpath("//*[@class='%s']" % args.element)
    lcount = 1
    for line in lines:
        # hOCR 'bbox' property: four space-separated integer coordinates.
        bbox = [int(x) for x in get_prop(line, 'bbox').split()]
        if padding is not None: