def transform_result(cls, text):
    """Convert a search-results HTML page into a lazy stream of results.

    ``text`` is parsed as HTML and every ``a.title`` element found under a
    ``div.card.movies`` element is turned into
    ``SearchResult(title, url, cls.SOURCE)``, where ``title`` is the anchor's
    ``title`` attribute and ``url`` is built from the scheme and host of
    ``cls.ENDPOINT_URL`` with the anchor's ``href`` as the path.

    Parsing, element selection and the debug log happen immediately; the
    ``SearchResult`` objects are only created as the returned generator is
    consumed.
    """
    selector = GenericTranslator().css_to_xpath('div.card.movies a.title')
    anchors = html.fromstring(text).xpath(selector)
    log.debug(
        'found %r matching elements for xpath %r',
        len(anchors),
        selector,
    )

    # Split the endpoint once, up front; only its scheme and host are reused.
    endpoint = urlsplit(cls.ENDPOINT_URL)

    def results():
        for anchor in anchors:
            title = anchor.get('title')
            href = anchor.get('href')
            # Query and fragment are deliberately left empty.
            url = urlunsplit((endpoint.scheme, endpoint.netloc, href, '', ''))
            yield SearchResult(title, url, cls.SOURCE)

    return results()
def transform_result(cls, text):
    """Convert a search-results HTML page into a lazy stream of results.

    ``text`` is parsed as HTML and every ``a.title`` element found under a
    ``div.card.movies`` element is turned into
    ``SearchResult(title, url, cls.SOURCE)``, where ``title`` is the anchor's
    ``title`` attribute and ``url`` is built from the scheme and host of
    ``cls.ENDPOINT_URL`` with the anchor's ``href`` as the path.

    Parsing, element selection and the debug log happen immediately; the
    ``SearchResult`` objects are only created as the returned generator is
    consumed.
    """
    selector = GenericTranslator().css_to_xpath('div.card.movies a.title')
    anchors = html.fromstring(text).xpath(selector)
    log.debug(
        'found %r matching elements for xpath %r',
        len(anchors),
        selector,
    )

    # Split the endpoint once, up front; only its scheme and host are reused.
    endpoint = urlsplit(cls.ENDPOINT_URL)

    def results():
        for anchor in anchors:
            title = anchor.get('title')
            href = anchor.get('href')
            # Query and fragment are deliberately left empty.
            url = urlunsplit((endpoint.scheme, endpoint.netloc, href, '', ''))
            yield SearchResult(title, url, cls.SOURCE)

    return results()
def _fetch_img_of_character(char, root_folder, dict_not_found):
    """Download the etymology images of one character from chineseetymology.org.

    The character's etymology page is fetched, the ``src`` of every ``<img>``
    inside the ``SealImages``, ``LstImages``, ``BronzeImages`` and
    ``OracleImages`` spans is collected, and each image that is not already on
    disk is saved as ``<root_folder>/<char>/<seal|lst|bronze|oracle>/<name>``.

    Args:
        char: Character to look up. It is URL-quoted into the query string and
            also used as the name of the per-character folder.
        root_folder: Directory under which the per-character folder is created.
        dict_not_found: Mapping updated in place; every image that answers
            HTTP 404 is recorded as ``local path -> image URL``.

    Returns:
        None. If the page cannot be opened after ``max_attempts`` tries an
        error is logged and the function returns without downloading anything.
    """
    max_attempts = 20
    url_root = 'http://www.chineseetymology.org'
    url = url_root + '/CharacterEtymology.aspx?characterInput=' + quote(char)

    root_char = os.path.join(root_folder, char)
    os.makedirs(root_char, exist_ok=True)

    # Fetch the page, retrying on transient network errors.
    for _ in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                page_text = response.read().decode('utf8')
            break
        except TimeoutError:
            _logger.warning('Time out when opening page %s. Retrying.', url)
        except URLError as e:
            _logger.warning(
                'Error "%s" occurs when opening page %s. Retrying.',
                e.reason, url)
        except ConnectionError as e:
            _logger.warning(
                'Error "%s" occurs when opening page %s. Retrying.', e, url)
    else:
        # The loop ended without ``break``: every attempt failed.
        _logger.error('Max attempts reached. Fail to open page %s', url)
        return

    document = fromstring(page_text)
    translator = GenericTranslator()
    # Local sub-folder name -> id of the <span> that holds that image set.
    span_ids = {
        'seal': 'SealImages',
        'lst': 'LstImages',
        'bronze': 'BronzeImages',
        'oracle': 'OracleImages',
    }
    all_img = {}
    for folder, span_id in span_ids.items():
        selector = translator.css_to_xpath('span#%s img' % span_id)
        sources = (img.get('src') for img in document.xpath(selector))
        # An <img> without a src has nothing to download; skip it.
        all_img[folder] = [src for src in sources if src]

    for folder, img_sources in all_img.items():
        folder_full = os.path.join(root_char, folder)
        os.makedirs(folder_full, exist_ok=True)
        for img_src in img_sources:
            gif_full_path = os.path.join(
                folder_full, os.path.basename(img_src))
            if os.path.exists(gif_full_path):
                continue  # Already downloaded on a previous run.
            img_url = url_root + img_src
            # A bounded ``for`` replaces the former ``while`` loop, whose
            # attempt counter was never incremented and could spin forever.
            for _ in range(max_attempts):
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    _logger.warning(
                        'Time out when downloading %s to %s. Retrying.',
                        img_url, gif_full_path)
                except HTTPError as e:
                    # HTTPError subclasses URLError, so it must come first.
                    if e.code == 404:
                        # A missing image will not appear on retry.
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(
                            'Error "%s" occurs when downloading %s to %s',
                            e.reason, img_url, gif_full_path)
                        break
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except URLError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except ConnectionError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e, img_url, gif_full_path)
            else:
                _logger.error(
                    'Max attempts reached. Fail to download image %s',
                    img_url)
converted_blockquotes.append((elem, pbody, pbody.index(comment))) for elem, pbody, pbodyidx in converted_blockquotes: pbody[pbodyidx] = elem for ul in body.iter(tag=['ul', 'ol']): for li in ul.iter('li'): neighbor = li.getnext() while neighbor is not None and neighbor.tag != 'li': li.append(neighbor) neighbor = li.getnext() css_translator = GenericTranslator() unique_links = set() for link in body.xpath(css_translator.css_to_xpath('a[href]')): unique_links.add(link.attrib['href']) for url in unique_links: elem = html.Element('a') elem.attrib['href'] = url textparts = [] duplinks = css_translator.css_to_xpath('a[href="{}"]'.format(url)) first_dup_link = None more_things = False for duplink in body.xpath(duplinks): if first_dup_link is None: first_dup_link = duplink elif not more_things: more_things = True if duplink.text is None or duplink.text.strip() == '':
def _fetch_img_of_character(char, root_folder, dict_not_found):
    """Download the etymology images of one character from chineseetymology.org.

    The character's etymology page is fetched, the ``src`` of every ``<img>``
    inside the ``SealImages``, ``LstImages``, ``BronzeImages`` and
    ``OracleImages`` spans is collected, and each image that is not already on
    disk is saved as ``<root_folder>/<char>/<seal|lst|bronze|oracle>/<name>``.

    Args:
        char: Character to look up. It is URL-quoted into the query string and
            also used as the name of the per-character folder.
        root_folder: Directory under which the per-character folder is created.
        dict_not_found: Mapping updated in place; every image that answers
            HTTP 404 is recorded as ``local path -> image URL``.

    Returns:
        None. If the page cannot be opened after ``max_attempts`` tries an
        error is logged and the function returns without downloading anything.
    """
    max_attempts = 20
    url_root = 'http://www.chineseetymology.org'
    url = url_root + '/CharacterEtymology.aspx?characterInput=' + quote(char)

    root_char = os.path.join(root_folder, char)
    os.makedirs(root_char, exist_ok=True)

    # Fetch the page, retrying on transient network errors.
    for _ in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                page_text = response.read().decode('utf8')
            break
        except TimeoutError:
            _logger.warning('Time out when opening page %s. Retrying.', url)
        except URLError as e:
            _logger.warning(
                'Error "%s" occurs when opening page %s. Retrying.',
                e.reason, url)
        except ConnectionError as e:
            _logger.warning(
                'Error "%s" occurs when opening page %s. Retrying.', e, url)
    else:
        # The loop ended without ``break``: every attempt failed.
        _logger.error('Max attempts reached. Fail to open page %s', url)
        return

    document = fromstring(page_text)
    translator = GenericTranslator()
    # Local sub-folder name -> id of the <span> that holds that image set.
    span_ids = {
        'seal': 'SealImages',
        'lst': 'LstImages',
        'bronze': 'BronzeImages',
        'oracle': 'OracleImages',
    }
    all_img = {}
    for folder, span_id in span_ids.items():
        selector = translator.css_to_xpath('span#%s img' % span_id)
        sources = (img.get('src') for img in document.xpath(selector))
        # An <img> without a src has nothing to download; skip it.
        all_img[folder] = [src for src in sources if src]

    for folder, img_sources in all_img.items():
        folder_full = os.path.join(root_char, folder)
        os.makedirs(folder_full, exist_ok=True)
        for img_src in img_sources:
            gif_full_path = os.path.join(
                folder_full, os.path.basename(img_src))
            if os.path.exists(gif_full_path):
                continue  # Already downloaded on a previous run.
            img_url = url_root + img_src
            # A bounded ``for`` replaces the former ``while`` loop, whose
            # attempt counter was never incremented and could spin forever.
            for _ in range(max_attempts):
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    _logger.warning(
                        'Time out when downloading %s to %s. Retrying.',
                        img_url, gif_full_path)
                except HTTPError as e:
                    # HTTPError subclasses URLError, so it must come first.
                    if e.code == 404:
                        # A missing image will not appear on retry.
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(
                            'Error "%s" occurs when downloading %s to %s',
                            e.reason, img_url, gif_full_path)
                        break
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except URLError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except ConnectionError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e, img_url, gif_full_path)
            else:
                _logger.error(
                    'Max attempts reached. Fail to download image %s',
                    img_url)