예제 #1
0
    def transform_result(cls, text):
        """Turn a search-results HTML page into a lazy stream of results.

        Anchors matching the CSS selector ``div.card.movies a.title`` are
        looked up in ``text``. Each anchor produces one ``SearchResult`` built
        from its ``title`` attribute, its ``href`` re-rooted on the scheme and
        network location of ``cls.ENDPOINT_URL``, and ``cls.SOURCE``.

        Parsing and the debug log line happen immediately; the
        ``SearchResult`` objects are only created while the returned
        generator is consumed.

        Args:
            cls: Object providing ``ENDPOINT_URL`` and ``SOURCE``.
            text: HTML markup of the results page.

        Returns:
            A generator of ``SearchResult`` objects, one per matching anchor.
        """
        selector = GenericTranslator().css_to_xpath('div.card.movies a.title')
        anchors = html.fromstring(text).xpath(selector)

        log.debug(
            'found %r matching elements for xpath %r',
            len(anchors),
            selector,
        )

        # Split the endpoint once, up front; only scheme and host are reused.
        endpoint = urlsplit(cls.ENDPOINT_URL)

        def results():
            for anchor in anchors:
                title = anchor.get('title')
                href = anchor.get('href')
                # Query string and fragment are always left empty.
                location = urlunsplit(
                    (endpoint.scheme, endpoint.netloc, href, '', '')
                )
                yield SearchResult(title, location, cls.SOURCE)

        return results()
예제 #2
0
    def transform_result(cls, text):
        """Extract search results from the HTML of a results page.

        Every ``a.title`` element inside a ``div.card.movies`` container is
        converted to a ``SearchResult`` whose arguments are the anchor's
        ``title`` attribute, an absolute URL (scheme and host taken from
        ``cls.ENDPOINT_URL``, path taken from the anchor's ``href``), and
        ``cls.SOURCE``.

        Args:
            cls: Object providing ``ENDPOINT_URL`` and ``SOURCE``.
            text: HTML markup to scan.

        Returns:
            A generator expression yielding ``SearchResult`` objects lazily;
            the HTML itself is parsed before this method returns.
        """
        xpath_query = GenericTranslator().css_to_xpath(
            'div.card.movies a.title'
        )
        nodes = html.fromstring(text).xpath(xpath_query)

        log.debug(
            'found %r matching elements for xpath %r',
            len(nodes),
            xpath_query,
        )

        site = urlsplit(cls.ENDPOINT_URL)

        def to_result(node):
            # Build one SearchResult from a matched anchor element.
            title = node.get('title')
            path = node.get('href')
            absolute = urlunsplit((site.scheme, site.netloc, path, '', ''))
            return SearchResult(title, absolute, cls.SOURCE)

        return (to_result(node) for node in nodes)
예제 #3
0
def _fetch_img_of_character(char, root_folder, dict_not_found):
    """Download the etymology images of one character from chineseetymology.org.

    The character's etymology page is fetched and the ``src`` of every
    ``<img>`` inside the ``SealImages``, ``LstImages``, ``BronzeImages`` and
    ``OracleImages`` spans is collected. Each image that is not already on
    disk is saved to ``<root_folder>/<char>/<seal|lst|bronze|oracle>/<name>``.

    Every network operation is tried at most ``max_attempts`` (20) times.
    Failures are reported through ``_logger`` instead of being raised: if the
    page cannot be opened the function returns early, and if a single image
    cannot be downloaded it is skipped.

    Args:
        char: Character to look up; also used as the per-character folder name.
        root_folder: Directory under which the per-character folder is created.
        dict_not_found: Mutable mapping, updated in place with
            ``local file path -> image URL`` for images answering HTTP 404.

    Returns:
        None.
    """
    root_char = os.path.join(root_folder, char)
    os.makedirs(root_char, exist_ok=True)

    url_root = 'http://www.chineseetymology.org'
    url = url_root + '/CharacterEtymology.aspx?characterInput=' + quote(char)

    max_attempts = 20

    # for/else: the ``else`` branch runs only when no attempt reached ``break``.
    for _ in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                page = response.read().decode('utf8')
            break
        except (TimeoutError, URLError, ConnectionError) as e:
            if isinstance(e, TimeoutError):
                _logger.warning(
                    'Time out when opening page %s. Retrying.', url)
            elif isinstance(e, URLError):
                _logger.warning(
                    'Error "%s" occurs when opening page %s. Retrying.',
                    e.reason, url)
            else:
                _logger.warning(
                    'Error "%s" occurs when opening page %s. Retrying.',
                    str(e), url)
    else:
        _logger.error('Max attempts reached. Fail to open page %s', url)
        return

    document = fromstring(page)

    gt = GenericTranslator()
    # Target sub-folder name -> CSS selector of the images stored in it.
    selectors = {
        "seal": "span#SealImages img",
        "lst": "span#LstImages img",
        "bronze": "span#BronzeImages img",
        "oracle": "span#OracleImages img",
    }
    all_img = {
        folder: [
            img.get('src')
            for img in document.xpath(gt.css_to_xpath(css))
        ]
        for folder, css in selectors.items()
    }

    for folder, img_sources in all_img.items():
        folder_full = os.path.join(root_char, folder)
        os.makedirs(folder_full, exist_ok=True)

        for img_src in img_sources:
            gif_full_path = os.path.join(
                folder_full, os.path.basename(img_src))
            if os.path.exists(gif_full_path):
                # Already downloaded by an earlier run.
                continue

            img_url = url_root + img_src

            # Bounded retry. The previous ``while attempts < max_attempts``
            # loop never incremented ``attempts``, so a persistent error
            # retried forever and the failure below was never reported.
            for _ in range(max_attempts):
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    _logger.warning(
                        'Time out when downloading %s to %s. Retrying.',
                        img_url, gif_full_path)
                except HTTPError as e:
                    # Must precede URLError: HTTPError is a subclass of it.
                    if e.code == 404:
                        # A missing image will not appear on retry; record it
                        # for the caller and move on to the next image.
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(
                            'Error "%s" occurs when downloading %s to %s',
                            e.reason, img_url, gif_full_path)
                        break
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except URLError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except ConnectionError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        str(e), img_url, gif_full_path)
            else:
                _logger.error(
                    'Max attempts reached. Fail to download image %s',
                    img_url)
	converted_blockquotes.append((elem, pbody, pbody.index(comment)))

# Write every converted element into its parent at the index that was
# recorded for it when the tuple was appended to converted_blockquotes.
for elem, pbody, pbodyidx in converted_blockquotes:
	pbody[pbodyidx] = elem

# For the elements body.iter() yields for the 'ul'/'ol' tags: fold the
# non-<li> siblings that directly follow an <li> into that <li>.
for ul in body.iter(tag=['ul', 'ol']):
	for li in ul.iter('li'):
		neighbor = li.getnext()
		# Stop when there is no next sibling or the next sibling is an <li>.
		# NOTE(review): this only terminates if append() moves the node out of
		# the sibling chain so that getnext() advances - presumably lxml
		# behaviour; confirm.
		while neighbor is not None and neighbor.tag != 'li':
			li.append(neighbor)
			neighbor = li.getnext()

# Translator used to turn CSS selectors into XPath expressions.
css_translator = GenericTranslator()

# Collect the distinct href values of all <a href=...> elements under body.
unique_links = set()
for link in body.xpath(css_translator.css_to_xpath('a[href]')):
	unique_links.add(link.attrib['href'])

for url in unique_links:
	elem = html.Element('a')
	elem.attrib['href'] = url
	textparts = []
	duplinks = css_translator.css_to_xpath('a[href="{}"]'.format(url))
	first_dup_link = None
	more_things = False
	for duplink in body.xpath(duplinks):
		if first_dup_link is None:
			first_dup_link = duplink
		elif not more_things:
			more_things = True
		if duplink.text is None or duplink.text.strip() == '':
예제 #5
0
def _fetch_img_of_character(char, root_folder, dict_not_found):
    """Download the etymology images of one character from chineseetymology.org.

    The character's etymology page is fetched and the ``src`` of every
    ``<img>`` inside the ``SealImages``, ``LstImages``, ``BronzeImages`` and
    ``OracleImages`` spans is collected. Each image that is not already on
    disk is saved to ``<root_folder>/<char>/<seal|lst|bronze|oracle>/<name>``.

    Every network operation is tried at most ``max_attempts`` (20) times.
    Failures are reported through ``_logger`` instead of being raised: if the
    page cannot be opened the function returns early, and if a single image
    cannot be downloaded it is skipped.

    Args:
        char: Character to look up; also used as the per-character folder name.
        root_folder: Directory under which the per-character folder is created.
        dict_not_found: Mutable mapping, updated in place with
            ``local file path -> image URL`` for images answering HTTP 404.

    Returns:
        None.
    """
    root_char = os.path.join(root_folder, char)
    os.makedirs(root_char, exist_ok=True)

    url_root = 'http://www.chineseetymology.org'
    url = url_root + '/CharacterEtymology.aspx?characterInput=' + quote(char)

    max_attempts = 20

    # for/else: the ``else`` branch runs only when no attempt reached ``break``.
    for _ in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                page = response.read().decode('utf8')
            break
        except (TimeoutError, URLError, ConnectionError) as e:
            if isinstance(e, TimeoutError):
                _logger.warning(
                    'Time out when opening page %s. Retrying.', url)
            elif isinstance(e, URLError):
                _logger.warning(
                    'Error "%s" occurs when opening page %s. Retrying.',
                    e.reason, url)
            else:
                _logger.warning(
                    'Error "%s" occurs when opening page %s. Retrying.',
                    str(e), url)
    else:
        _logger.error('Max attempts reached. Fail to open page %s', url)
        return

    document = fromstring(page)

    gt = GenericTranslator()
    # Target sub-folder name -> CSS selector of the images stored in it.
    selectors = {
        "seal": "span#SealImages img",
        "lst": "span#LstImages img",
        "bronze": "span#BronzeImages img",
        "oracle": "span#OracleImages img",
    }
    all_img = {
        folder: [
            img.get('src')
            for img in document.xpath(gt.css_to_xpath(css))
        ]
        for folder, css in selectors.items()
    }

    for folder, img_sources in all_img.items():
        folder_full = os.path.join(root_char, folder)
        os.makedirs(folder_full, exist_ok=True)

        for img_src in img_sources:
            gif_full_path = os.path.join(
                folder_full, os.path.basename(img_src))
            if os.path.exists(gif_full_path):
                # Already downloaded by an earlier run.
                continue

            img_url = url_root + img_src

            # Bounded retry. The previous ``while attempts < max_attempts``
            # loop never incremented ``attempts``, so a persistent error
            # retried forever and the failure below was never reported.
            for _ in range(max_attempts):
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    _logger.warning(
                        'Time out when downloading %s to %s. Retrying.',
                        img_url, gif_full_path)
                except HTTPError as e:
                    # Must precede URLError: HTTPError is a subclass of it.
                    if e.code == 404:
                        # A missing image will not appear on retry; record it
                        # for the caller and move on to the next image.
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(
                            'Error "%s" occurs when downloading %s to %s',
                            e.reason, img_url, gif_full_path)
                        break
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except URLError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        e.reason, img_url, gif_full_path)
                except ConnectionError as e:
                    _logger.warning(
                        'Error "%s" occurs when downloading %s to %s.'
                        ' Retrying.',
                        str(e), img_url, gif_full_path)
            else:
                _logger.error(
                    'Max attempts reached. Fail to download image %s',
                    img_url)