Example No. 1
import re

# Crawl the 10 most recent posts on the Nexon board and collect their raw text.
# Browser, URL, ElementXpath, and PathUtils are helper classes defined elsewhere
# in the project.
def crawl_nexon_board():
    raw_data = []
    browser = Browser()

    # Open the board and click the newest post to discover its numeric id.
    browser.get_page(URL.BASE_URL)
    browser.click_element_by_xpath(ElementXpath.POST_TITLES_ON_BOARD)
    recent_post_id = int(
        re.search(r'n4articlesn=(\d+)', browser.get_current_url()).group(1))

    # Walk backwards from the newest post id and scrape each post page.
    for i in range(10):
        url = PathUtils.fromTemplate(URL.POST_URL, str(recent_post_id - i))
        browser.get_page(url)

        title = browser.get_element_by_xpath(
            ElementXpath.TITLE_ON_POST).text
        content = browser.get_element_by_xpath(
            ElementXpath.CONTENT_ON_POST).get_attribute("content")
        comments = [
            element.text
            for element in browser.get_elements_by_xpath(
                ElementXpath.COMMENTS_ON_POST)
        ]
        raw_data.append(title)
        raw_data.append(content)
        raw_data.extend(comments)

    return raw_data
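
A minimal usage sketch for the example above, assuming crawl_nexon_board() and the Browser, URL, ElementXpath, and PathUtils helpers it relies on are importable from the surrounding project; the function takes no arguments and returns a flat list of strings.

if __name__ == "__main__":
    # Collect titles, contents, and comments from the 10 newest posts
    # and print each piece of raw text on its own line.
    for text in crawl_nexon_board():
        print(text)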
Example No. 2
class FarooSearch(object):

    # URL template for the Faroo web-search API; query, start, and length
    # are substituted in _get_results().
    SEARCH_URL = "http://www.faroo.com/api?q=%(query)s&start=%(start)s&length=%(length)s&l=en&src=web&f=json"

    def __init__(self, query, start, length):
        # Percent-encode spaces so the query can be dropped into SEARCH_URL.
        self.query = query.replace(' ', '%20')
        self.start = start
        self.length = length
        self.browser = Browser()

    def _get_results(self):
        # Fill the query parameters into the URL template.
        url = FarooSearch.SEARCH_URL
        actual_url = url % {'query': self.query,
                            'start': self.start,
                            'length': self.length}

        try:
            page = self.browser.get_page(actual_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))

        # return (unicodedata.normalize('NFKD', page).encode('ascii', 'ignore')).decode("utf-8")
        return page.decode("utf-8")
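
A minimal usage sketch, assuming FarooSearch and the SearchError exception above are importable from the surrounding project; _get_results() is called directly here only for illustration, since the class's public entry point is not shown in this excerpt.

search = FarooSearch("web crawler", start=1, length=10)
try:
    # _get_results() returns the raw JSON response as a unicode string.
    raw_json = search._get_results()
    print(raw_json)
except SearchError as err:
    print("Search failed:", err)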