def stargazers_handler(task, doc):
    """Emit follow-up tasks for a stargazers page.

    Yields one ``'user'`` task per link matched by ``'#watchers li > a'``,
    then, if the page has any ``'.pagination a'`` anchors, one
    ``'stargazers'`` task pointing at the href of the last such anchor.

    ``task`` is accepted for handler-signature uniformity and is not read.
    """
    user_links = doc.extract_links('#watchers li > a')
    for user_url in user_links:
        yield Task(url=user_url, document_type='user')

    pagination = CSSSelector('.pagination a')(doc.lxml)
    if not pagination:
        # No pagination anchors at all: nothing further to crawl.
        return

    # NOTE(review): the last pagination anchor is presumably the "next page"
    # link, and its href is used verbatim (no URL joining) -- confirm both
    # against the actual markup.
    last_anchor = pagination[-1]
    yield Task(url=last_anchor.attrib['href'], document_type='stargazers')
def test_roundtrip(self):
    """A task pushed onto a queue comes back from ``pop`` with the same URL."""
    original = Task('http://www.example.com', 'plain')
    queue = Queue('test')

    queue.push(original)
    restored = queue.pop()

    # Only the URL is compared; other task fields are not checked here.
    self.assertEqual(original.url, restored.url)
def explore_handler(task, doc):
    """Emit tasks for the links found in the ranked-repositories listing.

    Each link matched by ``'.ranked-repositories h3 a'`` is classified by
    the number of ``'/'`` characters it contains: four slashes yields a
    ``'repo'`` task, three yields a ``'user'`` task, and anything else is
    skipped.

    ``task`` is accepted for handler-signature uniformity and is not read.
    """
    # Slash count -> document type. Presumably this distinguishes
    # scheme://host/user/repo from scheme://host/user -- confirm that the
    # extracted links are absolute URLs.
    type_by_slash_count = {4: 'repo', 3: 'user'}

    for link in doc.extract_links('.ranked-repositories h3 a'):
        kind = type_by_slash_count.get(link.count('/'))
        if kind is None:
            continue
        yield Task(url=link, document_type=kind)
def repo_handler(task, doc):
    """Emit a single ``'stargazers'`` task for a repository page.

    The new task's URL is ``task.url`` with ``'/stargazers'`` appended.
    ``doc`` is accepted for handler-signature uniformity and is not read.
    """
    stargazers_url = task.url + '/stargazers'
    yield Task(url=stargazers_url, document_type='stargazers')
def user_handler(task, doc):
    """Emit a ``'repo'`` task for each repository link on a user page.

    Links are taken from ``doc.extract_links('.popular-repos .public a')``.
    ``task`` is accepted for handler-signature uniformity and is not read.
    """
    repo_links = doc.extract_links('.popular-repos .public a')
    for repo_url in repo_links:
        yield Task(url=repo_url, document_type='repo')