def test_url(self):
    url = 'http://www.google.com'
    data = {'a': 1, 'b': 2, 'c': 3}
    full = DefaultScraper.encodeurl('POST', url, data)
    self.assertTrue('<args>' in full)
    url2, data2 = DefaultScraper.parseurl(full)
    self.assertEqual(url, url2)
    self.assertEqual(data, data2)
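
# encodeurl/parseurl themselves are not shown in this file; the sketch below
# is one plausible contract inferred from test_url above (the '<args>' marker
# and the url/data round-trip). The real DefaultScraper may serialize
# differently and may keep the method argument, which this sketch discards.
import json

def encodeurl(method, url, data):
    # Join the target URL and the serialized payload with an '<args>' marker
    # (marker and JSON serialization are assumptions, not the confirmed format).
    return url + '<args>' + json.dumps(data)

def parseurl(full):
    # Invert encodeurl: split on the marker and deserialize the payload.
    url, args = full.split('<args>', 1)
    return url, json.loads(args)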
def _parsepage(self, page, oriurl):
    # Strip the surrounding markup: keep the text after '> ' and drop the
    # trailing closing tag (last 5 characters), leaving e.g. "1-20 of 1,234".
    page = page[page.index('> ')+2:-5].strip()
    page = page.replace(',', '')  # drop thousands separators
    # Only the first result page (the one starting "1-") fans out new tasks.
    if page.startswith('1-'):
        url, data = DefaultScraper.parseurl(oriurl)
        keyword = data['KEYWORDS']
        total = int(page.split(' of ')[1])  # e.g. "1-20 of 1234" -> 1234
        phase = Phase(data['fromDate'], data['toDate'], keyword, total)
        # Queue the remaining result pages; page 1 is the one being parsed.
        urls = []
        for i in xrange(2, phase.pages+1):
            data['page_no'] = i
            urls.append(DefaultScraper.encodeurl('POST', url, data))
        self._spider.addtask(urls)
        return phase
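
# Phase is constructed above but defined elsewhere in the project; a minimal
# holder like the one below would satisfy _parsepage. PAGESIZE is an
# assumption -- the WSJ results appear to list a fixed number of hits per
# page, and phase.pages must be enough pages to cover `total` hits.
PAGESIZE = 20  # assumed hits per result page

class Phase(object):
    def __init__(self, fromdate, todate, keyword, total):
        self.fromdate = fromdate
        self.todate = todate
        self.keyword = keyword
        self.total = total
        # Ceiling division: result pages needed to cover `total` hits.
        self.pages = (total + PAGESIZE - 1) // PAGESIZE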
import urllib

def generateseeds(keyword, year, month=None):
    # Build one WSJ archive-search seed per (year, month) pair.
    base = ('http://online.wsj.com/search/term.html?KEYWORDS='
            + urllib.quote(keyword))
    data = {'KEYWORDS': keyword, 'fromDate': '', 'toDate': '',
            'source': 'WSJ.com', 'media': 'All', 'page_no': '',
            'sorted_by': 'relevance', 'date_range': '90 days',
            'adv_search': 'open'}
    urls = []
    if not month:
        month = list(xrange(1, 13))  # default to all twelve months
    for y in year:
        for m in month:
            # Zero-padded mm/dd/yy date strings spanning the whole month.
            ys = '%02d' % (y % 100)
            ms = '%02d' % m
            ds = '%02d' % lastday(y, m)
            data['fromDate'] = ms + '/01/' + ys
            data['toDate'] = ms + '/' + ds + '/' + ys
            urls.append(DefaultScraper.encodeurl('POST', base, data))
    return urls
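
# Example usage (hypothetical keyword and years): one seed per month, each an
# encoded POST request that the spider can later decode with parseurl.
if __name__ == '__main__':
    seeds = generateseeds('euro crisis', [2012, 2013])
    print len(seeds)   # 24 -- one seed per month over the two years
    print seeds[0]     # the January 2012 search request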