예제 #1
0
 def test_url(self):
     """Check that encodeurl/parseurl round-trip a URL and its data dict."""
     original_url = 'http://www.google.com'
     original_data = {'a': 1, 'b': 2, 'c': 3}

     encoded = DefaultScraper.encodeurl('POST', original_url, original_data)
     # The encoded form must carry the '<args>' marker.
     self.assertTrue('<args>' in encoded)

     # Decoding must give back exactly what was encoded.
     decoded_url, decoded_data = DefaultScraper.parseurl(encoded)
     self.assertEqual(original_url, decoded_url)
     self.assertEqual(original_data, decoded_data)
예제 #2
0
 def _parsepage(self, page, oriurl):
     page = page[page.index('> ')+2:-5].strip()
     if ',' in page:
         page = page.replace(',', '')
     if page.startswith('1-'):
         url, data = DefaultScraper.parseurl(oriurl)
         keyword = data['KEYWORDS']
         total = int(page.split(' of ')[1])
         phase = Phase(data['fromDate'], data['toDate'], keyword, total)
         urls = []
         for i in xrange(2, phase.pages+1):
             data['page_no'] = i
             urls.append(DefaultScraper.encodeurl('POST', url, data))
         self._spider.addtask(urls)
         return phase
예제 #3
0
def generateseeds(keyword, year, month=None):
    """Build one encoded WSJ search seed URL per (year, month) pair.

    keyword -- search term; URL-quoted into the base URL and also sent
               unquoted as the 'KEYWORDS' field.
    year    -- iterable of years (it is iterated, not used as a single int).
    month   -- iterable of month numbers; any falsy value means 1..12.

    Each seed covers the 1st of the month through ``lastday(y, m)``, with
    dates formatted as MM/DD/YY. Returns the list of encoded URLs, ordered
    by year and then by month.
    """
    def two_digits(n):
        # Prefix values below 10 with '0'; anything else is stringified.
        if n >= 10:
            return str(n)
        return '0' + str(n)

    base = 'http://online.wsj.com/search/term.html?KEYWORDS=' + urllib.quote(keyword)
    data = {'KEYWORDS': keyword,
            'fromDate': '',
            'toDate': '',
            'source': 'WSJ.com',
            'media': 'All',
            'page_no': '',
            'sorted_by': 'relevance',
            'date_range': '90 days',
            'adv_search': 'open'}

    months = month if month else range(1, 13)
    seeds = []
    for y in year:
        for m in months:
            yy = two_digits(y % 100)
            mm = two_digits(m)
            dd = two_digits(lastday(y, m))
            # Only the date bounds change between seeds; other fields stay fixed.
            data['fromDate'] = '%s/01/%s' % (mm, yy)
            data['toDate'] = '/'.join((mm, dd, yy))
            seeds.append(DefaultScraper.encodeurl('POST', base, data))
    return seeds