示例#1
0
    def page_requests(self, query, **kwargs):
        """Yield one Bing search request per result page.

        Keyword arguments read from ``kwargs``:

        * ``data_source_results`` -- upper bound used for paging; when it is
          ``None`` ``self.page_size`` is used instead.
        * ``recent_days`` -- ``1``, ``7`` or ``30`` selects the matching
          ``filters`` value; any other truthy value raises ``ValueError``.
        * ``site`` -- appended to the query as a ``site:`` term; ``*.`` is
          removed when the value starts with it.

        The ``first`` URL parameter is the page offset plus one.
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')

        if domain:
            if domain.startswith("*."):
                domain = domain.replace('*.', '')
            encoded = "+".join([quote(query), quote("site:") + quote(domain)])
        else:
            encoded = quote(query)

        search_url = 'https://www.bing.com/search?q=' + encoded
        if days:
            # (recent_days value, Bing "filters" value) pairs.
            known_filters = (
                (1, 'ex1%3a"ez1"'),
                (7, 'ex1%3a"ez2"'),
                (30, 'ex1%3a"ez3"'),
            )
            filters = next(
                (value for span, value in known_filters if days == span), None)
            if filters is None:
                raise ValueError('recent_days: {}'.format(days))
            search_url += '&filters=' + filters

        if limit is None:
            limit = self.page_size
        for offset in range(0, limit, self.page_size):
            yield HttpRequest('{}&first={}'.format(search_url, offset + 1))
示例#2
0
    def page_requests(self, query, **kwargs):
        """
        Yield one google.com.hk search request per result page.

        ``tbs`` values used for the ``recent_days`` filter:
        Day (1): tbs=qdr:d
        Week (7): tbs=qdr:w
        Month (30): tbs=qdr:m
        Any other truthy ``recent_days`` raises ``ValueError``.

        A ``site`` value is URL-quoted and appended to the query as
        ``+site%3A<site>``.  ``data_source_results`` bounds the paging and
        falls back to ``self.page_size`` when it is ``None``.
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')

        if domain:
            encoded = "+".join([quote(query), quote("site:") + quote(domain)])
        else:
            encoded = quote(query)

        search_url = 'https://www.google.com.hk/search?q=' + encoded
        if days:
            tbs = next(
                (value for span, value in (
                    (1, 'qdr:d'), (7, 'qdr:w'), (30, 'qdr:m'),
                ) if days == span),
                None)
            if tbs is None:
                raise ValueError('recent_days: {}'.format(days))
            search_url += '&tbs=' + tbs

        if limit is None:
            limit = self.page_size
        for offset in range(0, limit, self.page_size):
            # The emitted ``start`` value is the page offset plus one.
            yield HttpRequest('{}&start={}'.format(search_url, offset + 1))
示例#3
0
文件: so.py 项目: jadbin/metase
    def page_requests(self, query, **kwargs):
        """Yield one so.com search request per result page.

        Keyword arguments read from ``kwargs``:

        * ``data_source_results`` -- upper bound used for paging; defaults to
          ``self.page_size`` when ``None``.
        * ``recent_days`` -- ``1``, ``7`` or ``30`` maps to ``adv_t`` values
          ``d``, ``w`` and ``m``; any other truthy value raises ``ValueError``.
        * ``site`` -- appended to the raw query as `` site:<site>`` before the
          whole query is URL-quoted.

        The ``pn`` URL parameter is a 1-based page number.
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')

        if domain:
            query = " site:".join((query, domain))

        time_filter = ''
        if days:
            adv_t = next(
                (code for span, code in ((1, 'd'), (7, 'w'), (30, 'm'))
                 if days == span),
                None)
            if adv_t is None:
                raise ValueError('recent_days: {}'.format(days))
            time_filter = '&adv_t=' + adv_t
        search_url = 'https://www.so.com/s?q=' + quote(query) + time_filter

        if limit is None:
            limit = self.page_size
        for page, _offset in enumerate(range(0, limit, self.page_size), start=1):
            yield HttpRequest('{}&pn={}'.format(search_url, page))
示例#4
0
    def page_requests(self, query, **kwargs):
        """Yield one Baidu search request per result page.

        Keyword arguments read from ``kwargs``:

        * ``data_source_results`` -- upper bound used for paging; defaults to
          ``self.page_size`` when ``None``.
        * ``recent_days`` -- ``1``, ``7`` or ``30``; the search is restricted
          through the ``gpc`` parameter to the span between that many days
          ago and now, both given as integer timestamps from
          ``time.mktime``.  Any other truthy value raises ``ValueError``.
        * ``site`` -- appended to the raw query as `` site:<site>`` before the
          whole query is URL-quoted.

        The ``pn`` URL parameter is the result offset (0, page_size, ...).
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')
        if limit is None:
            limit = self.page_size

        if domain:
            query = " site:".join((query, domain))

        if not days:
            search_url = 'http://www.baidu.com/s?wd={}'.format(quote(query))
        else:
            now = datetime.now()
            window = next((span for span in (1, 7, 30) if days == span), None)
            if window is None:
                raise ValueError('recent_days: {}'.format(days))
            since = now + timedelta(days=-window)
            begin = int(time.mktime(since.timetuple()))
            finish = int(time.mktime(now.timetuple()))
            search_url = (
                'http://www.baidu.com/s?wd={}&gpc=stf%3D{}%2C{}|stftype%3D1'
            ).format(quote(query), begin, finish)

        for offset in range(0, limit, self.page_size):
            yield HttpRequest('{}&pn={}'.format(search_url, offset))
示例#5
0
文件: yahoo.py 项目: jadbin/metase
    def page_requests(self, query, **kwargs):
        """
        Yield one hk.search.yahoo.com search request per result page.

        Time filter values: btf=d; btf=w; btf=m (for ``recent_days`` of 1, 7
        and 30); any other truthy ``recent_days`` raises ``ValueError``.

        Sample URL kept from the original notes:
        https://hk.search.yahoo.com/search?p=%E5%8C%97%E4%BA%AC+site%3A*.gov.cn
        NOTE(review): the sample uses ``p=`` while this method emits ``q=``
        -- confirm which parameter the endpoint expects.

        A ``site`` value has ``*.`` removed when it starts with it and is
        appended to the query as ``+site%3A<site>``.  ``data_source_results``
        bounds the paging and falls back to ``self.page_size`` when ``None``.
        The ``b`` URL parameter is the page offset plus one.
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')

        if domain:
            if domain.startswith("*."):
                domain = domain.replace('*.', '')
            encoded = "+".join([quote(query), quote("site:") + quote(domain)])
        else:
            encoded = quote(query)

        search_url = 'https://hk.search.yahoo.com/search?q=' + encoded
        if days:
            btf = next(
                (code for span, code in ((1, 'd'), (7, 'w'), (30, 'm'))
                 if days == span),
                None)
            if btf is None:
                raise ValueError('recent_days: {}'.format(days))
            search_url += '&btf=' + btf

        if limit is None:
            limit = self.page_size
        for offset in range(0, limit, self.page_size):
            yield HttpRequest('{}&b={}'.format(search_url, offset + 1))
示例#6
0
文件: sogou.py 项目: jadbin/metase
    def page_requests(self, query, **kwargs):
        """
        Yield one Sogou web-search request per result page.

        Time filter parameters per ``recent_days`` value:
        1:  tsn=1&sourceid=inttime_day
        7:  tsn=2&sourceid=inttime_week
        30: tsn=3&sourceid=inttime_month
        Any other truthy ``recent_days`` raises ``ValueError``.

        A ``site`` value is appended to the raw query as `` site:<site>``
        before the whole query is URL-quoted.  ``data_source_results`` bounds
        the paging and falls back to ``self.page_size`` when ``None``.  The
        ``page`` URL parameter is a 1-based page number.
        """
        limit = kwargs.get('data_source_results')
        days = kwargs.get('recent_days')
        domain = kwargs.get('site')

        if domain:
            query = " site:".join((query, domain))

        time_filter = ''
        if days:
            choice = next(
                (pair for span, pair in (
                    (1, (1, "inttime_day")),
                    (7, (2, "inttime_week")),
                    (30, (3, "inttime_month")),
                ) if days == span),
                None)
            if choice is None:
                raise ValueError('recent_days: {}'.format(days))
            time_filter = '&tsn={}&sourceid={}'.format(*choice)
        search_url = 'https://www.sogou.com/web?query=' + quote(query) + time_filter

        if limit is None:
            limit = self.page_size
        for page, _offset in enumerate(range(0, limit, self.page_size), start=1):
            yield HttpRequest('{}&page={}&ie=utf8'.format(search_url, page))
示例#7
0
文件: chinaso.py 项目: jadbin/metase
 def page_requests(self, query, **kwargs):
     """Yield one chinaso.com search request per result page.

     ``data_source_results`` (from ``kwargs``) bounds the paging and falls
     back to ``self.page_size`` when ``None``.  The URL-quoted query is sent
     in both the ``q`` and ``wd`` parameters; ``page`` is 1-based.
     """
     limit = kwargs.get('data_source_results')
     if limit is None:
         limit = self.page_size
     template = 'http://www.chinaso.com/search/pagesearch.htm?q={}&page={}&wd={}'
     for page, _offset in enumerate(range(0, limit, self.page_size), start=1):
         encoded = quote(query)
         yield HttpRequest(template.format(encoded, page, encoded))
示例#8
0
 def page_requests(self, query, **kwargs):
     """Yield one search.ask.com request per result page.

     ``data_source_results`` (from ``kwargs``) bounds the paging and falls
     back to ``self.page_size`` when ``None``.  The ``page`` URL parameter
     is a 1-based page number.
     """
     limit = kwargs.get('data_source_results')
     if limit is None:
         limit = self.page_size
     template = 'https://www.search.ask.com/web?q={}&page={}'
     for page, _offset in enumerate(range(0, limit, self.page_size), start=1):
         yield HttpRequest(template.format(quote(query), page))
示例#9
0
 async def _get():
     """Replace ``result['url']`` with its redirect target when one is found.

     Uses ``self``, ``name``, ``result`` and ``log`` from the enclosing
     scope.  Does nothing when ``self._slave_available(name)`` returns
     ``None``.  Any exception raised while fetching is logged as a warning
     and otherwise ignored, leaving ``result['url']`` as it was.
     """
     slave = self._slave_available(name)
     if slave is not None:
         try:
             # Redirects are not followed for this request.
             probe = HttpRequest(result['url'], allow_redirects=False)
             resp = await slave.fetch_url(probe, name)
             target = resp['data']
             if target is not None:
                 # ``target`` is joined onto the current URL, so a relative
                 # value resolves against it.
                 result['url'] = urljoin(result['url'], target)
         except Exception as e:
             log.warning('Failed to get real location %s: %s',
                         result['url'], e)
示例#10
0
 async def update_cookies(self):
     """
     Refresh cookies from the Baidu home page every five minutes, to avoid
     being banned.

     Runs forever.  Each round fetches ``http://www.baidu.com/`` through
     ``self.extension`` and ``self.downloader`` and merges the cookies found
     in the response into ``self.cookies``.  A failed round is logged as a
     warning and retried after the same delay.

     Cancellation is propagated immediately: the delay is no longer inside
     a ``finally`` clause, where it would also run (for a full five
     minutes) after the task had been cancelled.
     """
     while True:
         try:
             req = HttpRequest('http://www.baidu.com/')
             await self.extension.handle_request(req)
             resp = await self.downloader.fetch(req)
             self.cookies.update(self.get_cookies_in_response(resp))
         except asyncio.CancelledError:
             # On Python < 3.8 CancelledError derives from Exception; never
             # swallow it in the generic handler below.
             raise
         except Exception as e:
             log.warning('Failed to update cookies: %s', e)
         await asyncio.sleep(5 * 60)
示例#11
0
文件: chinaso.py 项目: jadbin/metase
 async def update_cookies(self):
     """Refresh cookies from a chinaso.com search page every five minutes.

     Runs forever.  Each round requests a fixed search URL without
     following redirects and merges the cookies found in the response into
     ``self.cookies``.  When the fetch raises ``HttpError`` the response
     attached to the error is used instead, so cookies are still read from
     it.  Any other failure is logged as a warning and the next round runs
     after the same delay.

     Cancellation is propagated immediately: the delay is no longer inside
     a ``finally`` clause, where it would also run (for a full five
     minutes) after the task had been cancelled.
     """
     while True:
         try:
             url = 'http://www.chinaso.com/search/pagesearch.htm?q={}'.format(
                 quote('中国搜索'))
             try:
                 req = HttpRequest(url, allow_redirects=False)
                 await self.extension.handle_request(req)
                 resp = await self.downloader.fetch(req)
             except HttpError as e:
                 resp = e.response
             cookies = self.get_cookies_in_response(resp)
             self.cookies.update(cookies)
         except asyncio.CancelledError:
             # On Python < 3.8 CancelledError derives from Exception; never
             # swallow it in the generic handler below.
             raise
         except Exception as e:
             log.warning('Failed to update cookies: %s', e)
         await asyncio.sleep(5 * 60)
示例#12
0
 def parse(self, response):
     """Collect the quotes on this page and follow the "next" link.

     Each ``div.quote`` element is appended to ``self.quotes`` as a dict
     with keys ``text``, ``tags``, ``author`` and ``author_url`` (the author
     link resolved against the response URL).  When a ``li.next a`` link is
     present, a request for it is yielded with this method as callback.
     """
     selector = Selector(response.text)
     for node in selector.css('div.quote'):
         text = node.css('span.text')[0].text
         author = node.css('small.author')[0].text
         href = node.css('small+a')[0].attr('href')
         author_url = urljoin(str(response.url), href)
         tags = node.css('div.tags a').text
         self.quotes.append({
             'text': text,
             'tags': tags,
             'author': author,
             'author_url': author_url,
         })

     next_links = selector.css('li.next a')
     if len(next_links) == 0:
         return
     next_page_url = urljoin(str(response.url), next_links[0].attr('href'))
     yield HttpRequest(next_page_url, callback=self.parse)
示例#13
0
文件: slave.py 项目: jadbin/metase
    async def _fetch(self, request, name, rtype):
        """POST a pickled request to ``self.api_url`` and return the JSON reply.

        :param request: object to send; it is serialised with ``pickle.dumps``.
        :param name: value sent as the ``name`` query parameter and included
            in the signature.
        :param rtype: value sent as the ``rtype`` query parameter and included
            in the signature.
        :return: the response body decoded with ``json.loads``.

        The query string also carries a timestamp, a random nonce and the
        signature computed by ``self.sign`` over the body and those values.
        The timeout comes from ``self.config['timeout']`` (``None`` if unset).

        NOTE(review): the receiving end presumably unpickles the body, so it
        must only accept requests whose signature verifies -- confirm.
        """
        body = pickle.dumps(request)
        timeout = self.config.get('timeout')
        req_headers = {'Content-Type': 'application/octet-stream'}

        timestamp = str(int(time.time()))
        # The bound must be an int: the float ``1e8`` is deprecated as a
        # randint() argument since Python 3.10 and raises TypeError on 3.12+.
        nonce = str(random.randint(0, 10 ** 8))
        signature = self.sign(body, name, rtype, timestamp, nonce)

        url = '{}?name={}&rtype={}&timestamp={}&nonce={}&signature={}'.format(
            self.api_url, name, rtype, timestamp, nonce, signature)
        req = HttpRequest(url,
                          method='POST',
                          headers=req_headers,
                          body=body,
                          timeout=timeout)
        resp = await self.downloader.fetch(req)
        return json.loads(resp.body)
示例#14
0
# coding=utf-8

from xpaw import make_requests, HttpRequest

if __name__ == '__main__':
    # The batch mixes plain URL strings with an HttpRequest object; all of
    # them are handed to make_requests in one call and the results printed.
    targets = ['http://unknown', 'http://python.org']
    targets.append(HttpRequest('http://python.org'))
    print(make_requests(targets))
示例#15
0
 def start_requests(self):
     """Yield the single seed request, handled by ``self.parse``."""
     seed = HttpRequest('http://quotes.toscrape.com/', callback=self.parse)
     yield seed
示例#16
0
    def start_requests(self):
        """Yield the seed request, sent with ``self.headers`` and handled by
        ``self.login``."""
        seed = HttpRequest(
            'http://www.baidu.com',
            headers=self.headers,
            callback=self.login,
        )
        yield seed
示例#17
0
 def start_requests(self):
     """Yield one request per entry of ``self.start_urls``, each with
     ``self.handle_error`` as its errback."""
     yield from (
         HttpRequest(target, errback=self.handle_error)
         for target in self.start_urls
     )
示例#18
0
 def set_cookie_header(self, request: HttpRequest, cookies: SimpleCookie):
     """Add a ``Cookie`` header built from ``cookies`` to ``request``.

     ``request.headers`` is created as an empty ``HttpHeaders`` when it is
     ``None``.  The header value is every ``name=value`` pair joined with
     ``'; '``.
     """
     if request.headers is None:
         request.headers = HttpHeaders()
     pairs = []
     for key, morsel in cookies.items():
         pairs.append('{}={}'.format(key, morsel.value))
     request.headers.add('Cookie', '; '.join(pairs))
示例#19
0
 def start_requests(self):
     """Yield the seed request for news.baidu.com, handled by
     ``self.parse`` and created with ``dont_filter=True``."""
     seed = HttpRequest(
         "http://news.baidu.com/",
         callback=self.parse,
         dont_filter=True,
     )
     yield seed
示例#20
0
 async def parse(self, response):
     """Log the tag links found in the page's tags box, then yield a new
     request for the quotes home page handled by this same method."""
     tag_links = Selector(response.text).xpath(
         "//div[contains(@class, 'tags-box')]//a")
     self.log("Top ten tags: %s", tag_links.text)
     follow_up = HttpRequest("http://quotes.toscrape.com/", callback=self.parse)
     yield follow_up