示例#1
0
class SearchApi(object):
    """Client for the Google AJAX web-search endpoint (``base_url``).

    Results are requested in pages of ``page_size`` hits. ``run`` fetches one
    page for every ``page_size`` step below ``n_results`` and flattens the
    pages into a list of ``"<title> <content>"`` strings, so the returned
    list may hold up to the next multiple of ``page_size`` entries.
    """

    base_url = 'https://ajax.googleapis.com/ajax/services/search/web'
    page_size = 8
    base_params = {'v': '1.0', 'rsz': page_size}

    def __init__(self, n_results=8):
        """Remember how many results to request and create the request helper.

        :param n_results: upper bound used to decide how many pages to fetch.
        """
        self.n_results = n_results
        self.request_maker = RequestMaker()
        # Results of the most recent ``run``; read by ``is_empty``.
        self.results = []

    def run(self, query):
        """Fetch, format and return the search results for ``query``.

        The returned list is also stored on ``self.results`` so that
        ``is_empty`` reflects the most recent search.

        :param query: search string sent as the ``q`` parameter.
        :returns: list of ``"<title> <content>"`` strings (possibly empty).
        :raises RequestException: if a page request reports a non-200 status.
        """
        logging.info("Fetching search results for '%s'", query)
        result_pages = self._fetch_results_all(query)
        results = list(chain.from_iterable(
            self._format_result_page(page) for page in result_pages))
        logging.info("Retrieved %d search results", len(results))
        self.results = results
        return results

    def _fetch_results_all(self, query):
        """Fetch every page needed to cover ``n_results`` hits.

        :returns: list of decoded JSON pages, or ``[]`` when the first page
            reports an estimated result count of zero.
        """
        pages = [self._fetch_results_one_page(query, start)
                 for start in range(0, self.n_results, self.page_size)]
        full_count = self._get_full_count(pages)
        if not full_count:
            logging.error("Search request returned no result!")
            return []
        logging.info("Search request returned %d results.", full_count)
        return pages

    def _fetch_results_one_page(self, query, start=0):
        """Request one page of results beginning at offset ``start``.

        :returns: the decoded JSON payload of the response.
        :raises RequestException: if the HTTP status code or the
            ``responseStatus`` field of the payload is not 200.
        """
        params = {'q': query, 'start': start}
        params.update(self.base_params)
        response = self.request_maker.run(self.base_url, params=params)
        if response.status_code != 200:
            raise RequestException(
                "Error during search request '%s'!\n"
                "(HTTP status code: %s)" % (query, response.status_code))
        # Decode the body once and reuse it for both the check and the return.
        payload = response.json()
        google_status_code = payload.get('responseStatus')
        if google_status_code != 200:
            raise RequestException(
                "Error during search request '%s'!\n"
                "(Google status code: %s)" % (query, google_status_code))
        return payload

    def _format_result_page(self, results):
        """Turn one JSON page into a list of ``"<title> <content>"`` strings.

        A missing or null ``responseData`` / ``results`` entry yields ``[]``.
        """
        # ``or`` also covers keys that are present but hold ``None``.
        response_data = results.get('responseData') or {}
        raw_results = response_data.get('results') or []
        return [r.get('titleNoFormatting', '') + ' ' + r.get('content', '')
                for r in raw_results]

    def _get_full_count(self, result_pages):
        """Return the estimated total hit count reported by the first page.

        Commas are stripped from the reported value before conversion.
        Returns 0 when there are no pages, the field is missing or null, or
        the value cannot be parsed as an integer.
        """
        if not result_pages:
            return 0
        response_data = result_pages[0].get('responseData') or {}
        cursor = response_data.get('cursor') or {}
        count = cursor.get('estimatedResultCount', 0)
        try:
            return int(str(count).replace(',', ''))
        except ValueError:
            return 0

    def is_empty(self):
        """Return True when the most recent ``run`` produced no results."""
        return not self.results
示例#2
0
def scrape_category(original_requester, output_dir, year, term, category):
    """
    Scrape every course listed under one category (e.g. Anthropology,
    Engineering) for the given term and year.

    Works on a copy of ``original_requester``: requests the category listing,
    collects the ``href`` of each ``.course a`` element, parses the course id
    that follows the first ``=`` in each link, and calls ``scrape_course``
    for every id in listing order.

    Any exception has its traceback printed and is then re-raised.
    """
    try:
        requester = RequestMaker.copy(original_requester)

        listing_url = ('/course_evaluation_reports/fas/guide_dept?'
                       'dept={category}&term={term}&year={year}').format(
                           category=category, term=term, year=year)
        listing = requester.make_request(listing_url)

        # Collect all links first, then parse them, before any scraping starts.
        hrefs = []
        for anchor in listing.select('.course a'):
            hrefs.append(anchor.attrs['href'])
        course_ids = [int(href.split('=')[1]) for href in hrefs]

        for course_id in course_ids:
            # Scrape data from each course
            scrape_course(requester, output_dir, course_id, year, term)
    except:
        # NOTE(review): presumably printed here so the traceback is visible
        # even if the caller loses it (e.g. a worker process) -- confirm.
        traceback.print_exc()
        raise
示例#3
0
 def __init__(self, n_results=8):
     """Create a RequestMaker and record the requested result count."""
     self.request_maker = RequestMaker()
     self.n_results = n_results
示例#4
0
def _helper(t):
    """Unpack ``(requester, output_dir, year, term)`` and scrape that term.

    The requester is copied before use, so ``scrape_term`` receives the copy
    rather than the object passed in the tuple.
    NOTE(review): presumably exists so a single-argument map-style call can
    drive ``scrape_term`` -- confirm against the caller.
    """
    requester, output_dir, year, term = t
    term_requester = RequestMaker.copy(requester)
    scrape_term(term_requester, output_dir, year, term)