Example #1
def search(keyword, max_results=10):
    base_url = "http://www.imdb.com/find?s=all&q=%s" % (keyword)
    responses = helper.parallel_fetch([base_url])

    # Only one page was requested, so pull out its single response body.
    results_page = responses.values()[0]
    exact_titles = get_exact_matches.search(results_page)
    popular_titles = get_popular_matches.search(results_page)
    partial_titles = get_partial_matches.search(results_page)
    titles = []

    if popular_titles:
      titles += [popular_titles.group(0)]

    if exact_titles:
      titles += [exact_titles.group(0)]

    if partial_titles:
      titles += [partial_titles.group(0)]

    titles = '\n'.join(titles).strip()
    if not titles: return []

    urls = ['http://www.imdb.com/title/%s/usercomments' %  link for link in
        get_movies.findall( titles )[:5]]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
      results += process_url(url, responses[url])
    return results
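Every one of these examples calls helper.parallel_fetch(), which is not shown. From its usage it appears to take an iterable of URLs and return a dict mapping each requested URL to its response body; Example #4 also suggests the real helper caches responses under cache/<hash>.cache, and Examples #9 and #10 pass a replace_redirects option. A minimal thread-pool sketch of such a helper (my own stand-in, not the original module, with caching and redirect handling omitted) might look like this:

# Hypothetical stand-in for helper.parallel_fetch(), for illustration only.
# Fetches every URL concurrently and returns {requested_url: response_body}.
import urllib2
from multiprocessing.pool import ThreadPool

def parallel_fetch(urls, replace_redirects=False, workers=8):
    def fetch(url):
        try:
            return url, urllib2.urlopen(url, timeout=30).read()
        except Exception:
            return url, ''   # failed fetches come back as empty bodies
    pool = ThreadPool(workers)
    try:
        # replace_redirects is accepted only for signature compatibility here;
        # the real helper presumably also resolves or records redirects.
        return dict(pool.map(fetch, list(urls)))
    finally:
        pool.close()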
Example #2
def search(keyword, max_results=5):
    base_url = "http://www99.epinions.com/search/?search_string=%s" % (keyword)
    responses = helper.parallel_fetch([base_url])
    urls = set(['http://www99.epinions.com/reviews/%s' % link
             for link in get_movies.findall( responses.values()[0] )[:5]])

    responses = helper.parallel_fetch(urls)
    reviews = []
    for url in responses:
      reviews += ["http://www99.epinions.com/review/%s" % review for review in
        get_review.findall(responses[url])]
    reviews = set(reviews)

    responses = helper.parallel_fetch(reviews)
    results = []
    for url in responses:
      results += process_url(url, responses[url])
    return results
Example #3
def search(keyword):
    print 'searching amazon'
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + keyword + "%s&x=0&y=0"
    soup = BeautifulSoup(urllib.urlopen(url).read())
    #print soup
    urls = [x.a['href'] for x in soup.findAll('div', {"class": "productTitle"})][:5]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += parse_url(url, responses[url])
    #print results
    print 'returning amazon'
    return results
Example #4
  def fetch(self, requests, suffix):
    misses = 0
    active_processes = []
    googleRawResults = helper.parallel_fetch(requests.values())
    logger.info(len(self.sitelist))
    for site in self.sitelist:
        data = googleRawResults[requests[site]]
        urls = get_urls(data, site)
        
        logger.info(site)
        logger.info(len(urls))
        if not urls:
          logger.warning('GOOGLE NOT RETURNING ANYTHING')
          logger.warning(requests[site])
                                                                           
          # Don't cache a throttle/CAPTCHA page or an empty response
          if os.path.isfile('cache/%s.cache' % hash(requests[site])) and \
              ('solving the above CAPTCHA' in data or not data):
            os.remove('cache/%s.cache' % hash(requests[site]))
            logger.error('Throttled: Cache cleared')
                                                                           
          # Sleep on misses, so bad searches don't DOS google
          misses += 1
          continue
          
        t = multiprocessing.Process(None, parseBlogs.parseBlogs, site,
              (site,
               self.curlLock,
               urls,
               '%s/%s.%s' % 
                  (self.directory, site.replace('/', '_'), suffix),
               requests[site],
              )
            )
        t.url = requests[site]
        t.start()
                                                                           
        active_processes += [t]
        if self.max_processes <= len(active_processes):
          t = active_processes.pop(0)
          t.join(460)
          if t.is_alive():
            t.terminate()
            logger.error('Terminated Process %s' % t.url)
                                                                           
                                                                           
    for process in active_processes:
        process.join(600)
        process.terminate()

    return misses
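Aside from the scraping details, Example #4 also implements a small process-throttling pattern: each site gets its own multiprocessing.Process, and once max_processes workers are active the oldest one is joined with a timeout and terminated if it is still running. A stripped-down sketch of just that pattern (the worker function and timeouts below are placeholders, not the original parseBlogs code):

# Minimal sketch of the bounded-process pattern used in Example #4.
import multiprocessing

def worker(item):
    pass  # placeholder for the real per-site work (parseBlogs.parseBlogs)

def run_all(items, max_processes=4, timeout=60):
    active = []
    for item in items:
        p = multiprocessing.Process(target=worker, args=(item,))
        p.start()
        active.append(p)
        # Cap concurrency: wait for the oldest worker once the pool is full.
        if len(active) >= max_processes:
            oldest = active.pop(0)
            oldest.join(timeout)
            if oldest.is_alive():
                oldest.terminate()  # give up on workers that exceed the timeout
    # Drain the remaining workers the same way.
    for p in active:
        p.join(timeout)
        if p.is_alive():
            p.terminate()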
Example #5
def search(keywords, max_results=10):
    """
    Function search(keywords)
    Searches Buzzillions for the current set of keywords. Note that
    Buzzillions seems to throttle aggressively when more than 3 links
    are opened.
    Parameters:
    keywords - A string with all the keywords to search for
    Output:
    A list of review dictionaries, each with the following keys:
    title    - The title of this review
    title_section - The entire section containing the title in this review
    content  - The content of this review
    link     - The link that led to this review
    """
    base_url = ("http://www.buzzillions.com/x/s?N=4294811422&D=x&cat=&extra=all-product&Ntt=%s" % keywords)
    responses = helper.parallel_fetch([base_url])
    urls = set([('http://www.buzzillions.com/reviews/%s' % link).split('#')[0]
             for link in get_movies.findall( responses.values()[0] )[:5]])
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
      results += process_url(url, responses[url])
    return results
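The docstring above lists the keys of each review record, but process_url itself is not part of these examples. Assuming it returns plain dicts, a single element of the list returned by search() would presumably look like this (all field values below are invented purely for illustration):

# Hypothetical shape of one review record; only the key names come from the
# docstring above, the values are made up for illustration.
example_review = {
    'title': 'Great picture for the price',
    'title_section': '<div class="reviewTitle">Great picture for the price</div>',
    'content': 'Arrived quickly, easy to set up, and the picture quality is excellent.',
    'link': 'http://www.buzzillions.com/reviews/some-product-reviews',
}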
Example #6
def search(keyword):
    base_url = "http://reviews.cnet.com/1770-5_7-0.html?query=%s&tag=srch" % (keyword)
    output = urllib.urlopen(base_url).read()
    soup = BeautifulSoup(output)
    urls = [
        ("http://reviews.cnet.com" + x.find("a", {"class": "resultName"})["href"] + "?tag=contentMain;contentBody;1r")
        for x in soup.findAll("div", {"class": "resultInfo"})
    ]
    # Drop entries whose original href was already an absolute URL
    # (the prefixing above would have produced a double "http://").
    urls = [x for x in urls if x.find("http://", 1) == -1]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += [parse(url, responses[url])]
    return [x for x in results if x]
Example #7
def search(keyword):
    query = urllib.urlencode(
        {'q': 'site:http://www.wired.com/reviews ' + keyword})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (
        query)
    search_results = urllib.urlopen(url)
    json = simplejson.loads(search_results.read())
    results = json['responseData']['results']
    urls = [x['url'] for x in results]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += [parse(url, responses[url])]
    return [x for x in results if x]
Example #8
def search(keyword):
    base_url = "http://reviews.cnet.com/1770-5_7-0.html?query=%s&tag=srch" % (
        keyword)
    output = urllib.urlopen(base_url).read()
    soup = BeautifulSoup(output)
    urls = [("http://reviews.cnet.com" +
             x.find("a", {"class": "resultName"})['href'] +
             "?tag=contentMain;contentBody;1r")
            for x in soup.findAll("div", {"class": "resultInfo"})]
    # Drop entries whose original href was already an absolute URL
    # (the prefixing above would have produced a double "http://").
    urls = [x for x in urls if x.find("http://", 1) == -1]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += [parse(url, responses[url])]
    return [x for x in results if x]
Example #9
  def parse_all(self, articles):
    data = helper.parallel_fetch(articles, replace_redirects=True)
    content = self.contentExtractor.parse_all(data)
    comments = self.commentExtractor.parse_all(data)
    results = {}
    for url in content:
      result_content = content[url]
      comment_url = "Error Url Not Found"
      result_comments = [error_comment]
      try:
        comment_url = self.commentExtractor.url_next[url]
        result_comments = comments[url]
      except KeyError:
        # No comment page was found for this article; keep the error defaults.
        pass

      results[url] = Result(result_content, comment_url, result_comments) 

    return results
Example #10
  def parse_all(self, url_site):
    self.url_site = url_site
    self.set_template_urls()
    self.url_data = {url:[] for url in url_site}
    
    self.found = set(url_site.keys())
    iteration = 1
    for iteration in xrange(1, self.max_iterations + 1):
      url_next = self.getNextUrls(iteration)
      if iteration == 1: self.url_next = url_next
      if not url_next: break
      # Pause between fetch rounds so follow-up requests aren't fired back to back.
      if iteration > 1: time.sleep(10)
      nexturl_site = helper.parallel_fetch(url_next.values(), replace_redirects = True)
      self.url_site = {url : nexturl_site[newurl] for url, newurl in url_next.iteritems()}
      self.mapping = {url : newurl for url, newurl in url_next.iteritems()}
      self.found = set()
      self.process_urls()
      logger.info('iteration %s complete', iteration)
      logger.info('%s uncompleted', len(self.found))

    if self.max_iterations > 1 and iteration == self.max_iterations:
      logger.error('Iterated %s times, possible infinite loop', iteration)

    return self.url_data
Example #11
  def get_extension(url):
    parsed_url = urlparse.urlparse(url)
    ext = parsed_url.path.split('.')[-1]
    if '/' in ext: return 'None'
    return ext

  toFetch = [url for url in url_extractor.findall(raw_json)
    if get_extension(url) not in 
    frozenset(['css', 'gif', 'ico', 'jpg', 'png', 'swf', 'woff', 'xml'])]
  ## Filter urls
  ## Search interesting urls

  with open('debug/%s.%s' % (parsed_url.netloc, i), 'a') as f:
    f.write('\n' + '\n'.join(toFetch))

  results = helper.parallel_fetch(toFetch)

  with open('text/%s.%s' % (parsed_url.netloc, i), 'w') as f:
    for url in results:
      f.write(url)
      f.write('\n')
      f.write(results[url])
      f.write('\n-------\n')

  # Keep only the responses that contain at least one known comment snippet.
  correct_urls = [url for url in results if
     any(s in str(results[url]) for s in comments)]

  with open('results/%s.%s' % (parsed_url.netloc, i), 'w') as f:
    # Yahoo was the only one returned with multiple urls
    # but both urls had all the data soooo....
    if not correct_urls: