def parse_mobile(params):
    """Fetch one phone's spec page and record its spec table in `data`."""
    global mobiles, errors, req
    url = params['url']
    comp_name = params['name']
    print 'Request sent for - ' + url
    req += 1
    try:
        mobiles += 1
        tree = html.fromstring(session.get(url).text)
        name = tree.xpath('//h1[@class="specs-phone-name-title"]//text()')[0]
        print 'Total[%i], Mobile found : %s' % (len(data), name)
        data[name] = {}
        d = data[name]
        d['Url'] = url
        d['Brand'] = comp_name
        # Each spec row holds its label in td.ttl and its value in td.nfo.
        for x in tree.xpath('//div[@id="specs-list"]//table//tr'):
            label = x.xpath('.//td[@class="ttl"]/a/text()')
            if label:
                d[label[0]] = x.xpath('.//td[@class="nfo"]/text()')
        d['Battery'] = tree.xpath(
            '//th[text()="Battery"]/ancestor::tr//td[@class="nfo"]/text()')
    except Exception as e:
        # Re-queue the same task so a worker retries the page.
        print 'Error in %s Restarting.... \n Error message : %s' % (url, e)
        queue.put_nowait((parse_mobile, params))
        errors += 1
def recv(self, sock, queue):
    """Receiver."""
    # Note that this is not really an actor, but we want to send
    # messages anyway, so we need to access the actor.
    sock.settimeout(None)  # block indefinitely while waiting for data
    while True:
        data = sock.recv(4096)
        queue.put_nowait(data)
        self.send(self.address, ('data',))
def UpdateTweet(self, tweet_text, tweet_dic):
    """Fan a tweet out to every watcher whose regexp matches its text."""
    for regexp_watcher in self.regexp_watcher_list.values():
        if 're_prog' not in regexp_watcher or 'queue' not in regexp_watcher:
            continue
        re_prog = regexp_watcher['re_prog']
        queue = regexp_watcher['queue']
        if re_prog is None or queue is None:
            continue
        match_result = re_prog.findall(tweet_text)
        if not match_result:
            continue
        #print "streaming hit for %s (%s)" % (regexp_watcher['user_name'].decode('utf-8'), regexp_watcher['description'].decode('utf-8'))
        queue.put_nowait((tweet_dic, match_result))
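# A minimal sketch of how a watcher entry might be registered so that
# UpdateTweet has 're_prog' and 'queue' to work with. The helper name,
# the key argument, and the use of a stdlib queue are assumptions, not
# the original API.
import re
import Queue  # Python 2 stdlib

def AddRegexpWatcher(self, key, pattern):
    # Hypothetical helper: compile the pattern and attach the queue that
    # UpdateTweet/SendHeartbeat push into.
    self.regexp_watcher_list[key] = {
        're_prog': re.compile(pattern),
        'queue': Queue.Queue(),
    }
    return self.regexp_watcher_list[key]['queue']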
def handle_post(post):
    if not post.get('approxLoc'):
        print 'Post %r does not have approxLoc field' % post['id']
        return
    print "New post", post['id']
    data = {
        'post': anonymize(post),
        'lat': post['approxLoc']['lat'],
        'lng': post['approxLoc']['lng'],
    }
    try:
        queue.put_nowait(data)
    except gevent.queue.Full:
        # Drop the post instead of blocking the stream when consumers lag.
        return
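# handle_post can only see gevent.queue.Full if the shared queue is
# bounded; a sketch of that setup (the maxsize value is an assumption):
import gevent.queue

queue = gevent.queue.Queue(maxsize=1000)  # full queue => posts get dropped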
def parse_comp(params):
    """Fetch a brand listing page, queueing pagination and phone pages."""
    global errors, req
    url = params['url']
    comp_name = params['name']
    dp.append(url)  # mark this listing page as visited
    print 'Request sent for [%s] - %s' % (comp_name, url)
    req += 1
    try:
        tree = html.fromstring(session.get(url).text)
        # Queue any pagination links we have not visited yet.
        for x in tree.xpath('//div[@class="nav-pages"]//a/@href'):
            if domain + x not in dp:
                queue.put_nowait((parse_comp, {
                    'url': domain + x,
                    'name': comp_name
                }))
        # Queue every phone page listed for this maker.
        for x in tree.xpath('//div[@class="makers"]//a/@href'):
            queue.put_nowait((parse_mobile, {
                'url': domain + x,
                'name': comp_name
            }))
    except Exception as e:
        print 'Error in %s Restarting.... \n Error message : %s' % (url, e)
        queue.put_nowait((parse_comp, params))
        errors += 1
def scrape_base_url():
    """Seed the queue from the brand index, then drain it with the pool."""
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)
    # Queue one parse_comp task per brand link on the index page.
    for x in tree.xpath('//div[@class="st-text"]//td/a'):
        if x.xpath('./text()'):
            queue.put_nowait((parse_comp, {
                'url': domain + x.xpath('./@href')[0],
                'name': x.xpath('./text()')[0]
            }))
    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            # pool.spawn already starts the greenlet; wrapping it in
            # pool.start() would try to start it a second time.
            pool.spawn(t[0], t[1])
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
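# The three scraper functions above lean on module-level state that is
# not shown. A minimal sketch of that scaffolding, inferred from usage;
# the URLs, pool size, and initial values are assumptions:
import json
from datetime import datetime

from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
from gevent.queue import Queue
import requests
from lxml import html

base_url = 'https://www.gsmarena.com/makers.php3'  # assumed index page
domain = 'https://www.gsmarena.com/'               # assumed link prefix
session = requests.Session()
queue = Queue()   # work items: (function, params) tuples
pool = Pool(20)   # concurrency cap; 20 is an arbitrary choice
dp = []           # listing URLs already queued, to avoid re-crawling
data = {}         # phone name -> spec dict, dumped to data.json at the end
req = errors = mobiles = 0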
def SendHeartbeat(self):
    """Push a (None, None) sentinel into every watcher queue as a heartbeat."""
    for regexp_watcher in self.regexp_watcher_list.values():
        queue = regexp_watcher['queue']
        #print "send heartbeat to %s (%s)" % (regexp_watcher['user_name'].decode('utf-8'), regexp_watcher['description'].decode('utf-8'))
        queue.put_nowait((None, None))
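# Sketch of the consuming side implied by UpdateTweet and SendHeartbeat:
# a watcher blocks on its queue and treats the (None, None) pair as a
# heartbeat. This loop is an assumption, not the original code.
def watch(queue):
    while True:
        tweet_dic, match_result = queue.get()
        if tweet_dic is None and match_result is None:
            continue  # heartbeat: the stream is alive, nothing matched
        print 'hit:', match_result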
# Tail of the (not shown) Consumers class: link extraction from its
# parse_html method, the run_parse entry point, and a URL-fingerprint
# helper, followed by the module-level driver.
htmlobj = BeautifulSoup(response.read())
links = htmlobj.findAll(href=re.compile(r'^(http|/page)'))
print response.geturl()
for link in links:
    print link['href'], link.string
    self.put_to_queue(link['href'])

def run_parse(self):
    try:
        self._queue.put(self._start_url)
        self.parse_html(self.get_content())
    except Exception as e:
        print e, self._start_url, self.get_content()

def pack_url_md5(self, url):
    # Keep the middle 16 hex chars of the MD5 as a short URL fingerprint.
    m = hashlib.md5(url)
    return m.hexdigest()[8:-8]

queue = Queue.Queue()  # Python 2 stdlib queue (assumes `import Queue`)
queue.put_nowait("")
threads = []
for name in ('thread1', 'thread2', 'thread3'):
    threads.append(gevent.spawn(
        Consumers(queue, name, "http://beijing.lashou.com/").run_parse))
gevent.joinall(threads)

# if __name__ == "__main__":
#     pass
#     parse_html(get_content("http://beijing.lashou.com/"))
#     print pack_url_md5("http://beijing.lashou.com/")
def _notify(waiters, data):
    """Broadcast `data` to every waiting queue without blocking."""
    for queue in waiters:
        queue.put_nowait(data)
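# Hypothetical usage of _notify: each subscriber registers an unbounded
# queue, so put_nowait never raises Full and the broadcaster never blocks.
from gevent.queue import Queue

waiters = set()

def subscribe():
    q = Queue()   # unbounded, so _notify's put_nowait always succeeds
    waiters.add(q)
    return q

q = subscribe()
_notify(waiters, {'event': 'hello'})
print q.get()     # -> {'event': 'hello'}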