def fetch_source_URI(self, sourceURI):
    """Fetch sourceURI (and everything reachable from it) into the cache.

    Spawns FETCHING_THREAD_NUM _URIParser worker threads that share a
    visited-set and a work queue, waits for all of them, then records the
    fetched items in Redis and organizes the combined result.

    :param sourceURI: root URI to crawl; must pass Util.is_valid_uri.
    :return: list of compiled URI items gathered by all worker threads.
    :raises CachingError: if sourceURI is not a valid source URI.
    """
    if not Util.is_valid_uri(sourceURI, is_source_uri=True):
        raise CachingError("Invalid URI: %s" % sourceURI)
    visited = set()
    uri_queue = Queue()
    uri_queue.put(sourceURI)
    # FIX: mark the root as visited so workers cannot re-queue it.
    # get_compiled_URIs already does this; the two entry points now agree.
    visited.add(sourceURI)
    thread_list = []
    for index in xrange(FETCHING_THREAD_NUM):
        parser = _URIParser(visited, uri_queue,
                            cache_root=self.cache_dir,
                            fetch_data=True,
                            print_out=self.print_out)
        thread_list.append(parser)
        parser.start()
    compiled_list = list()
    for t in thread_list:
        t.join()
        compiled_list.extend(t.compiled_list)
    # update to REDIS since it saved to cache
    self.update_cachedfile_info(compiled_list)
    Util.organize_compiled_list(compiled_list)
    return compiled_list
def get_compiled_URIs(cache_root, sourceURI): """Return list of URIItem Items in the URIItem list has tree structure, which is necessary for FUSE """ if sourceURI.endswith("/") == False: sourceURI += "/" if not Util.is_valid_uri(sourceURI, is_source_uri=True): msg = "Invalid URI: %s" % sourceURI raise CachingError(msg) uri_queue = Queue() visited = set() uri_queue.put(sourceURI) visited.add(sourceURI) thread_list = [] for index in xrange(FETCHING_THREAD_NUM): parser = _URIParser(visited, uri_queue, \ cache_root=cache_root, fetch_data=False) thread_list.append(parser) parser.start() compiled_list = list() try: while len(thread_list) > 0: t = thread_list[0] t.join(timeout=1.0) if not t.is_alive(): compiled_list.extend(t.compiled_list) thread_list.remove(t) except KeyboardInterrupt, e: for t in thread_list: t.terminate() t.join() sys.stderr.write("Keyboard Interrupt")
def download_page(self,page): print "page:",page url = "http://123.57.80.206:8080/user/search?pagesize=20&business=&uid=725&industy=&area=&page="+str(page) print "url:",url stime = time.time() print "<download start> at time: %s"%time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(stime)) #get rss data urllib2.socket.setdefaulttimeout(10) response = urllib2.urlopen(url) print response.headers data = response.read() response.close() etime = time.time() print "<download end> at time: %s cost:%f"%(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(etime)),etime-stime) # parse stime = etime self.parse_json(page,json.loads(data) ); etime = time.time() print "<parse end> at time: %s cost:%f"%(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(etime)),etime-stime) self.db.close() def start(self,page): self.download_page(page) #parser = WeiboParser("http://m.tvie.com.cn/mcms/api2/mod/sns/feeds.php?uid=2214257545") #parser = WeiboParser("http://m.tvie.com.cn/mcms/api2/mod/sns/feeds.php?uid=1640601392") page = int(sys.argv[1]) parser = ContactParser() parser.start(page) #print sohu.jsonMap