def process(keyword):
    """Search Sina Weibo for ``keyword`` and store the extracted results in SSDB.

    A proxy address is picked at random from ``tools.getIP()``, a PhantomJS
    driver is started behind that proxy, and the ``Sina`` helper logs in and
    runs the search. Result pages are read until at least 100 items have been
    collected or ``sina.nextPage()`` reports that there is no further page.
    The collected list is saved as JSON under the key
    ``weibo_<keyword>_<time string>``.

    :param keyword: search term; also embedded in the SSDB key.
    :raises ValueError: if no proxy address is available.
    """
    addr_list = tools.getIP()
    if not addr_list:
        # The original randint(0, -1) call raised ValueError here as well;
        # keep the exception type but make the reason explicit.
        raise ValueError("no proxy address available")
    addr = random.choice(addr_list)  # pick one proxy IP at random

    # Proxy settings.
    proxy = Proxy(
        {
            'proxy_type': ProxyType.MANUAL,
            'http_proxy': addr
        }
    )
    # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared class-level
    # dict, and add_to_capabilities() writes the proxy settings into it.
    desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    proxy.add_to_capabilities(desired_capabilities)

    # 1) Build the driver and set the window size.
    driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
    try:
        driver.maximize_window()

        # 2) Instantiate the Sina helper around the driver.
        sina = Sina(DRIVER=driver)

        # 3) Log in. Two login flows exist; sina.login() is the alternative.
        sina.login2()

        # 4) Run the search.
        sina.search(keyword)

        # Connect to the SSDB database.
        client = SSDB(host='', port=8884)

        # Collect the result set: at least 100 items unless pages run out.
        results = []
        page = 1
        while len(results) < 100:
            print("+++++++++++++++++++++++++++++++++++++++ page %s" % page)
            results.extend(sina.extract())
            page += 1
            if not sina.nextPage():
                break

        key = "weibo_" + keyword + "_" + tools.getTimeAsStr()
        client.set(key, json.dumps(results))
        print("ssdb save %s %s" % (key, len(results)))
    finally:
        # Always shut the browser down so no PhantomJS process is left behind,
        # even when login, search or extraction raises.
        driver.quit()
class SSDBKV(object):
    """Small key/value facade over an SSDB client that uses a blocking connection pool."""

    def __init__(self, host="127.0.0.1", port=8888, max_connections=10, timeout=60):
        """Remember the connection settings and create the pooled client.

        :param host: SSDB server host.
        :param port: SSDB server port.
        :param max_connections: passed to the pool as ``max_connections``.
        :param timeout: passed to the pool as ``timeout``.
        """
        self.host, self.port = host, port
        self.max_connections, self.timeout = max_connections, timeout
        connection_pool = BlockingConnectionPool(
            connection_class=Connection,
            max_connections=max_connections,
            timeout=timeout,
            host=host,
            port=port,
        )
        self.ssdb = SSDB(connection_pool=connection_pool)

    def set(self, key, value):
        """Store ``value`` under ``key``; returns the client's result."""
        client = self.ssdb
        return client.set(key, value)

    def get(self, key):
        """Return whatever the client returns for ``key``."""
        client = self.ssdb
        return client.get(key)

    def delete(self, key):
        """Delete ``key``; returns the client's result."""
        client = self.ssdb
        return client.delete(key)

    def keys(self, name_start=0, name_end=0xFFFFFFFF, limit=10):
        """Forward ``name_start``, ``name_end`` and ``limit`` positionally to the client's ``keys``."""
        client = self.ssdb
        return client.keys(name_start, name_end, limit)

    def exists(self, key):
        """Ask the client whether ``key`` exists."""
        client = self.ssdb
        return client.exists(key)
# Record, for every user in ``res``, when the user was last crawled/updated.
# ``res``, ``timestamp``, ``ssdb`` and ``_id`` are defined before this fragment.
values = {i["username"]: timestamp for i in res}
ssdb.multi_zset("ig-last-crawled", **values)
ssdb.multi_zset("ig-last-updated", **values)

# Map each user's id to the username.
# NOTE(review): ``_id`` is used as a variable holding the id field name;
# confirm it is defined earlier and is not a typo for a literal key.
values = {i[_id]: i["username"] for i in res}
ssdb.multi_hset("ig-username-id", **values)

# Per-user data from results.json. The column list is loop-invariant.
cols = ["profile_pic_url", "full_name", "followers", "following", "username"]
with open("results.json") as fh:  # close the handle instead of leaking it
    users = json.load(fh)
for c, user in enumerate(users):
    print(c)
    un = user["username"]
    # Use a distinct comprehension variable: on Python 2 the original
    # ``[user[c] for c in cols]`` overwrote the loop counter ``c``.
    info = {col: user[col] for col in cols}

    # Information
    ssdb.set("ig-{0}-user-bio".format(un), user["biography"])
    # NOTE(review): ``info`` is a dict passed as-is; confirm the client
    # serialises it the way readers of this key expect.
    ssdb.set("ig-{0}-user-info".format(un), info)

    # TODO multizset
    ssdb.zset("ig-{0}-followers".format(un), timestamp, user["followers"])
    ssdb.zset("ig-{0}-following".format(un), timestamp, user["following"])
    ssdb.zset("ig-{0}-picture-count".format(un), timestamp, user["picture-count"])

# Register the picture codes found in every "pictures-*" file of the
# current directory, each with score 0.
onlyfiles = [f for f in listdir(".") if isfile(join(".", f))]
onlyfiles = [i for i in onlyfiles if "pictures-" in i]
for file_name in onlyfiles:
    timestamp = arrow.utcnow().timestamp
    with open(file_name) as fh:
        res = json.load(fh)
    # The original line was missing the closing bracket of the inner pair
    # (``[i["code"], 0 for i in res]``), which is a syntax error.
    values = {i["code"]: 0 for i in res}
    ssdb.multi_zset("instagram-pictures", **values)
class QueueSSDB(QueueBase.QueueBase):
    """SSDB-backed implementation of the ``QueueBase`` interface.

    Queue, sorted-set and most hash helpers operate on the structure named
    ``self.name``; the key/value helpers and a few hash helpers take the
    name explicitly. ``dict`` and ``list`` values are stored as UTF-8
    encoded JSON by ``put``, ``save``, ``keySet`` and ``hset``.

    Most methods are wrapped by ``QueueBase.catch``, which is defined
    outside this class (presumably it handles client errors; see QueueBase).
    """

    def __init__(self, name, host='localhost', port=8888, **kwargs):
        """Create the pooled SSDB client.

        :param name: name of the queue / set / hash to operate on.
        :param host: SSDB server host.
        :param port: SSDB server port.
        :param kwargs: accepted for interface compatibility; not used here.
        """
        QueueBase.QueueBase.__init__(self, name, host, port)
        self.__conn = SSDB(connection_pool=BlockingConnectionPool(host=self.host, port=self.port))

    @staticmethod
    def _serialize(value):
        """Return ``dict``/``list`` values as UTF-8 JSON bytes, anything else unchanged."""
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False).encode('utf-8')
        return value

    #queue
    @QueueBase.catch
    def put(self, value, *args, **kwargs):
        """
        Put an item at the back of the queue.

        :param value: item to push; dict/list values are JSON-encoded first.
        :param args: unused.
        :param kwargs: unused.
        :return: result of the client's ``qpush_back``.
        """
        return self.__conn.qpush_back(self.name, self._serialize(value))

    def save(self, value, *args, **kwargs):
        """
        Put an item at the back of the queue.

        Same operation as :meth:`put`, but not wrapped by ``QueueBase.catch``.

        :param value: item to push; dict/list values are JSON-encoded first.
        :param args: unused.
        :param kwargs: unused.
        :return: result of the client's ``qpush_back``.
        """
        return self.__conn.qpush_back(self.name, self._serialize(value))

    @QueueBase.catch
    def get(self, *args, **kwargs):
        """
        Pop one element from the front of the queue.

        :param args: unused.
        :param kwargs: unused.
        :return: the first popped element, or the client's falsy result
                 unchanged when nothing was popped.
        """
        value = self.__conn.qpop_front(self.name)
        return value[0] if value else value

    @QueueBase.catch
    def getMore(self, *args, **kwargs):
        """
        Pop elements from the front of the queue.

        :param args: unused.
        :param kwargs: forwarded to the client's ``qpop_front``.
        :return: whatever ``qpop_front`` returns.
        """
        value = self.__conn.qpop_front(self.name, **kwargs)
        return value

    @QueueBase.catch
    def size(self, *args, **kwargs):
        """Return the client's ``qsize`` for the current queue."""
        return self.__conn.qsize(self.name)

    @QueueBase.catch
    def changeTable(self, name):
        """
        Change the name of the queue to operate on.

        :param name: new queue name.
        :return: None
        """
        self.name = name

    @QueueBase.catch
    def select_queue(self, name):
        """
        Change the name of the queue to operate on.

        :param name: new queue name.
        :return: None
        """
        self.name = name

    @QueueBase.catch
    def qclaerQueue(self):
        """Clear the current queue (method name kept as-is for existing callers)."""
        return self.__conn.qclear(self.name)

    #KV
    @QueueBase.catch
    def keySet(self, key, value):
        """
        Set the value at ``key`` to ``value``.

        :param key: key name.
        :param value: value to store; dict/list values are JSON-encoded first.
        :return: result of the client's ``set``.
        """
        value = self._serialize(value)
        return self.__conn.set(key, value)

    @QueueBase.catch
    def keySetx(self, name, value, ttl=-1):
        """
        Set the value of key ``name`` to ``value`` that expires in ``ttl``
        seconds. ``ttl`` can be represented by an integer or a Python
        timedelta object.

        Unlike :meth:`keySet`, the value is passed through without JSON encoding.

        :param name: key name.
        :param value: value to store.
        :param ttl: time to live.
        :return: result of the client's ``setx``.
        """
        return self.__conn.setx(name, value, ttl=ttl)

    @QueueBase.catch
    def keyTtl(self, key):
        """
        Return the number of seconds until ``key`` will expire.

        :param key: key name.
        :return: result of the client's ``ttl``.
        """
        # The original discarded this result, so callers always got None.
        return self.__conn.ttl(key)

    @QueueBase.catch
    def keyGet(self, key):
        """
        Return the value at ``key``, or ``None`` if the key doesn't exist.

        :param key: key name.
        :return: result of the client's ``get``.
        """
        return self.__conn.get(key)

    @QueueBase.catch
    def keyDel(self, key):
        """
        Delete the key specified by ``key``.

        :param key: key name.
        :return: result of the client's ``delete``.
        """
        return self.__conn.delete(key)

    @QueueBase.catch
    def keyKeys(self, key_start='', key_end=''):
        """
        Return up to 100000 keys between ``key_start`` and ``key_end``.

        :param key_start: lower bound of the key range.
        :param key_end: upper bound of the key range.
        :return: result of the client's ``keys``.
        """
        return self.__conn.keys(name_start=key_start, name_end=key_end, limit=100000)

    @QueueBase.catch
    def keyexists(self, key):
        """
        Ask the client whether ``key`` exists.

        :param key: key name.
        :return: result of the client's ``exists``.
        """
        return self.__conn.exists(key)

    #SET
    @QueueBase.catch
    def zsetSet(self, field, score=1):
        """
        Add ``field`` to the sorted set ``self.name`` with ``score``.

        dict/list fields are JSON-encoded, and the field is truncated to its
        first 100 characters. A falsy ``field`` is ignored.

        :param field: member to add.
        :param score: score for the member.
        :return: result of the client's ``zset``, or None when ``field`` is falsy.
        """
        if not field:
            return None
        if isinstance(field, (dict, list)):
            field = json.dumps(field)
        # Slicing is a no-op for fields shorter than 100 characters.
        field = field[:100]
        return self.__conn.zset(self.name, field, score)

    @QueueBase.catch
    def zgetSet(self, key):
        """Return the client's ``zget`` result for ``key`` in the sorted set ``self.name``."""
        return self.__conn.zget(self.name, key)

    @QueueBase.catch
    def zexistsSet(self, name, field):
        """Ask the client whether ``field`` exists in the sorted set ``name``."""
        return self.__conn.zexists(name, field)

    @QueueBase.catch
    def zkeysSet(self):
        """Return the keys of the sorted set ``self.name`` (limit 100000000)."""
        return self.__conn.zkeys(self.name, '', '', '', limit=100000000)

    @QueueBase.catch
    def zdelSet(self, key):
        """Delete ``key`` from the sorted set ``self.name``."""
        return self.__conn.zdel(self.name, key)

    @QueueBase.catch
    def multi_zgetSet(self, *keys):
        """Return the client's ``multi_zget`` result for ``keys`` in the sorted set ``self.name``."""
        return self.__conn.multi_zget(self.name, *keys)

    #Hash
    @QueueBase.catch
    def hgetallHash(self, key):
        """Return the client's ``hgetall`` result for the hash named ``key``."""
        return self.__conn.hgetall(key)

    @QueueBase.catch
    def hincrHash(self, name, key):
        """Increment field ``key`` of the hash ``name`` by 1."""
        return self.__conn.hincr(name, key, amount=1)

    @QueueBase.catch
    def multi_hsetHash(self, name, **mapping):
        """Set several fields of the hash ``name`` from ``mapping``."""
        return self.__conn.multi_hset(name, **mapping)

    @QueueBase.catch
    def hlistHash(self, start, end):
        """Return the client's ``hlist`` result between ``start`` and ``end`` (limit 10000000)."""
        return self.__conn.hlist(start, end, limit=10000000)

    @QueueBase.catch
    def hclearHash(self, key):
        """Clear the hash named ``key``."""
        return self.__conn.hclear(key)

    @QueueBase.catch
    def hset(self, key, value):
        """
        Set field ``key`` of the hash ``self.name`` to ``value``.

        :param key: field name.
        :param value: value to store; dict/list values are JSON-encoded first.
        :return: result of the client's ``hset``.
        """
        return self.__conn.hset(self.name, key, self._serialize(value))

    @QueueBase.catch
    def hsize(self):
        """Return the client's ``hsize`` for the hash ``self.name``."""
        return self.__conn.hsize(self.name)

    @QueueBase.catch
    def hget(self, key=None):
        """
        Read a field of the hash ``self.name``.

        With a truthy ``key`` the field is returned and left in place.
        Without one, the first field reported by ``hkeys`` is read, deleted
        from the hash and its value returned. The read and the delete are
        separate client calls.

        :param key: field name, or None to take one field off the hash.
        :return: the field value, or None when the hash has no fields.
        """
        if key:
            return self.__conn.hget(self.name, key)
        if self.__conn.hsize(self.name) > 0:
            keys = self.__conn.hkeys(self.name, "", "", limit=1)
            if keys:
                key = keys[0]
                v = self.__conn.hget(self.name, key)
                self.__conn.hdel(self.name, key)
                return v
        return None