def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(SougouWeixinFilterSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the Sogou-Weixin section of the XPath configuration from sougou.json.
    conf_root = confUtil.getJsonStr("sougou.json")
    self.xpathConf = conf_root.get("sougou").get("sougou_weixin")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(BaiduSinaNewsFilterSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the Sina-news section of the XPath configuration from baidu.json.
    conf_root = confUtil.getJsonStr("baidu.json")
    self.xpathConf = conf_root.get("baidu").get("sina_news")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def _process_item(self, item, spider):
    """Persist one scraped item into HBase.

    Builds the row's (column, value) mutations from ``item_key`` and writes
    them as a single put keyed by the MD5 of the item's key.  Returns the
    item unchanged so later pipeline stages still receive it.
    """
    cols, vals, key = self.item_key(item, spider)
    # Debug `print` statements that dumped cols/vals/key per item were
    # removed: they spammed stdout on every processed item.
    mutations = [Mutation(column=col, value=val)
                 for col, val in zip(cols, vals)]
    self.client.mutateRow(self.tableName, confUtil.getMd5(key), mutations, None)
    return item
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(BaiduSearchBySiteKeywordSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the search-page section of the XPath configuration from baidu.json.
    conf_root = confUtil.getJsonStr("baidu.json")
    self.xpathConf = conf_root.get("baidu").get("search_conf")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboSearchStartSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # XPath config: weibo.json -> "weibo_com" -> section named by spider_type.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_com").get(self.spider_type)
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(SougouWeixinFilterSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the Sogou-Weixin section of the XPath configuration from sougou.json.
    conf_root = confUtil.getJsonStr("sougou.json")
    self.xpathConf = conf_root.get("sougou").get("sougou_weixin")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(BaiduSearchBySiteKeywordSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the search-page section of the XPath configuration from baidu.json.
    conf_root = confUtil.getJsonStr("baidu.json")
    self.xpathConf = conf_root.get("baidu").get("search_conf")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboSearchStartSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # XPath config: weibo.json -> "weibo_com" -> section named by spider_type.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_com").get(self.spider_type)
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(BaiduSinaNewsFilterSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the Sina-news section of the XPath configuration from baidu.json.
    conf_root = confUtil.getJsonStr("baidu.json")
    self.xpathConf = conf_root.get("baidu").get("sina_news")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeixinContentSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # XPath config: weixin.json -> section for this site -> this spider type.
    conf_root = confUtil.getJsonStr("weixin.json")
    self.xpathConf = conf_root.get(self.siteName).get(self.spider_type)
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
    # Content spiders also extract an article title.
    self.itemKeys.append("title")
def hbase_tables(self):
    """Smoke-test helper: list HBase tables, then write one hard-coded sample row."""
    # Show what tables exist on the connected HBase instance.
    print(self.client.getTableNames())
    # Fixed sample row: one (column, value) pair per detail field.
    row = [
        ('detail:publish_time', '2015-03-10 02:39'),
        ('detail:site_source', 'news.sina.com.cn'),
        ('detail:site_type', 'news'),
        ('detail:site_url',
         'http://news.sina.com.cn/c/2015-03-10/023931587440.shtml'),
        ('detail:task_id', '-1'),
        ('detail:author', u'\u4eac\u534e\u65f6\u62a5'.encode("utf-8")),
        ('detail:catch_date', '2015-03-27'),
    ]
    key = "http://news.sina.com.cn/c/2015-03-10/023931587440.shtml"
    # Row key is the MD5 of the article URL.
    print(confUtil.getMd5(key))
    mutations = [Mutation(column=c, value=v) for c, v in row]
    self.client.mutateRow(self.tableName, confUtil.getMd5(key), mutations, None)
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboComUserInfoContentSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Redis key the crawl results are written to: the part of `name`
    # before the first ":" (raises ValueError if no colon is present).
    self.out_key = self.name[:self.name.index(":")]
    # Pull the user-info section of the XPath configuration from weibo.json.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_cn").get("user_info")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeixinContentSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # XPath config: weixin.json -> section for this site -> this spider type.
    conf_root = confUtil.getJsonStr("weixin.json")
    self.xpathConf = conf_root.get(self.siteName).get(self.spider_type)
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
    # Content spiders also extract an article title.
    self.itemKeys.append("title")
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboComUserInfoContentSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Redis key the crawl results are written to: the part of `name`
    # before the first ":" (raises ValueError if no colon is present).
    self.out_key = self.name[:self.name.index(":")]
    # Pull the user-info section of the XPath configuration from weibo.json.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_cn").get("user_info")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
def _process_item(self, item, spider):
    """Persist one scraped item into HBase.

    Builds the row's (column, value) mutations from ``item_key`` and writes
    them as a single put keyed by the MD5 of the item's key.  Returns the
    item unchanged so later pipeline stages still receive it.
    """
    cols, vals, key = self.item_key(item, spider)
    # Debug `print` statements that dumped cols/vals/key per item were
    # removed: they spammed stdout on every processed item.
    mutations = [Mutation(column=col, value=val)
                 for col, val in zip(cols, vals)]
    self.client.mutateRow(self.tableName, confUtil.getMd5(key), mutations, None)
    return item
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboComSearchSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the search-page section of the XPath configuration from weibo.json.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_com").get("search_conf")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
    # Search results carry extra per-post fields beyond the base item keys.
    self.itemKeys.extend(['attitude', 'comments', 'repost', 'user_url'])
def __init__(self, **kwargs):
    """Initialize the spider from keyword arguments and load its XPath config."""
    super(WeiboComSearchSpider, self).__init__(**kwargs)
    self.name = kwargs.get("name")
    self.redis_key = kwargs.get("redis_key")
    # Pull the search-page section of the XPath configuration from weibo.json.
    conf_root = confUtil.getJsonStr("weibo.json")
    self.xpathConf = conf_root.get("weibo_com").get("search_conf")
    # Element the page-load wait condition keys on.
    self.Wait_Element = self.xpathConf.get("wait_element")
    # Search results carry extra per-post fields beyond the base item keys.
    self.itemKeys.extend(['attitude', 'comments', 'repost', 'user_url'])
def hbase_tables(self):
    """Smoke-test helper: list HBase tables, then write one hard-coded sample row."""
    # Show what tables exist on the connected HBase instance.
    print(self.client.getTableNames())
    # Fixed sample row: one (column, value) pair per detail field.
    row = [
        ('detail:publish_time', '2015-03-10 02:39'),
        ('detail:site_source', 'news.sina.com.cn'),
        ('detail:site_type', 'news'),
        ('detail:site_url',
         'http://news.sina.com.cn/c/2015-03-10/023931587440.shtml'),
        ('detail:task_id', '-1'),
        ('detail:author', u'\u4eac\u534e\u65f6\u62a5'.encode("utf-8")),
        ('detail:catch_date', '2015-03-27'),
    ]
    key = "http://news.sina.com.cn/c/2015-03-10/023931587440.shtml"
    # Row key is the MD5 of the article URL.
    print(confUtil.getMd5(key))
    mutations = [Mutation(column=c, value=v) for c, v in row]
    self.client.mutateRow(self.tableName, confUtil.getMd5(key), mutations, None)
def __init__(self):
    """Capture this machine's IP and set up Redis configuration access."""
    # Helper for reading Redis connection settings.
    self.redisConfUtil = RedisConfUtil()
    # Cache the local IP once at construction time.
    self.local_ip = confUtil.getLocalIp()
def __init__(self):
    """Capture this machine's IP and set up Redis configuration access."""
    # Helper for reading Redis connection settings.
    self.redisConfUtil = RedisConfUtil()
    # Cache the local IP once at construction time.
    self.local_ip = confUtil.getLocalIp()