Example #1
    def __init__(self, server, key, debug=False, **kwargs):
        super().__init__(server, key, debug)
        spider_settings = kwargs.get('spider_settings')
        if not spider_settings:
            raise EnvironmentError(
                "Please ensure you are using 'scrapy_ddiy.utils.scheduler.SchedulerDdiy' as the SCHEDULER."
            )

        self.server = Client(host=spider_settings.get('REDIS_HOST'),
                             port=spider_settings.get('REDIS_PORT'),
                             **spider_settings.get('REDIS_PARAMS'))
        assert self.server.ping(), 'Redis failed to establish a connection, please check the settings'
        error_rate = spider_settings.getfloat('REDIS_BLOOM_ERROR_RATE')
        capacity = spider_settings.getint('REDIS_BLOOM_CAPACITY')
        assert capacity, "Please set the 'REDIS_BLOOM_CAPACITY' for the spider"
        assert error_rate, "Please set the 'REDIS_BLOOM_ERROR_RATE' for the spider"
        if not self.server.keys(self.key):
            try:
                # By default, bloom-filter is auto-scaling
                self.server.bfCreate(self.key, error_rate, capacity)
            except redis.exceptions.ResponseError:
                raise EnvironmentError(
                    'Redis has not loaded the RedisBloom module. See the doc [ xx ]'
                )
Example #2
 def __init__(self,
              *,
              redis_client: Client = None,
              redis_host: str = None,
              redis_port: int = None,
              max_command_params: int = DEFAULT_MAX_COMMAND_PARAMS):
     if redis_client is not None:
         self.redis_client = redis_client
     else:
         self.redis_client = Client(host=redis_host, port=redis_port)
     self.__max_command_params = max_command_params
Example #3
    def connect_to_redis(self):
        try:
            self.conn = Client(host=self.host,
                               port=self.port,
                               db=self.db,
                               password=self.pwd)
            # The client connects lazily; ping() forces a round trip so
            # connection errors actually surface here.
            self.conn.ping()
        except Exception as e:
            print(e)
            return False

        return True
Example #4
class RedisBloomFilterClient(BloomFilterClient):
    """
    A RedisBloom-based bloom filter client.
    """
    def __init__(self,
                 *,
                 redis_client: Client = None,
                 redis_host: str = None,
                 redis_port: int = None,
                 max_command_params: int = DEFAULT_MAX_COMMAND_PARAMS):
        if redis_client is not None:
            self.redis_client = redis_client
        else:
            self.redis_client = Client(host=redis_host, port=redis_port)
        self.__max_command_params = max_command_params

    def exists(self,
               key: str,
               objects: List[T],
               value_func: Callable[[T], str] = str) -> List[T]:
        # If the bloom filter key doesn't exist, all values should be returned.

        if not key or not objects or not self.redis_client.exists(key):
            return objects

        if not isinstance(objects, list):
            raise ValueError("The objects parameter must be a list")

        # Split the provided object list according to the maximum number of parameters allowed per Redis command.

        if self.__max_command_params:
            commands = list(chunked(objects, self.__max_command_params))
        else:
            commands = [objects]
        else:
            commands = [objects]

        # Create a pipeline to send all the commands to Redis at once.

        pipeline = self.redis_client.pipeline(transaction=False)

        for command in commands:
            pipeline.bfMExists(key, *map(value_func, command))

        # Execute and get results for all the pipelined commands.

        results = pipeline.execute()

        # Build the final results.

        return list(
            chain.from_iterable(
                compress(chunk, result)
                for chunk, result in zip(commands, results)))
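
A short usage sketch for the client above (the key name and URLs are illustrative assumptions):

client = RedisBloomFilterClient(redis_host='localhost', redis_port=6379)
urls = ['https://a.example', 'https://b.example']
# Returns the subset of urls that the pipelined bfMExists calls report as present.
maybe_seen = client.exists('crawled-urls', urls)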
Example #5
    def __init__(self, settings, stats):
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.stats = stats

        REDIS_HOST = self.settings.get('REDIS_HOST')
        REDIS_PORT = self.settings.get('REDIS_PORT')
        REDIS_PASSWORD = self.settings.get('REDIS_PASSWORD')

        try:
            self.rb = Client(host=REDIS_HOST,
                             port=REDIS_PORT,
                             password=REDIS_PASSWORD)
            # The client connects lazily; ping() verifies the connection.
            self.rb.ping()
            self.logger.info("Successfully connected to redis server")
        except Exception as e:
            self.logger.error(f"Unable to connect to redis server: {e}")
Example #6
 def __init__(self, config):
     """Follow类初始化"""
     self.rb = Client()
     self.filter_redis_key = 'uidfilter'
     self.validate_config(config)
     self.cookie = {'Cookie': config['cookie']}
     user_id_list = config['user_id_list']
     if not isinstance(user_id_list, list):
         if not os.path.isabs(user_id_list):
             user_id_list = os.path.split(
                 os.path.realpath(__file__))[0] + os.sep + user_id_list
         user_id_list = self.get_user_list(user_id_list)
     self.user_id_list = user_id_list  # user_id list of the Weibo users to crawl
     self.user_id = ''
     self.follow_list = []  # uri and nickname of every followed account crawled
     self.fans_list = []  # uri and nickname of every fan account crawled
     self.file_name = 'user_id_list' + str(time()) + '.txt'
Example #7
def load_data():

    if environ.get('REDIS_SERVER') is not None:
        redis_server = environ.get('REDIS_SERVER')
    else:
        redis_server = 'localhost'

    if environ.get('REDIS_PORT') is not None:
        redis_port = int(environ.get('REDIS_PORT'))
    else:
        redis_port = 6379

    if environ.get('REDIS_PASSWORD') is not None:
        redis_password = environ.get('REDIS_PASSWORD')
    else:
        redis_password = ''

    rdb = redis.Redis(host=redis_server,
                      port=redis_port,
                      password=redis_password)
    rb = RedisBloom(host=redis_server,
                    port=redis_port,
                    password=redis_password)
    rts = RedisTimeseries(host=redis_server,
                          port=redis_port,
                          password=redis_password)

    rdb.set("CONFIG", "YES")

    rts.create('s-unfiltered', retention_ms=60000)
    rts.create('s-filtered', retention_ms=60000)
    rts.create('unfiltered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.create('filtered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.createrule('s-unfiltered', 'unfiltered', 'last', 1000)
    rts.createrule('s-filtered', 'filtered', 'last', 1000)

    for gear in ['./dedup.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())

    if environ.get('REDIS_SCRABBLE') is not None:
        for line in fileinput.input("2019_Collins_Scrabble_Words.txt"):
            rb.bfAdd("Scrabble-Bloom", line.rstrip())
Example #8
class RedisManager():
    def __init__(self, settings, stats):
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.stats = stats

        REDIS_HOST = self.settings.get('REDIS_HOST')
        REDIS_PORT = self.settings.get('REDIS_PORT')
        REDIS_PASSWORD = self.settings.get('REDIS_PASSWORD')

        try:
            self.rb = Client(host=REDIS_HOST,
                             port=REDIS_PORT,
                             password=REDIS_PASSWORD)
            # The client connects lazily; ping() verifies the connection.
            self.rb.ping()
            self.logger.info("Successfully connected to redis server")
        except Exception as e:
            self.logger.error(f"Unable to connect to redis server: {e}")

    def _bf_add_url_(self, url):
        try:
            bf_add = self.rb.bfAdd('bf_urls', url)
            if bf_add:
                self.stats.inc_value('redis/bloomfilter/added_urls')
                self.logger.info(f"Added '{url}' to bloomfilter.")
            else:
                self.logger.error(f"Couldn't add '{url}' to bloomfilter")
        except Exception as e:
            self.logger.error(e)

    def _bf_check_url_pres_(self, url):
        if self.rb.bfExists('bf_urls', url):
            self.logger.debug(f"Found '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/existing_urls')
            return True
        else:
            self.logger.debug(f"Couldn't find '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/not_existing_urls')
            return False


# if __name__ == '__main__':
#     rm = RedisManager()
#     rm._bf_add_url_("test1")
Example #9
    def __init__(self):
        host = os.getenv("REDIS_HOST")
        port = os.getenv("REDIS_PORT")

        if not host or not port:
            raise Exception(
                "No Redis host or port provided. Please pass REDIS_HOST and REDIS_PORT as environment variables to docker run."
            )

        port = int(port)
        self.redis_client = redis.Redis(host=host, port=port)
        self.bloom_client = Client(host=host, port=port)
Example #10
class RedisBloomDupeFilter(RFPDupeFilter):
    """
    Redis-bloom request duplicates filter for redis-spider.
    This class can also be used with default Scrapy's scheduler.
    """
    def __init__(self, server, key, debug=False, **kwargs):
        super().__init__(server, key, debug)
        spider_settings = kwargs.get('spider_settings')
        if not spider_settings:
            raise EnvironmentError(
                "Please ensure you are using 'scrapy_ddiy.utils.scheduler.SchedulerDdiy' as the SCHEDULER."
            )

        self.server = Client(host=spider_settings.get('REDIS_HOST'),
                             port=spider_settings.get('REDIS_PORT'),
                             **spider_settings.get('REDIS_PARAMS'))
        assert self.server.ping(), 'Redis failed to establish a connection, please check the settings'
        error_rate = spider_settings.getfloat('REDIS_BLOOM_ERROR_RATE')
        capacity = spider_settings.getint('REDIS_BLOOM_CAPACITY')
        assert capacity, "Please set the 'REDIS_BLOOM_CAPACITY' for the spider"
        assert error_rate, "Please set the 'REDIS_BLOOM_ERROR_RATE' for the spider"
        if not self.server.keys(self.key):
            try:
                # By default, bloom-filter is auto-scaling
                self.server.bfCreate(self.key, error_rate, capacity)
            except redis.exceptions.ResponseError:
                raise EnvironmentError(
                    'Redis has not loaded the RedisBloom module. See the doc [ xx ]'
                )

    def request_seen(self, request):
        """Returns True if request was already seen"""
        fp = self.request_fingerprint(request)
        # bfAdd returns 1 if the item was newly added, 0 if it probably already exists.
        added = self.server.bfAdd(self.key, fp)
        return added == 0
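
For context, a minimal settings sketch for wiring the filter above into a Scrapy project, using only the setting names its code reads (all values are illustrative assumptions):

# settings.py (hypothetical values; the dupefilter path depends on your project layout)
SCHEDULER = 'scrapy_ddiy.utils.scheduler.SchedulerDdiy'
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_PARAMS = {}
REDIS_BLOOM_ERROR_RATE = 0.001
REDIS_BLOOM_CAPACITY = 100000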
Example #11
def setup_rebloom():
  # create the client
  client = Client()

  # remove any old keys
  client.delete('ufo_words', 'ufo_shapes')

  # setup some Top-K action!
  client.topkReserve('ufo_words', k=10, width=400, depth=10, decay=0.9)
  client.topkReserve('ufo_shapes', k=10, width=20, depth=10, decay=0.9)

  # return the client
  return client
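
A sketch of how the reserved Top-K keys might be fed (the sample report text is an assumption):

client = setup_rebloom()

# Count one shape and the words of one sighting report; topkAdd returns
# any items that were evicted from the Top-K list.
report = 'bright disk hovering over the field'
client.topkAdd('ufo_shapes', 'disk')
client.topkAdd('ufo_words', *report.split())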
Example #12
    @classmethod
    def redisbloom_client(cls, host: str, port: int):
        """
        Returns a redisbloom Client, installing the redisbloom package if necessary.  If/when a proper virtualenv
        setup is available on Glue and redisbloom can be pre-installed, this function will still serve to create
        as few instances of the redisbloom Client as possible (one per forked Python thread per Spark executor).

        NOTE: Intended ONLY for use on a Glue (PySpark) executor, likely as the first step of a foreachPartition statement.

        :param host: The Redis host URL
        :param port: The Redis port
        :return: An instance of redisbloom.client.Client
        """
        if (host, port) not in cls.__redisbloom_client:
            try:
                from redisbloom.client import Client
            except ImportError:
                # Only install the redisbloom package to fix the failed import
                cls.__install_packages(['redisbloom==0.4.0'])
                from redisbloom.client import Client
            cls.__redisbloom_client[(host, port)] = Client(host=host,
                                                           port=port)
        return cls.__redisbloom_client[(host, port)]
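
A hypothetical foreachPartition usage of the factory above (the class name Helper, the host, and the bloom filter key are assumptions):

def mark_seen(rows):
    # One cached client per forked executor worker, as described above.
    client = Helper.redisbloom_client('redis.example.com', 6379)
    for row in rows:
        client.bfAdd('seen_ids', row['id'])

# df.foreachPartition(mark_seen)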
Example #13
    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize last two layers of cache

        :param node_name:
        :param lru_size:
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)

        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True

        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)

        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()

        # Initialize the bloom filter (if it doesn't already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)
Example #14
from random import randint
from redisbloom.client import Client

ielements = 1024
offset = 123456789
t = 5 * 1048576
cfsize = 1024  # assumed capacity; the original snippet never defines cfsize
a = []

# Test element
test_element = randint(ielements, offset - 1)
filter_name = str(test_element)

# Create the Cuckoo filter
r = Client()
r.cfCreate(filter_name, cfsize)

# Insert a fraction of the elements
for x in range(1, ielements - 1):
    r.cfAdd(filter_name, str(x))

# Test a large number of elements that were never inserted; collect the
# correctly-rejected ones, so the false positives are the queries missing from a
for x in range(offset, t + offset):
    pos = r.cfExists(filter_name, str(x))
    # print(pos, x)
    if pos == 0:
        a.append(x)

# Print FPR and set size
print('FPR: {:.6%} ({} false positives out of {})'.format(
    1 - len(a) / t, t - len(a), t))
Example #15
from redisbloom.client import Client

# My Redis runs in Docker inside a VM, so use the VM's address and the exposed port
rb = Client(host='node01', port=6379)
rb.bfAdd('urls', 'baidu')
rb.bfAdd('urls', 'google')
print(rb.bfExists('urls', 'baidu'))  # out: 1
print(rb.bfExists('urls', 'tencent2'))  # out: 0

rb.bfMAdd('urls', 'a', 'b')
print(rb.bfMExists('urls', 'google', 'baidu', 'tencent'))  # out: [1, 1, 0]
Example #16
def redis_attack():

    try:

        # connect to Redis server
        # target item
        target = 'ASDFGHJKLZXCVBNM'

        # rb = Client()
        # rb.cmsInitByDim('dim', 1000, 5)
        # rb.cmsIncrBy('dim', ['foo'], [5])
        # rb.cmsIncrBy('dim', ['foo', 'bar'], [5, 15])
        # msg = rb.cmsQuery('dim', 'foo')
        count = 0
        dbkey = 'cms4'

        rb = Client()
        rb.cmsInitByDim(dbkey, 4096, 4)
        finish = True

        rb.cmsIncrBy(dbkey, [target], [5])
        estimate = rb.cmsQuery(dbkey, target)[0]
        attack_set = []  # avoid shadowing the builtin set

        print(estimate)

        while finish:

            # update the cms
            random_string = randomword(16)
            estimate = rb.cmsQuery(dbkey, target)[0]
            rb.cmsIncrBy(dbkey, [random_string], [1])
            count = count + 1

            if estimate < rb.cmsQuery(dbkey, target)[0]:
                print('Element Found!')
                print(random_string)
                attack_set.append(random_string)
                finish = False

                for i in range(10):
                    estimate = rb.cmsQuery(dbkey, target)[0]

                    for x in attack_set:
                        rb.cmsIncrBy(dbkey, [x], [10])

                    if estimate == rb.cmsQuery(dbkey, target)[0]:
                        finish = True
                        print('False positive!')
                        break

                else:
                    print("Finally finished!\n")

        print("Attack set found!")

        for x in attack_set:
            print(x)

        #print(msg)
        print("Test for the attack set... ")

        print("Target element estimate before attack -> " +
              str(rb.cmsQuery(dbkey, target)[0]))

        for x in attack_set:
            print("Inserting..." + x)
            rb.cmsIncrBy(dbkey, [x], [1])

        print("Target element estimate after attack -> " +
              str(rb.cmsQuery(dbkey, target)[0]))
        print("Operations needed -> " + str(count))
    except Exception as e:
        print(e)
Example #17
class Redis(BaseDb):
    '''
    Each proxy is stored as a hash keyed as proxy:IP:port, with the fields type, protocol, score, ctime
    '''

    __slots__ = ('_filter_name',)

    @property
    def filter_name(self):
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        try:
            self.conn = Client(host=self.host,
                               port=self.port,
                               db=self.db,
                               password=self.pwd)
            # The client connects lazily; ping() forces a round trip so
            # connection errors actually surface here.
            self.conn.ping()
        except Exception as e:
            print(e)
            return False

        return True

    def gen_key_name(self, record):
        # print(record)
        # print('ip' in record)
        # print('port' in record)
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        else:
            return None

    def exists(self, key_name):
        '''
        Check whether a key already exists the plain Redis way; kept for comparison with the bloom filter, not used in practice
        :param key_name:
        :return: 0(False)/1(True)
        '''
        return self.conn.exists(key_name)

    def delete(self, key_name):
        return self.conn.delete(key_name)

    def delete_all(self):
        return self.conn.flushdb()

    # def hdelete(self, key_name):
    #     return self.conn.hdel(key_name)

    def hmset(self, record, validate_time):
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        # print(record)
        for single_valid_field in valid_fields:
            # print(single_valid_field)
            # print(single_valid_field not in record)
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)

        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
            # 'ctime': record['ctime']
        }

        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmet(self, records, validate_time):
        for single_record in records:
            # print(single_record)
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        '''
        Compute the interval between old_date_time and new_date_time, in seconds
        :param old_date_time:
        :param new_date_time:
        :return:    int
        '''

        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(
                    old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time has an invalid format')

        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(
                    new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time has an invalid format')

        # datetime.datetime.now()+datetime.timedelta(days=1)
        return int((new_date_time - old_date_time).total_seconds())
        # print((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        '''
        Create a bloom filter
        :param fpp: false positive probability
        :param capacity: number of elements the filter can hold
        :param expansion: when the filter fills up, the capacity of the new sub-filter as a multiple of the current one; 1 means the same size
        :return: 0(create fail)/1(create success)
        '''
        try:
            self.conn.bfCreate(key=self._filter_name,
                               errorRate=fpp,
                               capacity=capacity,
                               expansion=expansion)
        except redis.exceptions.ResponseError as e:
            # print(e)    #item exists
            return 0
        return 1

    def bf_madd(self, records):
        # bfMAdd takes each item as a separate argument, so build a list and unpack it
        items = [self.gen_key_name(single_record) for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        item = self.gen_key_name(record)

        self.conn.bfAdd(self._filter_name, item)

    def bf_exists(self, item):
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        '''
        :param items: a list; unpacked with * so bfMExists receives variadic arguments
        :return:
        '''
        return self.conn.bfMExists(self._filter_name, *items)
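
A short usage sketch of the wrapper above (the host, filter name, and sample record are assumptions):

db = Redis(host='localhost')
if db.connect_to_redis():
    db.filter_name = 'proxy_filter'
    db.bf_create(fpp=0.001, capacity=10000)
    record = {'ip': '1.2.3.4', 'port': 8080, 'proxy_type': 'http',
              'protocol': 'https', 'score': 10}
    db.bf_add(record)
    print(db.bf_exists(db.gen_key_name(record)))  # 1 after the add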
Example #18
class Follow(object):
    def __init__(self, config):
        """Follow类初始化"""
        self.rb = Client()
        self.filter_redis_key = 'uidfilter'
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # user_id list of the Weibo users to crawl
        self.user_id = ''
        self.follow_list = []  # uri and nickname of every followed account crawled
        self.fans_list = []  # uri and nickname of every fan account crawled
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """验证配置是否正确"""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list,
                           list)) and (not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list must be a list or the path of a txt file')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'File %s does not exist' % user_id_list)

    def deal_html(self, url):
        """处理html"""
        try:
            html = requests.get(url, cookies=self.cookie, verify=False).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_page_num(self):
        """获取关注列表页数"""
        url = "https://weibo.cn/%s/follow" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_one_page(self, page):
        """获取第page页的user_id"""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/follow?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'Invalid cookie or invalid user_id')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # if {'uri': uri, 'nickname': nickname} not in self.follow_list:
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.follow_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_follow_list(self):
        """获取关注用户主页地址"""
        page_num = self.get_page_num()
        print(u'Number of follow pages: ' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'Follow list progress'):
            self.get_one_page(page)

            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)

        print(u'Finished crawling the follow list')

    def get_fans_page_num(self):
        """获取关注列表页数"""
        url = "https://weibo.cn/%s/fans" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_fans_one_page(self, page):
        """获取第page页的user_id"""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/fans?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'Invalid cookie or invalid user_id')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                #if {'uri': uri, 'nickname': nickname} not in self.fans_list:
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.fans_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_fans_list(self):
        """获取关注用户主页地址"""
        page_num = self.get_fans_page_num()
        print(u'Number of fan pages: ' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'Fans list progress'):
            self.get_fans_one_page(page)

            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)

        print(u'Finished crawling the fans list')

    def write_to_txt(self):
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))
            for user in self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))

    def get_user_list(self, file_name):
        """获取文件中的微博id信息"""
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'File %s must be utf-8 encoded; convert it to utf-8 before running' % file_name)
            user_id_list = []
            for line in lines:
                info = line.split(' ')
                if len(info) > 0 and info[0].isdigit():
                    user_id = info[0]
                    if user_id not in user_id_list:
                        user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """初始化爬虫信息"""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """查看user_id是否已经保存过"""

    def start(self):
        """运行爬虫"""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'Start crawling: ' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # 爬取关注列表
                self.get_fans_list()  # 爬取粉丝列表
            except Exception as e:
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)  # on error, skip this user instead of exiting
            self.write_to_txt()
            print(u'Finished crawling')
            print('*' * 100)
Example #19
import ujson as json
from redis.exceptions import ResponseError
from rediscluster import RedisCluster
from redisbloom.client import Client

import config
rc_list = json.loads(config.config(section='rediscluster')['rediscluster'])
redisbloomclient = Client(host=config.config()['host'],
                          port=config.config()['port'])

rediscluster_client = RedisCluster(startup_nodes=rc_list,
                                   decode_responses=True)

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
n_cpus = os.cpu_count()
print(f'Number of CPUs: {n_cpus}')
executor = ThreadPoolExecutor(max_workers=n_cpus)

from pathlib import Path

datapath = Path('../input')


def parse_json_body_text(json_filename):
    print("Processing ..", json_filename.stem)
    with open(json_filename) as json_data:
        data = json.load(json_data)
        for body_text in data['body_text']:
            para = body_text['text']
            yield para
Example #20
def load_data():

    if environ.get('REDIS_SERVER') is not None:
        redis_server = environ.get('REDIS_SERVER')
    else:
        redis_server = 'localhost'

    if environ.get('REDIS_PORT') is not None:
        redis_port = int(environ.get('REDIS_PORT'))
    else:
        redis_port = 6379

    if environ.get('REDIS_PASSWORD') is not None:
        redis_password = environ.get('REDIS_PASSWORD')
    else:
        redis_password = ''

    rdb = redis.Redis(host=redis_server,
                      port=redis_port,
                      password=redis_password)
    rb = RedisBloom(host=redis_server,
                    port=redis_port,
                    password=redis_password)
    rts = RedisTimeseries(host=redis_server,
                          port=redis_port,
                          password=redis_password)

    with open('./users.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:
                rdb.hset("user:%s" % (row[0].replace(" ", '')),
                         mapping={
                             'Name': row[0],
                             'AgeDemo': row[1],
                             'IncomeDemo': row[2],
                             'Sex': row[3]
                         })
                rdb.lpush("USERLIST", row[0])
            line_count += 1

    with open('./campaigns.csv', encoding='utf-8') as csv_file:
        rts.create('TOTALREVENUE')
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:
                rdb.zadd("campaign:%s" % (row[0].replace(" ", '')),
                         {row[2]: row[1]})
                rb.bfCreate(row[2], 0.01, 1000)
                rb.set("counter:%s" % (row[2].replace(" ", '')), row[3])
                rts.create("ADVIEW:%s" % (row[2].replace(" ", '')))
                rb.sadd("AdStats", row[2])
            line_count += 1

    for gear in ['./adgear.py', './adstats.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())
Example #22
 def setUp(self):
     global rb
     rb = RedisBloom(port=port)
     rb.flushdb()
Example #23
import redis
from redisbloom.client import Client
import os
import sys
from dotenv import load_dotenv

load_dotenv()

redisClient = Client.from_url(os.getenv('REDIS_URL'), decode_responses=True)


def run(consumer, group='cdr_stats_worker', stream='events:cdr'):
    """
    Subscribe to CDR events and write to hashes
    """
    print(f'Starting {group}/{consumer} consumer listen on {stream}')
    try:
        redisClient.xgroup_create(stream, group, id='0', mkstream=True)
    except redis.exceptions.ResponseError as error:
        print(error)
        if not str(error) == 'BUSYGROUP Consumer Group name already exists':
            raise error

    if not redisClient.exists('stats:callers:top50'):
        redisClient.topkReserve('stats:callers:top50', 50, 2000, 7, 0.925)

    while True:
        for offset in ['0', '>']:
            for _, entries in redisClient.xreadgroup(group,
                                                     consumer,
                                                     {stream: offset},
Example #24
def get_item(key, item):
    """判断是否存在"""
    rb = Client(connection_pool=pool)
    return rb.bfExists(key, item)
Example #25
def add_item(key, item):
    """添加值"""
    rb = Client(connection_pool=pool)
    return rb.bfAdd(key, item)
Example #26
"""
基于redis布隆过滤器的误判率的测试
"""
import time
from redisbloom.client import Client
# pip install redisbloom
rb = Client(host='node01', port=6379)


def insert(size, key='book'):
    """插入数据"""
    # 一条条插入速度太慢了
    # for i in range(size):
    #     rb.bfAdd(key, f'book{i}')
    s = time.time()
    step = 1000  # 每次插入1000条数据
    for start in range(0, size, step):
        stop = start + step
        if stop >= size:
            stop = size
        rb.bfMAdd(key, *range(start, stop))
    print('Insert finished... took: {:.4f}s'.format(time.time() - s))


def select(size, key='book'):
    """查询数据"""
    # 统计误判个数
    count = 0

    s = time.time()
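    # The original snippet is truncated here. A sketch of how the measurement
    # might continue (assumption: query `size` ids that were never inserted and
    # count how many the filter wrongly reports as present):
    step = 1000
    for start in range(size, size * 2, step):
        stop = min(start + step, size * 2)
        count += sum(rb.bfMExists(key, *range(start, stop)))
    print('false positives: {} / {} ({:.4%})'.format(count, size, count / size))
    print('query took: {:.4f}s'.format(time.time() - s))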
Example #27
def create_key(key, error, capacity):
    rb = Client(connection_pool=pool)
    rb.bfCreate(key, errorRate=error, capacity=capacity)
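
Examples #24, #25, and #27 all assume a module-level connection pool shared by the helpers; a minimal sketch of that setup (host and port are assumptions):

import redis

pool = redis.ConnectionPool(host='localhost', port=6379)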
Example #28
from flask import Flask, jsonify, send_from_directory

from redisbloom.client import Client

client = Client()

# the Flask app
app = Flask(__name__, instance_relative_config=True)


# this route returns the TopK shapes as JSON
@app.route('/shapes')
def shapes():

    top_shapes = [{
        'shape': shape,
        'count': client.topkCount('ufo_shapes', shape)[0]
    } for shape in client.topkList('ufo_shapes')]

    return jsonify(top_shapes)


# this route returns the TopK words as JSON
@app.route('/words')
def words():

    top_words = [{
        'word': word,
        'count': client.topkCount('ufo_words', word)[0]
    } for word in client.topkList('ufo_words')]

    return jsonify(top_words)
Example #29
class FullLayeredCache(LayeredCache):
    """
    Multi-Layered key value store with bloom filter and dgraph.

    Layer 1: In Memory LRU Key Value Map
    Layer 2: Redis Key Value Store
    Layer 3: Bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is that this
    one includes the bloom filter and DGraph.
    """
    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize last two layers of cache

        :param node_name:
        :param lru_size:
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)

        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True

        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)

        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()

        # Initialize the bloom filter (if it doesn't already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def __contains__(self, key: str) -> bool:
        """
        Check to see if key is in a layer of the cache. We will start at
        layer 1 and walk through each layer until we find a result.
        We will update previous layers if we cache miss.

        We'll return True if the key was found at a layer, False if we
        cache miss.

        :param key:
        :return:
        """

        # Check layer 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True

        # Check the layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
            # Unfortunately, we can't store the actual value in the bloom filter.
            # Because of this, we can't update previous layers with the value for this key.
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return True

        # Cache miss, return False
        return False

    def __getitem__(self, key: str) -> Union[str, bool, None]:
        """
        Check each layer iteratively for the key specified. If we find the result
        at a given layer, we update previous layers with the result.

        If the result was not found, return None.

        :param key:
        :return:
        """
        # Check layer 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item

        # Check layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
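            # A bloom filter hit carries no stored value, so fall back to True.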
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return thing["all"][0]["uid"]

        # Cache miss, return None
        return None

    def close(self):
        """
        Close all outstanding connections

        :return:
        """

        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()

        # Close layer 3 bloom filter connection
        self.bloom.close()

        # Close layer 4 dgraph connections
        self.stub.close()
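
A hypothetical usage sketch of the cache above (the node name, key, and LRU size are assumptions):

cache = FullLayeredCache('domain', lru_size=4096)
if 'example.com' in cache:
    uid = cache['example.com']  # a uid string, True for a bloom-only hit, or None
cache.close()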