Example No. 1
    def __init__(self,
                 max_inmemory_size=1000,
                 sparse_offset=300,
                 segment_size=50,
                 persist_segments=True,
                 path=None,
                 merge_threshold=3):
        """

        :param max_inmemory_size: maximum number of entries to hold in memory.
        :param sparse_offset: frequency of key offsets kept in memory. (Eg: if `sparse_offset=5`, one key offset is kept
         in memory for every 5 entries.)
        :param segment_size: maximum number of entries in a given segment.
        :param persist_segments: if set to false, cleans up segment files in the end. Otherwise, retains the files in disk
        :param merge_threshold: number of segment to keep in intact before merging
        :param path: absolute path to scan into for pre-existing segments, and to  store current segments.
        If none provided,  the default is sst_dir
        """
        self._mem_table = MemTable(max_inmemory_size)
        self.max_inmemory_size = max_inmemory_size
        self._immutable_segments = []
        self._sparse_memory_index = SortedDict()
        self.sparse_offset = sparse_offset
        self._segment_size = segment_size
        self._bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.persist = persist_segments
        self._merge_threshold = merge_threshold
        self._base_path = None
        if path:
            self._base_path = path
            self._scan_path_for_segments(path)
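The constructor above pairs a MemTable with a sparse in-memory index and a scalable Bloom filter. As a rough, hypothetical illustration of the `sparse_offset` idea from the docstring (the helper name and the line-based segment format below are made up, not part of the original class):

from sortedcontainers import SortedDict

def build_sparse_index(sorted_entries, segment_path, sparse_offset=300):
    # Keep one (key -> byte offset) pair per `sparse_offset` records while
    # writing a sorted segment to disk; lookups then seek from the nearest offset.
    index = SortedDict()
    with open(segment_path, "w") as segment:
        for position, (key, value) in enumerate(sorted_entries):
            offset = segment.tell()
            segment.write(f"{key}:{value}\n")
            if position % sparse_offset == 0:
                index[key] = offset
    return index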
Example No. 2
    async def run(self) -> None:
        try:
            with open('/data/bloom-filter', 'rb') as f:
                log('debug', 'Using saved bloom-filter')
                self.filter = ScalableBloomFilter.fromfile(f)
        except FileNotFoundError:
            log('debug', 'Creating new bloom-filter')
            self.filter = ScalableBloomFilter(initial_capacity=100000)

        self.conn_pool = await retry(
            partial(asyncpg.create_pool,
                    host='db',
                    user='******',
                    database='ipfs_crawler'), 'database', gaierror,
            ConnectionRefusedError, asyncpg.CannotConnectNowError)

        # start consumers
        for _ in range(8):
            self.workers.append(asyncio.ensure_future(self.worker()))
        # start producer
        self.producer: Future = asyncio.ensure_future(self.read_logs())
        log('info', 'Started crawling')

        # If an exception is thrown in a background task,
        # the crawler should not ignore it and keep running, but re-raise it.
        await asyncio.gather(self.producer, *self.workers)
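`retry` is defined elsewhere in that project; judging from the call site, it keeps invoking the coroutine factory until none of the listed exception types is raised. A minimal sketch of such a helper, with an assumed signature and delay, reusing the example's `log` function:

import asyncio

async def retry(factory, name, *exceptions, delay=2):
    # Call the coroutine factory until it succeeds, retrying only on the
    # exception types the caller listed (e.g. gaierror, ConnectionRefusedError).
    while True:
        try:
            return await factory()
        except exceptions:
            log('debug', f'{name} not ready yet, retrying in {delay}s')
            await asyncio.sleep(delay)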
Example No. 3
    def __init__(self, **kwargs):
        super(AutohomeValueSpider, self).__init__(**kwargs)
        self.counts = 0
        self.carnum = 800000
        self.name = 'autohome_value'
        self.carid = list()

        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        db = self.connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_READ_COLLECTION']]
        num = int(self.collection.count() * 1.5)
        self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.001)
        # filename = '../blm/' + settings['MONGODB_WRITE_COLLECTION'] + '.blm'
        # filename = settings["BLM_PATH"] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
        # filename = './test.blm'
        # self.fa = open(filename, "a")
        for i in self.collection.find():
            if "familyid" in i.keys():
                item = i["familyid"]
                item = md5(item.encode("utf8")).hexdigest()
                if not self.df.add(item):
                    # self.fa.writelines(i["familyid"] + '\n')
                    self.carid.append(i["familyid"])
        self.connection.close()
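`ScalableBloomFilter.add()` returns True when the element is (probably) already present, so `if not self.df.add(item)` collects each `familyid` at most once. A small standalone illustration of that return value:

from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
print(sbf.add("family-1"))   # False: first time the key is added
print(sbf.add("family-1"))   # True: the key is (probably) already in the filter
print("family-1" in sbf)     # True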
Example No. 4
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        if os.path.exists(one_hop_bloom_file):
            with open(one_hop_bloom_file, 'rb') as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path, 'rb') as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        self.two_hop_bloom_counter = 0
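The two-hop filters are loaded with `fromfile` when their files exist, and `two_hop_bloom_counter` suggests they are periodically written back. A possible save step, assuming the same `spo2True`/`spo2False` naming scheme (this helper is not part of the original class):

def save_two_hop_blooms(kb):
    # Hypothetical persistence step: write each scalable filter back to
    # the file it was (or would have been) loaded from.
    for item, bloom in kb.two_hop_bloom.items():
        file_path = kb.two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        with open(file_path, 'wb') as bloom_file:
            bloom.tofile(bloom_file)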
Example No. 5
    def __init__(self, settings):
        # mysql
        # self.conn = create_engine(
        #     f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8')
        # mongo
        self.connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        self.db = self.connection[settings['MONGODB_DB']]

        # count
        self.mongocounts = 0
        self.dropcounts = 0

        # mongo
        self.collection = self.db[settings['MONGODB_COLLECTION']]

        # bloomfilter
        try:
            num = int((int(settings['CRAWL_NUM']) + self.collection.count()) * 1.5)
        except Exception:
            num = settings['CRAWL_NUM']
        self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.001)

        self.settings = settings
Example No. 6
    def __init__(self, settings, idle_number, crawler):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # db = self.connection[settings['MONGODB_DB']]
        # self.collection = db[settings['MONGODB_COLLECTION']]
        # # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        self.add_num = 0
        self.drop_num = 0

        # crawl timestamps
        self.start_date = time.strftime('%Y-%m-%d %X', time.localtime())
        self.end_date = time.strftime('%Y-%m-%d %X', time.localtime())
        self.scrapy_date = None

        # redis signal
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']

        self.df_result = pd.DataFrame()

        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
        # # read
        # make sure the bloom file and its directory exist, then open it for appending
        os.makedirs(dirname, exist_ok=True)
        if not os.path.exists(filename):
            pathlib.Path(filename).touch()
        self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
Example No. 7
    def __init__(self, root_urls, capacity=0,
                 black_patterns=(CONFIG_URL_FILTER_PATTERN,)):
        """
        constructor, use variable of BloomFilter if capacity else variable of set
        """
        self._re_black_list = [re.compile(pattern, flags=re.IGNORECASE) \
                                  for pattern in black_patterns] \
                              if black_patterns else []

        # original white patterns = (r"^https?://",)
        # self._re_white_list = [re.compile(pattern, flags=re.IGNORECASE) \
        #                           for pattern in white_patterns] \
        #                       if white_patterns else []

        self._re_white_list = []
        prefix = r"^https?://(www\.)?"

        # add the domain of each root URL to the white list
        for url in root_urls:
            # remove http and www prefix first
            postfix = re.sub(prefix, '', url)
            # allow URLs in form of api-west1.amazon.com
            pattern = prefix + r"([\w\-]+\.)*" + postfix
            p = re.compile(pattern, flags=re.IGNORECASE)
            self._re_white_list.append(p)

        # the bloom filter shares the same interface as set()
        if capacity:
            from pybloom_live import ScalableBloomFilter
            self._url_set = ScalableBloomFilter(capacity, error_rate=0.001)
        else:
            self._url_set = set()
        return
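Because `ScalableBloomFilter` and `set` both support `in` and `add()`, the rest of the crawler can stay agnostic about which backend `_url_set` is. An illustrative check-and-record helper (not from the original) that works with either:

def seen_before(url_set, url):
    # Membership test plus insert; url_set may be a set() or a ScalableBloomFilter.
    if url in url_set:
        return True
    url_set.add(url)
    return False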
Example No. 8
class Bloomfilter(object):

    logger = None

    def __init__(self, spidername="", *args, **kwargs):
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH, error_rate=1e-6)
        self.setlogger(spidername)

    def setlogger(self, spidername=""):
        # re-create the logger so it does not point at a logger that no longer exists after reloading
        self.logger = logging.getLogger(spidername + ".bloomfilter")
    
    def clearlogger(self):
        self.logger = None

    def md5(self, url):
        md5 = hashlib.md5()
        md5.update(url.encode("utf-8"))
        return md5.hexdigest()
    
    def check(self, url):
        url = self.md5(url)
        if url in self.sbf:
            return True
        else:
            return False

    def add_in_sbf(self, url):
        try:
            self.sbf.add(self.md5(url))
        except Exception as e:
            self.logger("[%s] bloomfilter exception<<<<<<<<< [%s]" % (url, str(e)))
Example No. 9
    def __init__(self,keyList_queue,writer,contain):
        super(Spider_related, self).__init__()
        self.keyList_queue = keyList_queue
        self.writer = writer
        self.contain = contain

        # scalable Bloom filter that grows automatically
        self.bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
Example No. 10
 def create_from_file(self, filename):
     '''
     Build a filter from a file; likewise, it is not persisted.
     '''
     what = ScalableBloomFilter(100000000, 0.001)
     t = temp_stupid(filename)
     for i in t.read():
         what.add(i)
     return what
Example No. 11
    def __init__(self, settings, idle_number, crawler):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # db = pymysql.connect(settings["MYSQL_SERVER"], settings["MYSQL_USER"], settings["MYSQL_PWD"], settings["MYSQL_DB"], charset='utf8', port=3306)
        # db = create_engine(f'mysql+pymysql://{"baogang"}:{"Baogang@2019"}@{"192.168.2.120"}:{"3306"}/{"baogang"}?charset=utf8')
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        self.db = self.connection[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]
        # self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
        # count
        self.mongocounts = 0
        self.dropcounts = 0

        # mongo temporary collection
        self.collection_tmp = self.db[settings['MONGODB_COLLECTION'] + "_tmp"]

        # pandas
        self.df_end = pd.DataFrame()

        # redis signal
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

        self.settings = settings
        self.CrawlCar_Num = 1000000
        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
        # pybloom
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)

        # make sure the bloom file and its directory exist, then open it for appending
        os.makedirs(dirname, exist_ok=True)
        if not os.path.exists(filename):
            pathlib.Path(filename).touch()
        self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
Example No. 12
 def __init__(self, name=None):
     self.timers = TimerRegistry(callback=self._trigger_frame_handler)
     super().__init__(name=name)
     self.states.should_stop = False
     self.states.running = False
     self.loop = None  # asyncio.get_event_loop()
     self._spawn_on_start = set()
     self._seen_frames = ScalableBloomFilter(
                 mode=ScalableBloomFilter.LARGE_SET_GROWTH, error_rate=0.001)
Example No. 13
def generate_bloom(conf, capacity, cursor):
    b = ScalableBloomFilter(initial_capacity=capacity, error_rate=conf.bloom_error_rate)
    while True:
        row = cursor.fetchone()
        if not row:
            break
        if row[0]:
            b.add(row[0].rstrip())
    return b
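`generate_bloom` only needs a DB-API cursor with `fetchone()` and a `conf` object exposing `bloom_error_rate`. A self-contained exercise of it using sqlite3 and a stand-in config (both are assumptions for illustration):

import sqlite3
from types import SimpleNamespace

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urls (url TEXT)")
conn.executemany("INSERT INTO urls VALUES (?)", [("https://a.example ",), ("https://b.example",)])
cursor = conn.execute("SELECT url FROM urls")

conf = SimpleNamespace(bloom_error_rate=0.001)
bloom = generate_bloom(conf, capacity=100, cursor=cursor)
print("https://a.example" in bloom)   # True: trailing whitespace was stripped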
Example No. 14
 def __init__(self, site_name):
     self.client = MongoClient('localhost', 27017)
     self.db = self.client.crawlSpider
     self.col_url = self.db[site_name + "_url"]
     self.col_content = self.db[site_name + "_content"]
     self.sbf = ScalableBloomFilter(initial_capacity=100)
     for item in self.col_url.find():
         self.sbf.add(item["url"])
     self.insert_url = []
     self.insert_content = []
Example No. 15
class MyBloom:
    def __init__(self):
        self.sbf = ScalableBloomFilter(initial_capacity=100)

    def isExist(self, title):
        if title in self.sbf:
            return False
        else:
            self.sbf.add(title)
            return True
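Note that, despite its name, `isExist` returns True only the first time a title is seen (and records it); later calls return False. A quick usage sketch:

mb = MyBloom()
print(mb.isExist("breaking headline"))   # True: first sighting, title gets recorded
print(mb.isExist("breaking headline"))   # False: already in the filter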
Example No. 16
 def __init__(self):
     redis_db = redis.Redis(host='127.0.0.1',
                            port=6379,
                            db=0,
                            decode_responses=True)
     result = redis_db.smembers('spider:url')
     self.sbf = ScalableBloomFilter(
         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     for item in result:
         self.sbf.add(item)
Example No. 17
 def __init__(self):
     # mail
     self.mailer = MailSender.from_settings(settings)
     # mongo
     self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                           settings['MONGODB_PORT'])
     db = self.connection[settings['MONGODB_DB']]
     self.collection = db[settings['MONGODB_COLLECTION']]
     self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
     self.collectionwrong = db[settings['MONGODB_COLLECTION'] +
                               "_wrongurllog"]
     #bloom file
     # filename= settings['BLM_PATH'] + settings['MONGODB_DB']+'/'+settings['MONGODB_COLLECTION']+'.blm'
     filename = 'blm/' + settings['MONGODB_DB'] + '/' + settings[
         'MONGODB_COLLECTION'] + '.blm'
     #pybloom
     num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.1
     self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.01)
     #read
     isexists = os.path.exists(filename)
     self.fa = open(filename, "a")
     if isexists:
         fr = open(filename, "r")
         lines = fr.readlines()
         for line in lines:
             line = line.strip('\r\n')
             self.df.add(line)
         fr.close()
     else:
         for i in self.collection.find():
             if "status" in i.keys():
                 item = i["status"]
                 item = md5(item.encode("utf8")).hexdigest()  # encode for hashlib on Python 3
                 self.df.add(item)
                 self.fa.writelines(item + '\r\n')
     #count
     self.mongocounts = 0
     self.sqlcounts = 0
     #mysql
     self.mysqlconnection = MySQLdb.connect(settings['MYSQLDB_SERVER'],
                                            settings['MYSQLDB_USER'],
                                            settings['MYSQLDB_PASS'],
                                            settings['MYSQLDB_DB'],
                                            port=settings['MYSQLDB_PORT'])
     self.dbc = self.mysqlconnection.cursor()
     self.mysqlconnection.set_character_set('utf8')
     self.dbc.execute('SET NAMES utf8;')
     self.dbc.execute('SET CHARACTER SET utf8;')
     self.dbc.execute('SET character_set_connection=utf8;')
     # self.table = settings['MONGODB_COLLECTION']+ '_' +time.strftime("%Y%W")
     self.table = settings['MONGODB_COLLECTION'] + '_online'
     self.items = []
     self.caritemlist = car_parse.Parse_conf(settings['MONGODB_COLLECTION'])
Example No. 18
def get_updated():
    db = pymysql.connect(host="192.168.2.97", user="******", password='******', database="spider", port=3306)
    cursor = db.cursor()
    sql = "select title_url from spider_high_talent"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 19
    def __init__(self, settings):

        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        self.db = self.connection[settings['MONGODB_DB']]
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        db = self.connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]

        # date
        self.start_date = None
        self.end_date = None
        self.scrapy_date = f'{self.start_date}  -   {self.end_date}'

        # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
        # pybloom
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)

        # make sure the bloom file and its directory exist, then open it for appending
        os.makedirs(dirname, exist_ok=True)
        if not os.path.exists(filename):
            pathlib.Path(filename).touch()
        self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
Example No. 20
def get_updated():
    db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306)
    cursor = db.cursor()
    sql = "select product_license_num from spider_sh_company_medical_equipment_entrust_produce"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 21
class BloomDupeFilter():
    def __init__(self):
        self.fingerprints = ScalableBloomFilter(
            initial_capacity=2000000,
            error_rate=0.00001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, url):
        if url in self.fingerprints:
            return True
        else:
            self.fingerprints.add(url)
            return False
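As written, `request_seen` takes a plain URL string, while Scrapy's dupefilter interface passes a `Request` object and expects the `open`/`close`/`log` methods of `BaseDupeFilter`. A hedged sketch of one way to adapt and enable it (the module path in the settings line is an assumption):

from scrapy.dupefilters import BaseDupeFilter

class BloomRequestDupeFilter(BaseDupeFilter, BloomDupeFilter):
    def __init__(self):
        BloomDupeFilter.__init__(self)

    def request_seen(self, request):
        # Scrapy hands over a Request object; deduplicate on its URL.
        return BloomDupeFilter.request_seen(self, request.url)

# settings.py
# DUPEFILTER_CLASS = "myproject.dupefilters.BloomRequestDupeFilter"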
Example No. 22
def get_updated(sum):
    sql = "select * from tyc_source_data"
    cursor.execute(sql)
    data = cursor.fetchall()
    if len(data)==0:
        return sum
    data = [i[2] for i in data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=100,error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    sum = [i for i in sum if i['园区id'] not in bloom]
    return sum
Example No. 23
class PageDupeFilter(BaseDupeFilter):
    """
    PageDupeFilter

    The filter uses a bloom filter inside.
    It load processed page url from postgresql
    """

    def __init__(self):
        self.__pg_client = get_database()
        self.__filter = ScalableBloomFilter(initial_capacity=2 * 10e5)

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request: scrapy.Request) -> bool:
        host, path = divide_url(request.url)
        # Fingerprint = sha1(host + path)
        fp_s = (host + path).encode()
        fp = hashlib.sha1(fp_s).hexdigest()
        if fp in self.__filter:
            return True

        self.__filter.add(fp)
        return False

    def open(self):
        size = 100
        sql = 'SELECT DISTINCT host || path FROM public.pages WHERE publish_date IS NOT NULL'
        with self.__pg_client.cursor() as cursor:
            cursor.execute(sql)
            b_continue = True

            while b_continue:
                result = cursor.fetchmany(size)
                for r in result:
                    fp_s = (r[0]).encode()
                    fp = hashlib.sha1(fp_s).hexdigest()
                    self.__filter.add(fp)
                if len(result) < size:
                    b_continue = False

        self.__pg_client.close()
        self.__pg_client = None

    def close(self, reason):
        pass

    def log(self, request, spider):
        pass
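`get_database` and `divide_url` are project helpers not shown here. To put the filter to work, Scrapy would be pointed at it via the `DUPEFILTER_CLASS` setting; the module path below is only a guess:

# settings.py
# DUPEFILTER_CLASS = "crawler.dupefilters.PageDupeFilter"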
Example No. 24
    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # self.db = self.connection[settings['MONGODB_DB']]

        # count
        self.mongocounts = 0
        self.dropcounts = 0

        # mongo
        # self.collection = self.db[settings['MONGODB_COLLECTION']]
        # print(settings['MONGODB_COLLECTION'])
        # print("*"*100)
        # bloomfilter
        # num = (int(settings['CRAWL_NUM']) + self.collection.count()) * 1.5
        self.settings = settings

        self.CrawlCar_Num = 1000000
        self.settings = settings
        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
        # pybloom
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
        # # read
        # make sure the bloom file and its directory exist, then open it for appending
        os.makedirs(dirname, exist_ok=True)
        if not os.path.exists(filename):
            pathlib.Path(filename).touch()
        self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
Example No. 25
def get_updated():
    db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306)
    cursor = db.cursor()
    sql = "select record_id from spider_sh_ralated_GMP_license"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    # print(db_data)
    # exit()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000,error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 26
 def fiktergenerator(self, mode):
     '''
     Build a filter from one column of the database.
     '''
     if 'token' in mode:
         mode = 'TOKEN'
     if 'id' in mode:
         mode = 'ID'
     cc = self.c.execute('SELECT ' + mode + ' FROM id2token')
     r = cc.fetchall()
     bloom = ScalableBloomFilter(100000000, 0.001)
     for i in r:
         bloom.add(i[0])
     return bloom
Example No. 27
def updated():
    db = pymysql.connect(host="192.168.2.97",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select url from spider_2_company_revoke"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0].strip() for i in db_data]
    bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    for i in data:
        bloom.add(i)
    return bloom
Example No. 28
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
        ignore_filter_dir='../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            # just make sure the ignore file exists; URLs are appended to it later
            try:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
            except IOError:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
        else:
            if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir='../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
Example No. 29
  patterns.load('./pos-patterns')

  # Load list of stopwords
  print(colored('Loading stopwords...','cyan'))
  stopwords = []
  with open('./pos-stopwords') as f:
    stopwords = list(f.readlines())

  # Initialise a crawling dataset connection
  print(colored('Initialising wikipedia crawling collection...','cyan'))
  crawl_collection = init_crawl_collection()

  # Iterate through the crawling database
  n = 0
  print(colored('Iterating over crawling database...','cyan'))
  bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
  for topic,sentence in iter_topic(crawl_collection,args['start']):
    
    # Clean topic string
    topic = topic.replace("'",'').replace('\n','')

    # Check whether the number of processed topics exceeds the limit
    if topic not in bf:
      bf.add(topic)
      if len(bf) > args['limit']:
        print(colored('[Topics limit reached] ... BYE','cyan'))
        sys.exit(0)

    # Break the sentence into knowledge nodes
    pos      = TextStructure.pos_tag(sentence)
    kb_nodes = patterns.capture(pos)  
Example No. 30
    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError


                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        	continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()


                    return article


            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e
Example No. 31
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
        ignore_filter_dir='../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            # just make sure the ignore file exists; URLs are appended to it later
            try:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
            except IOError:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
        else:
            if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir='../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError


                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        	continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()


                    return article


            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or
                (not filt.regex and filt.pattern in url)):
                return True
        return False
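`url_in_filter` only relies on each filter exposing `regex` and `pattern` attributes (the real objects come from `site.referringsitefilter_set.all()`). A quick check with stand-in filter objects, where `crawler` stands for a Crawler instance:

from collections import namedtuple

# Stand-in for the referring-site filter objects; only .regex and .pattern are read.
Filter = namedtuple("Filter", ["regex", "pattern"])
filters = [Filter(regex=True, pattern=r"/tag/\d+"),
           Filter(regex=False, pattern="?print=1")]

# crawler.url_in_filter("https://example.com/tag/42", filters)        -> True  (regex match)
# crawler.url_in_filter("https://example.com/story?print=1", filters) -> True  (substring match)
# crawler.url_in_filter("https://example.com/story", filters)         -> False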