Example #1
 def __init__(self, path=FILTER_PATH, debug=False):
     if os.path.exists(FILTER_PATH):
         self.url_filter = BloomFilter.open(FILTER_PATH)
     else:
         print "created a new bloom filter. "
         self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH)
     super(DuplicateFilter, self).__init__(path, debug)
Example #2
class LinkFilter():
    
    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)

        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)
    
    def index_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf_index.add(link.url):
                new_links.append(link)
        return new_links

    def html_filter(self, links):
        new_links = []
        for link in links:
            #log.msg('This is a link : %s' % link, level=log.WARNING)
            if not self.bf_html.add(link.url):
                new_links.append(link)
        return new_links
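A minimal usage sketch for the LinkFilter class above, assuming link objects that expose a .url attribute (as Scrapy's Link does); the domain and URLs below are placeholders, not values from the source.

from collections import namedtuple

Link = namedtuple('Link', ['url'])

lf = LinkFilter('example.com')
links = [Link('http://example.com/a'), Link('http://example.com/b'), Link('http://example.com/a')]

# index_filter adds every url to example.com_index.bf and keeps only links whose
# url was not already present; add() returns True for items already in the filter.
fresh = lf.index_filter(links)
print([l.url for l in fresh])  # the repeated /a link is dropped, /a and /b remain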
Example #3
def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """From a given FASTA reference sequence creates a bloom filter file
    from each read.
    """

    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)

    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        it = file_it(handle)
        read_it = record(it)
        read_len = 109
        read_in = []
        read = []
        buffer = []
        
        bf = BloomFilter(capacity, error_rate, bf_file)
        sequence = read_it.next()

        step = read_len
        
        i = 0
        while i < len(sequence):
            read = sequence[i:i + read_len - 1]
            i += step
            print(read)
            bf.add(read)  # add the whole read; update() would insert each character of the string separately
                
        bf.close()
Example #4
def main():
   #Check for command line arguments
   if len(sys.argv) != 2:
      print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0])
      sys.exit(1)

   #Read arguments from command line
   inFile = sys.argv[1]


   bf1 = BloomFilter(100000000, 0.001, 'bf1')   
   bf2 = BloomFilter(100000000, 0.001, 'bf2')
     
   outputFileName="converted-"+sys.argv[1]
   f = open(outputFileName, "a")



   for line in open(inFile,'r'):
      if (line[0:2]=="W," or line[0:2]=="R,"):
         hash1=int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10)
         hash2=int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10)
         if (bf1.add(hash1) and bf2.add(hash2)):
            f.write('%s,%d\n' % (line[0],hash1*10000) )
         else:
            f.write('%s,%d\n' % (line[0],hash2*10000) )
      elif(line==''):
         break
      else:
         pass
   f.close()
Example #5
	def __init__(self, seeds, done_que, run_que):

		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0


		self.run_que = run_que
		self.done_que = done_que
		self.tasks = []
		self.done = 1

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
	'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' ))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 60
		self.poolmaxfree = 20
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0

		self.debugnosave = 1
		self.tt = 1

		self.done_sites_fname='done_sites.bin'
		try:
			self.bfdone = BloomFilter.open(self.done_sites_fname)
		except:
			self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M 

		if self.run_que.qsize() == 0:
			for seed in seeds:
				self.run_que.put( seed.split("http://")[1] )

		if self.https_enable == 0:
			self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
		else:
			self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)
Example #6
    def __init__(self):
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a bloom filter; read dump.bloom from the file each time
        if os.path.isfile("new_filter.bloom"):
            self.bf = BloomFilter.open("new_filter.bloom")
        else:
            self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")
Example #7
def dedup(fname):
    bf = BloomFilter(1E8, 0.01)
    
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                splitLine = line.split('\t')
                description = splitLine[5]
                if bf.add(md5.new(description).digest()):
                    continue
                else:
                    fout.write(line)
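The dedup routine above relies on the add() return convention that several later examples also note in comments: BloomFilter.add() returns True when the element was (probably) already in the filter and False when it was newly inserted. A small self-contained sketch of that convention; capacity, error rate and path are arbitrary.

from pybloomfilter import BloomFilter

bf = BloomFilter(1000, 0.01, '/tmp/demo.bloom')  # capacity, error rate, backing file
print(bf.add('alpha'))   # False: 'alpha' was not present, it has now been added
print(bf.add('alpha'))   # True: 'alpha' is (probably) already in the filter
print('alpha' in bf)     # membership test via __contains__, does not modify the filter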
Example #8
  def __init__(self, node_n, seen_persist, Q_logs=None):
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0

    # single variable for tracking whether node should be active or not
    self.active = True
    
    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()

    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}
    
    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
      try:
        self.seen = BloomFilter.open(BF_FILENAME)
      except:
        self.Q_logs.put('Error opening bloom filter, creating new one')
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
      self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}

    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()

    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()

    # active url count queue- for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()

    # thread active url dict- a dict of active urls by thread using, for restart dump
    # { thread_name: active_url }
    # NOTE: there are problems with this methodology, but errors will only lead
    # to data redundancy (as opposed to omission)...
    self.thread_active = {}
    
    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
Example #9
def create_bf():
	bf = BloomFilter(count, error_rate, 'filter_base.bloom')
	keyDigest_list = []
	FILE = open(keyDigestFile, 'r')
	
	for i in range(count):
		keyDigest = FILE.read(keyDigestLen)
		keyDigest_list.append(keyDigest)
		
	FILE.close()
	
	for publicKeyID in keyDigest_list:
		bf.add(publicKeyID)
Example #10
    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)

        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)
Example #11
    def __init__(self, start_url, basic_url):
        self.basic_url = basic_url
        self.start_url = start_url
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a bloom filter; read dump.bloom from the file each time
        if os.path.isfile('filter.bloom'):
            self.bf = BloomFilter.open('filter.bloom')
        else:
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
Example #12
class URLBloomFilter:
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self, host = 'localhost', user = '******', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename = './url.filter'):
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        self.sql = m_sql

    def add(self, url):
        if not self.urlbf.add(url):
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
Example #13
class DuplicatesPipeline(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' %len(self.bf)
        if self.bf.add(item['url']):#True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            #print '%d pages visited!'% len(self.url_seen)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self,url,utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
Example #14
class DuplicatedFlowFilter(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def add(self, flow):
        """
        :param flow: the flow dict received from Proxy.
        :return: if the flow already in the filter.
        """
        f = (flow[METHOD], flow[URL])
        return self.bf.add(f)

    def __contains__(self, flow):
        f = (flow[METHOD], flow[URL])
        return self.bf.__contains__(f)
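A hedged usage sketch for DuplicatedFlowFilter, assuming METHOD and URL are dictionary keys defined elsewhere in that project; the values below are hypothetical.

METHOD, URL = 'method', 'url'  # assumed key names, not taken from the source

dedup = DuplicatedFlowFilter()
flow = {METHOD: 'GET', URL: 'http://example.com/api'}
print(dedup.add(flow))  # False the first time this (method, url) pair is seen, True afterwards
print(flow in dedup)    # __contains__ checks membership without adding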
Example #15
class MongoDBPipeline(object):

    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()
        
    def process_item(self, item, spider):
        if self.bf.add(item['link']):#True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            for data in item:
                if not data:
                    raise DropItem("Missing data!")
            self.collection.update({'link': item['link']}, dict(item), upsert=True)
            log.msg("Question added to MongoDB database!",level=log.DEBUG, spider=spider)
            self.si.AddIndex(item)
            return item
        
    def __del__(self):
        self.si.IndexDone()
Example #16
 def __init__(self):
     bc = config.get_boolmfilter_config()
     if os.path.exists(bc['bin_path']):
         self.bloomfilter = BloomFilter.open(bc['bin_path'])
     else:
         self.bloomfilter = BloomFilter(
             bc['capacity'], bc['wrong_rate'], bc['bin_path'])
Example #17
def vote(request, poll):
    try:
        choice_name = request.POST['choice']
        selected_choice = poll.choice_set.get(choice=choice_name)
    except (KeyError, Choice.DoesNotExist):
        return render_to_response('detail.html', {'poll':poll, 'error_message':"You didn't select a choice."},
                context_instance= RequestContext(request))

    if not (poll.has_expired() or already_voted(request, poll)):
        hash = request_hash(request)
        poll.total_votes += 1
        selected_choice.votes += 1
        poll.vote_set.create(hash=hash)
        selected_choice.save()

        #Update the seen ips
        from pybloomfilter import BloomFilter
        bf = BloomFilter.from_base64('/tmp/bloom.filter', poll.ips_seen)
        alreadyseen = bf.add(request.META['REMOTE_ADDR'])

        if not alreadyseen:
            poll.ips_seen = bf.to_base64()
            poll.ips_count += 1

        poll.save()

    return None
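Example #17 persists the filter state in the database as base64 text. A minimal round trip of the two calls it uses, to_base64() and BloomFilter.from_base64(); the file paths and address below are placeholders.

from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.001, '/tmp/ips.bloom')
bf.add('203.0.113.7')

encoded = bf.to_base64()  # the filter serialized as a base64 string
restored = BloomFilter.from_base64('/tmp/ips_copy.bloom', encoded)  # rebuilt into a new backing file
print('203.0.113.7' in restored)  # True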
Example #18
def count_matches(fastq_file, bf_files, sampling):
    """Goes through a fastq file and checks a sample of reads if they
    occur in the specified bloom filter.
    """
    if isinstance(bf_files, basestring):
        bf_files = [bf_files]

    bf = {}
    observed = {}
    for bf_file in bf_files:
        bf[bf_file] = BloomFilter.open(bf_file)
        observed[bf_file] = 0

    fastq_handle = open(fastq_file)
    fastq_it = FastqGeneralIterator(fastq_handle)
    checked = 0
    sampling = int(sampling)
   # import ipdb
   # ipdb.set_trace()
    for i, (_, read, _) in enumerate(fastq_it):
        if (i + 1) % sampling:  # only check every sampling-th read
            continue

        print read

        checked += 1
        for bf_file in bf_files:
            if read in bf[bf_file]:
                observed[bf_file] += 1

    fastq_handle.close()

    return checked, observed
Example #19
 def __init__(self, settings, debug = False):
     self.capacity = settings.getint("DUPEFILTER_CAPACITY")
     self.filename = settings.get("DUPEFILTER_FILENAME")
     self.debug = debug
     self.error_rate = 0.01
     self.logger = logging.getLogger(__name__)
     self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
Example #20
 def __init__(self, roots,
              exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, *, loop=None):
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = BloomFilter(10000000, 0.01)
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         parts = urllib.parse.urlparse(root)
         host, port = urllib.parse.splitport(parts.netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             self.root_domains.add(host)
         else:
             host = host.lower()
             if self.strict:
                 self.root_domains.add(host)
             else:
                 self.root_domains.add(lenient_host(host))
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
Example #21
    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        self.db = MySQLdb.connect("localhost","root","","storecount")
        self.cursor = self.db.cursor()
        self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
        sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);"""
    
        try:
            self.cursor.execute(sql1)
            self.db.commit()
#             print "cao create"
        except:
            traceback.print_exc()
            self.db.rollback()
#         self.dbpool = adbapi.ConnectionPool('MySQLdb',
#                                             host = '127.0.0.1',
#                                             db = 'storecount',
#                                             user = '******',
#                                             passwd = '',
#                                             cursorclass = MySQLdb.cursors.DictCursor,
#                                             charset = 'utf8',
#                                             use_unicode = True)
        self.mark = 0
Example #22
class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug = False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True  # referenced in log(); without it the elif branch raises AttributeError
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
    
    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)
    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        else:
            self.insert(fp)

    ###-------todo-------##
    def request_fingerprint(self, request):
        return request_fingerprint(request)
    
    def check(self, request):

        ret = request in self.bloom_filter_
        return ret
    
    def insert(self, request):
        self.bloom_filter_.add(request)
        #print len(self.bloom_filter_)
        #print self.bloom_filter_.hash_seeds
        #print self.bloom_filter_.num_bits
        #print self.bloom_filter_.num_hashes
    
    def reset(self):
        self.bloom_filter_.clear_all()
    
    def save(self):
        pass
    def load(self):
        self.bloom_filter_.sync()
        self.bloom_filter_.open("bloom.dump") 
        pass
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
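URIBloomFilter reads its capacity and filename from the crawler settings in from_settings(). A hedged sketch of the corresponding settings.py entries: the module path and values are illustrative; DUPEFILTER_CLASS and DUPEFILTER_DEBUG are standard Scrapy settings, while the other two are the custom keys this example reads.

# settings.py (illustrative values)
DUPEFILTER_CLASS = 'myproject.dupefilters.URIBloomFilter'  # hypothetical module path
DUPEFILTER_CAPACITY = 10000000        # read via settings.getint("DUPEFILTER_CAPACITY")
DUPEFILTER_FILENAME = 'filter.bloom'  # read via settings.get("DUPEFILTER_FILENAME")
DUPEFILTER_DEBUG = False              # read via settings.getbool('DUPEFILTER_DEBUG')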
Example #23
def _process_one(data_file):
    ''' Process one output file to generate a bloom filter'''
    path, dump_name = os.path.split(data_file)
    _, parent_dir = os.path.split(path)

    # ensure the containing folder exists
    bf_dir_path = os.path.join('bloom_filters', parent_dir)
    if not os.path.isdir(bf_dir_path):
        os.mkdir(bf_dir_path)
    bf_file_path = os.path.join(bf_dir_path, dump_name)

    if not os.path.isfile(bf_file_path):
        ncpu, _, nparts, _, _, _, ids = read_output(data_file, header_only=False)
        bf = BloomFilter(nparts, 1./ncpu, bf_file_path)
        bf.update(ids)

    return bf_file_path
Example #24
class SpamCheck(object):
	def __init__(self):
		# Set up the logging
		self.ilog = logging.getLogger('prog')
		self.ilog.setLevel(logging.INFO)
		self.console = logging.StreamHandler(sys.stderr)
		self.console.setLevel(logging.INFO)
		self.console.setFormatter(logging.Formatter('%(message)s'))
		self.ilog.addHandler(self.console)

		# Try loading the filter
		try:
			self.__loadFilter__()
			self.ilog.debug("loading filter..")

		# Create the filter if not present
		except:
			self.ilog.debug("Exception in loading ....")
			self.__create__()
			self.ilog.debug("Creating the file ... ")

	def __loadFilter__(self):
		self.bf = BloomFilter.open('filter.bloom')

	def __create__(self):
		self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
		# Initialize it the first time; hacky but ok
		self.spam("000")
		# Generate the filter from a file
		with open("bad_numbers.txt") as f:
			for nums in f:
				self.bf.add(nums.rstrip())
				self.ilog.debug(".")

	def spam(self, bad_entity):
		with open("bad_numbers.txt", "a+") as f:
			f.write(bad_entity)
			f.write("\n")
			self.ilog.info("Added bad entry to file")
		self.bf.add(bad_entity)

	def isSpam(self, entity):
		return entity in self.bf
Example #25
	def __create__(self):
		self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
		# Initialize it the first time; hacky but ok
		self.spam("000")
		# Generate the filter from a file
		with open("bad_numbers.txt") as f:
			for nums in f:
				self.bf.add(nums.rstrip())
				self.ilog.debug(".")
Example #26
	def __init__(self,seeds):

		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0


		self.run_queue = Queue()
		self.tasks = []
		self.done = 0

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		
		#self.whitelist = ['html','htm','php','shtml','asp','jsp','do','action','aspx']
		self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
	'weibo.com','t.cn','worldpress.com','blogspot.com','youtube','wikipedia','facebook','twitter','dropbox' ))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 100
		self.poolmaxfree = 40
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.mutex = gevent.coros.RLock()

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0

		self.debugnosave = 1
		
		try:
			self.bfdone = BloomFilter.open('done_sites.bin')
		except:
			self.bfdone = BloomFilter(2**23, 10**(-5), 'done_sites.bin')

		if self.run_queue.qsize() == 0:
			for seed in seeds:
				self.run_queue.put( seed.split("http://")[1] )

		if self.https_enable == 0:
			self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)')
		else:
			self.urlpatern = re.compile('href=[\"\']http[s]?://([^/?#\"\'"]+)')
Example #27
def createBloomFilter(contentFile, filterFilename):
    bf = BloomFilter(10000000, 0.9999999, filterFilename)
    total = 0
    count = 0
    failed = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()

            if bf.add(d):
                count += 1
                print(d)
            else:
                failed += 1

    print "Total ", total
    print "Added ", count
    print "Conflicted", failed
Example #28
 def __init__(self):
     connection = pymongo.MongoClient(
         settings['MONGODB_SERVER'],
         settings['MONGODB_PORT']
     )
     db = connection[settings['MONGODB_DB']]
     self.collection = db[settings['MONGODB_COLLECTION']]
     self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
     self.si = SearchIndex()
     self.si.SearchInit()
Example #29
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""
 
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')
 
    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))
 
    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
 
    def close(self, reason):
        self.fingerprints = None
Example #30
def cpu_containing(particles, bloom_filters, yieldAll=True):
    ''' Iterate over all bloom filter and yield the one containing the particle'''
    for cpu in tqdm(range(len(bloom_filters))):
        bf = BloomFilter.open(bloom_filters[cpu])
        yieldCPU = False
        cpu_contains = []
        for p in particles:
            if p in bf:
                yieldCPU = True
                cpu_contains.append(p)
        if yieldAll or yieldCPU:
            yield cpu+1, cpu_contains
Example #31
 def _get_bloom_filter(self) -> BloomFilter or None:
     """
     Retrieve the bloom filter from storage server
     :return: Bloom filter
     """
     resp = self.get(f"{self.STORAGESERVER}/bloom")
     suc = resp.json()['success']
     if suc:
         log.debug("Successfully retrieved bloom filter.")
         tmp = helpers.get_temp_file() + '.bloom'
         b = BloomFilter.from_base64(tmp, resp.json()['bloom'].encode())
         atexit.register(shutil.rmtree, tmp, True)  # Remove and ignore
         # errors
         return b
     else:
         msg = resp.json()['msg']
         raise RuntimeError(f"Failed to retrieve bloom filter: {msg}")
Example #32
	def __init__(self,*args,**kwargs):
			super(CollectorSpider,self).__init__(*args,**kwargs)
			# Use a list to hold every k,v pair from the json config; it becomes a list of tuples that we iterate over
			#scrapy.log.start("./log.txt",loglevel=INFO,logstdout=True)
			self.log = open("/home/hong/文档/sina_working/2to3_test/log.txt",'a')
			print("\n\n__________________________分割线___________________________________", file=self.log)
			print("At Time %s : 爬虫启动............"%time.ctime(), file=self.log)
			self.now = time.time()
			self.one_month_ago = datetime.datetime(time.localtime(self.now).tm_year,time.localtime(self.now).tm_mon-1,time.localtime(self.now).tm_mday)
			self.config = []
			self.Index_Url = ""
			self.flag = 0
			# bf must be initialized here, otherwise the first pass of the loop below raises an error
			self.bf = ""
			self.isexists=os.path.exists("/home/hong/文档/sina_working/2to3_test/filter.bloom")
			if self.isexists:
					print("存在filter.bloom,打开!!!!",file=self.log)
					self.bf = BloomFilter.open("/home/hong/文档/sina_working/2to3_test/filter.bloom")
Example #33
    def bloom(self):
        """
        Return bloom filter containing the record hashes (as base64 encoding).
        Needs to be a property to avoid concurrency problems with multiple
        threads. Initialize with database contents if no bloom filter exists.

        :return: Bloom Filter
        """
        if self._bloom is None:
            # Initialize
            bloom_file = self.data_dir + config.BLOOM_FILE
            if os.path.isfile(bloom_file):
                self._bloom = BloomFilter.open(filename=bloom_file)
                log.info(f"Bloom Filter loaded from file {bloom_file}!")
            else:
                # new Bloom filter
                self._initialize_bloom_filter()
        return self._bloom
Example #34
def threaded_crawl(tid, n, proxies, lock, output_dir="."):
    global count
    global failures
    fails = 0
    logger = logging.getLogger(__name__)
    fptr = open("top-1m.csv", "r")
    fail_thresh = 10  # Use a different proxy after 10 failed requests in a row
    proxy = dict()
    linum = fails = 0
    start = tid * n  # First seed site to crawl
    end = tid * n + n  # Last seed site to crawl
    seed = BloomFilter(n * 1000000, 0.1, '/tmp/{}.bloom'.format(tid).encode())
    frontier = deque()
    logger.info('[tid {}] Loading seed URLs {} - {}'.format(tid, start, end))
    for line in fptr:
        if linum >= start and linum < end:
            url = "http://" + line.split(',')[1].strip()
            seed.add(url.encode())
            frontier.append(url)
        linum += 1
    fptr.close()
    while True:
        url = frontier.popleft()
        urls = []
        try:
            urls = parse_url(url, proxy, output_dir)
        except Exception as e:
            logger.error(
                "[tid {}] Fatal error occured while crawling: {}.".format(
                    tid, url))
        if len(urls) == 0:
            with lock:
                failures += 1
            fails += 1
            if fails > fail_thresh:
                proxy['http'] = proxies[randint(0, len(proxies) - 1)]
                logger.error("[tid {}] Failure: Activating proxy:{}".format(
                    tid, proxy['http']))
                fails = 0
        for u in urls:
            link = u.encode()
            if link not in seed:
                seed.add(link)
                frontier.append(link)
        with lock:
            count += 1
            if (count % 1000 == 0):
                logger.info('Page count: {}'.format(count))
        if len(frontier) % 1000 == 0:
            logger.info("[tid {}] Frontier count: {}".format(
                tid, len(frontier)))
Example #35
    def __init__(
        self,
        directory: Path,
        filter_capacity: int,
        filter_error_rate: float,
        batch_count: int,
        batch_duration_sec: int,
    ):
        """Create a BatchedBloomFilter from a set of files, named `<unix_timestamp>.bloom`."""
        self.directory = directory
        self.filter_capacity = filter_capacity
        self.filter_error_rate = filter_error_rate
        self.batch_count = batch_count
        self.batch_duration_sec = batch_duration_sec

        files = list(self.directory.glob('*.bloom'))

        timestamps = []
        timestamp_to_path = {}
        for path in files:
            try:
                timestamp = int(path.stem)
            except ValueError:
                log.info(
                    'Ignoring invalid file name (expecting <unix_timestamp>.bloom): %s',
                    path)
            else:
                timestamps.append(timestamp)
                timestamp_to_path[timestamp] = path

        recent_timestamps = sorted(timestamps)[-self.batch_count:]
        try:
            self.last_batch_ts = recent_timestamps[-1]
        except IndexError:
            self.last_batch_ts = 0

        self.batches = [
            BloomFilter.open(str(timestamp_to_path[ts]))
            for ts in recent_timestamps
        ]
        log.info('Found existing bloom filters: %s',
                 dict(zip(recent_timestamps, self.batches)))
        self.rotate_if_needed()
Example #36
    def create(infile, outfile, capacity: int, error_rate: float = 0.05):
        import tqdm
        import urllib
        from pybloomfilter import BloomFilter

        bf = BloomFilter(capacity, error_rate, outfile)
        with open(infile) as f:
            for _, word in enumerate(tqdm.tqdm(f, total=capacity)):
                if "%" in word:
                    word = urllib.parse.unquote(word).lower()
                word = word.rstrip()
                bf.add(word)

        bf.close()
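A hedged usage sketch for the create() helper above, assuming it can be called as a plain function and that the input file holds one word per line; the file names and capacity are placeholders.

create('words.txt', 'words.bloom', capacity=1000000, error_rate=0.01)

from pybloomfilter import BloomFilter
bf = BloomFilter.open('words.bloom')  # reopen the filter file produced by create()
print('example' in bf)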
Example #37
class FilterPipeline(object):
    def __init__(self):
        self.bloomname = "filter"

    def open_spider(self, spider):
        isexists = os.path.exists(self.bloomname + ".bloom")
        if isexists == True:
            self.bf = BloomFilter.open(self.bloomname + ".bloom")
        else:
            self.bf = BloomFilter(100000000, 0.001, self.bloomname + '.bloom')

    def process_item(self, item, spider):
        # Deduplicate on url plus song name: if the same song title is scraped from the same url, treat it as a duplicate
        token = (str(item['url']) + str(item['song_info']))
        flag = self.bf.add(token)
        # False means the element was added; add() returns True if an identical element is already in the filter
        if flag == False:
            return item
        else:
            raise DropItem("find this link in bloomfilter!!!")
Example #38
    def fit(self, tokens, class_label):
        #if class_label not in self.class_to_toks_bf:
        #	self.class_to_toks_bf[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate)

        if class_label not in self.vocab_sizes:
            self.vocab_sizes[class_label] = BloomFilter(
                capacity=self.initial_capacity, error_rate=self.error_rate)

        self.tokens_per_class[class_label] = self.tokens_per_class.get(
            class_label, 0) + len(tokens)
        tok_freqs = self.makeTokenFreqmap(tokens)

        for token, token_freq in tok_freqs.iteritems():
            #self.class_to_toks_bf[class_label].add(token)
            self.token_type_bf.add(token)
            #conditional_counts_bf[token+'_'+class_label] += token_freq
            self.class_conditional_counts[token + '_' +
                                          class_label] += token_freq
            self.vocab_sizes[class_label].add(token)

        self.class_freqs[class_label] = self.class_freqs.get(class_label,
                                                             0) + 1
        self.N += 1
Example #39
    def rotate_if_needed(self):
        """Remove stale filters, create a new filter if needed, named `<unix_timestamp>.bloom`."""
        ts = int(time.time())
        if ts - self.last_batch_ts > self.batch_duration_sec:
            retained = self.batch_count - 1
            stale = self.batches[:-retained]
            self.batches = self.batches[-retained:]

            for stale_bf in stale:
                file_name = Path(stale_bf.filename)
                stale_bf.close()
                file_name.unlink()
                log.info('Closed stale bloom filter: %s', file_name)

            bloom_filter_file = self.directory / f'{ts}.bloom'
            self.batches.append(
                BloomFilter(self.filter_capacity, self.filter_error_rate,
                            str(bloom_filter_file)))
            self.last_batch_ts = ts
            log.info('Created a new bloom filter: %s', bloom_filter_file)

            log.info('Operating with filters: %r',
                     [(bf.filename, bf) for bf in self.batches])
Example #40
class DuplicateFilter(RFPDupeFilter):
    """
    A dupe filter for url
    """
    def __init__(self, path=FILTER_PATH, debug=False):
        if os.path.exists(FILTER_PATH):
            self.url_filter = BloomFilter.open(FILTER_PATH)
        else:
            print "created a new bloom filter. "
            self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH)
        super(DuplicateFilter, self).__init__(path, debug)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def request_seen(self, request):
        if request.url.startswith("http://www.dianping.com/shop/"):
            fp = self.request_fingerprint(request)
            if self.url_filter.add(fp):
                print ">" * 5 + "filtered " + request.url + "<" * 5
                return True

    def close(self, reason):
        self.url_filter = None
Example #41
def getbloomFilter(bf, fem_kmers, kmer_size):
    if bf:
        print("Opening Bloom Filter of k-mers from female")
        female_kmers_bf = BloomFilter.open("data/female.bloom")
        print("Done")
    else:
        print("Need to make Bloom Filter of k-mers from female")
        bf_size = 3 * 1000 * 1000 * 1000
        bf_filename = "data/female.bloom"
        female_kmers_bf = BloomFilter(bf_size, .001, bf_filename)

        if fem_kmers:  # if female kmers file exist
            female_kmers_file = "data/female_kmers"
            with open(female_kmers_file, 'r') as fm_kmers:
                #assumes kmers are uppercase
                first_line = fm_kmers.readline()
                kmers.test_valid_kmer_format(first_line, kmer_size)
                fm_kmers.seek(0)
                for line in fm_kmers:
                    female_kmers_bf.add(line[:kmer_size])
        else:
            print(
                "Reading female reference one record at a time and k-merizing each record..."
            )
            female_reference_file = "data/female.fasta"
            n_kmers = "N" * kmer_size
            for record in SeqIO.parse(female_reference_file, "fasta"):
                to_kmerize_fwd = str(record.seq).upper()
                length = len(to_kmerize_fwd)
                for i in range(0, length - kmer_size + 1):
                    female_kmer = to_kmerize_fwd[i:i + kmer_size]
                    if female_kmer != n_kmers:
                        female_kmers_bf.add(to_kmerize_fwd[i:i + kmer_size])

        print("Done creating bloom filter")
    return female_kmers_bf
Example #42
 def __init__(self, path):
     if not os.path.exists(path):
         raise RuntimeError(u"Missing Bloom: %s" % path)
     self.bloom = BloomFilter.open(path)
Example #43
#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
from JobCrawler import JobCrawler
from pybloomfilter import BloomFilter
from time import time

company_bf = BloomFilter(1024 * 1024 * 16, 0.01)
total_page = 1


def get_company_info(url, page=1):
    if page > total_page:
        return

    wbdata = requests.get(url).content
    soup = BeautifulSoup(wbdata, 'lxml')

    # print soup.prettify()

    company_list = soup.select('div.el > span.t2')
    # print type(company_list), '\ncompany_list :', company_list
    for index, company in enumerate(company_list):
        if index != 0:
            company_result = company.find_all(name='a')
            company_link = company_result[0].attrs['href']
            company_name = company_result[0].attrs['title']
            print company_name, ' - ', company_link
Example #44
 def load_bf(self, filename, capacity, error_rate):
     bf = BloomFilter(capacity=capacity, error_rate=error_rate)
     with open(filename) as f:
         for line in f:
             bf.add(line.split('\t')[0].strip())
     return bf
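load_bf above constructs the filter with keyword arguments and no backing file, as do a few other examples (e.g. #7, #20, #43); passing a filename instead gives the file-backed form used elsewhere. A small sketch contrasting the two constructions; values and paths are arbitrary.

from pybloomfilter import BloomFilter

in_memory = BloomFilter(capacity=100000, error_rate=0.01)     # no filename: filter is not persisted
on_disk = BloomFilter(100000, 0.01, '/tmp/persistent.bloom')  # file-backed filter, reopenable later
on_disk.add('key')
on_disk.sync()                                                # flush to disk, as example #46 does
reopened = BloomFilter.open('/tmp/persistent.bloom')
print('key' in reopened)  # True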
Example #45
        print("---  new folder...  ---")
        print("---  OK  ---")

    else:
        print("---  There is this folder!  ---")


if __name__ == '__main__':
    ProgramStarttime = datetime.datetime.now()
    try:
        # Call the helper to create the folder
        file_ad1 = "/home/260199/爬虫/爬虫数据/政府公告/政府政策公告信息" + str(ProgramStarttime) + "/国家超链接/"
        mkdir(file_ad1)  # 调用函数

        all_href = []
        href_bloom = BloomFilter.open('/home/260199/爬虫/爬虫代码/政策公告/government/country/all_href.bloom')

        # Create the Excel workbook and write the header row
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('国家级政府公告', cell_overwrite_ok=True)
        header = [u'标题', u'正文', u'发布部门',u'所在栏目', u'栏目类别',u'发布日期',u'爬取时间', u'政策链接', u'附件']
        i = 0
        # Write the header row
        for each_header in header:
            worksheet.write(0, i, each_header)
            i += 1
        row = 1
        print("当前时间为:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        print("国家工信部 数据开始收集,请稍等...")
        row,href_list = gongxinbu.main(row,worksheet,href_bloom,file_ad1,ProgramStarttime)
        all_href.extend(href_list)
Example #46
class urlFrontier:
    def __init__(self, node_n, seen_persist, Q_logs=None):
        self.node_n = node_n
        self.Q_logs = Q_logs
        self.total_crawled = 0
        self.payloads_dropped = 0

        # single variable for tracking whether node should be active or not
        self.active = True

        # crawl task Queue
        # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
        self.Q_crawl_tasks = Queue.PriorityQueue()

        # host queue dict
        # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
        self.hqs = {}

        # seen url check
        # Bloom Filter ~ [ url ]
        if seen_persist:
            try:
                self.seen = BloomFilter.open(BF_FILENAME)
            except:
                self.Q_logs.put('Error opening bloom filter, creating new one')
                self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE,
                                        BF_FILENAME)
        else:
            self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

        # DNS Cache
        # { netloc: (host_addr, time_last_checked) }
        self.DNScache = {}

        # overflow url Queue
        # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
        self.Q_overflow_urls = Queue.Queue()

        # host queue cleanup Queue
        # Priority Queue ~ [ (time_to_delete, host_addr) ]
        self.Q_hq_cleanup = Queue.PriorityQueue()

        # active url count queue- for counting/tracking active
        # Queue ~ [ True ]
        self.Q_active_count = Queue.Queue()

        # thread active url dict- a dict of active urls by thread using, for restart dump
        # { thread_name: active_url }
        # NOTE: there are problems with this methodology, but errors will only lead
        # to data redundancy (as opposed to omission)...
        self.thread_active = {}

        # Queue of messages to be sent to other nodes
        # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
        self.Q_to_other_nodes = Queue.Queue()

    # primary routine for getting a crawl task from queue
    def get_crawl_task(self):
        if self.active:
            return self.Q_crawl_tasks.get()

        # if url frontier shutdown, block indefinitely (until node shutdown)
        else:
            while True:
                time.sleep(10)

    # primary routine to log crawl task done & submit extracted urls
    def log_and_add_extracted(self,
                              host_addr,
                              host_seed_dist,
                              success,
                              time_taken=0,
                              url_pkgs=[]):

        # handle failure of page pull
        # NOTE: TO-DO!
        if not success:
            pass

        # add urls to either hq of host_addr or else overflow queue
        for url_pkg in url_pkgs:
            self._add_extracted_url(host_addr, host_seed_dist, url_pkg)

        # calculate time delay based on success
        now = datetime.datetime.now()
        r = random.random()
        td = 10 * time_taken + r * BASE_PULL_DELAY if success else (
            0.5 + r) * BASE_PULL_DELAY
        next_time = now + datetime.timedelta(0, td)

        # if the hq of host_addr is not empty, enter new task in crawl task queue
        if len(self.hqs[host_addr]) > 0:

            # add task to crawl task queue
            r = self.hqs[host_addr].pop()
            self.Q_crawl_tasks.put((next_time, host_addr) + r)

        # else if empty, add task to cleanup queue
        else:
            self.Q_hq_cleanup.put((next_time, host_addr))

        # report crawl task done to queue, HOWEVER do not submit as done till payload dropped
        self.Q_crawl_tasks.task_done()

    # subroutine to add a url extracted from a host_addr
    def _add_extracted_url(self,
                           ref_host_addr,
                           ref_seed_dist,
                           url_pkg,
                           from_other_node=False):
        url_in, ref_page_stats, parent_url = url_pkg

        # basic cleaning operations on url
        # NOTE: it is the responsibility of the crawlNode.py extract_links fn to server proper url
        url = re.sub(r'/$', '', url_in)

        # BLOCK certain urls based on manual block rgx
        if re.search(BLOCK_URL_RGX, url) is not None:
            return False

        # if url already seen do not proceed, else log as seen
        if url in self.seen:
            return False
        else:
            self.seen.add(url)

        # get host IP address of url
        url_parts = urlparse.urlsplit(url)
        host_addr = self._get_and_log_addr(url_parts.netloc)

        # if the page is not of a safe type log and do not proceed
        # NOTE: certain types e.g. pdf, doc will be passed and handled specially by crawl_page!
        if re.search(SAFE_PATH_RGX, url_parts.path) is None:
            if DEBUG_MODE:
                self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url, ))
            return False

        # if DNS was resolved error already reported, do not proceed any further
        if host_addr is None:
            return False

        # calculate url's seed distance
        if not from_other_node:
            seed_dist = ref_seed_dist if host_addr == ref_host_addr else ref_seed_dist + 1
        else:
            seed_dist = ref_seed_dist

        # check for being past max seed distance
        if seed_dist > MAX_SEED_DIST and MAX_SEED_DIST > -1:
            return False

        # --> At this point, marker should be added to active count
        #     This will be removed when url is either:
        #       (A) sent to another node successfully
        #       (B) dropped to payload database
        self.Q_active_count.put(True)
        if DEBUG_MODE:
            self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())

        # if the page belongs to another node, pass to message sending service
        if not from_other_node:
            if DISTR_ON_FULL_URL:
                url_node = hash(url) % NUMBER_OF_NODES
            else:
                url_node = hash(host_addr) % NUMBER_OF_NODES
            if url_node != self.node_n:
                self.Q_to_other_nodes.put(
                    (url_node, url, ref_page_stats, seed_dist, parent_url))
                return False

        # if this is an internal link, and not from other node, send directly to the serving hq
        if seed_dist == ref_seed_dist and not from_other_node:
            self.hqs[host_addr].append(
                (url, ref_page_stats, seed_dist, parent_url))

            # update total count
            self.total_crawled += 1

        # else send to overflow_urls to stay cautiously thread safe
        else:

            # add to overflow queue
            self.Q_overflow_urls.put(
                (host_addr, url, ref_page_stats, seed_dist, parent_url))

            # add to active count
            self.total_crawled += 1

    # subfunction for getting IP address either from DNS cache or web
    def _get_and_log_addr(self, hostname):

        # try looking up hostname in DNScache
        now = datetime.datetime.now()
        if self.DNScache.has_key(hostname):

            # check time for DNS refresh
            addr, created = self.DNScache[hostname]
            age = now - created
            if age.seconds > DNS_REFRESH_TIME:
                addr = self._get_addr(hostname)
                if addr is not None:
                    self.DNScache[hostname] = (addr, now)
                else:
                    del self.DNScache[hostname]
        else:
            addr = self._get_addr(hostname)
            if addr is not None:
                self.DNScache[hostname] = (addr, now)
        return addr

    # sub-subfunction for getting IP address from socket
    def _get_addr(self, hostname):
        try:
            addr_info = socket.getaddrinfo(hostname, None)
        except Exception as e:
            self.Q_logs.put('DNS ERROR: skipping ' + hostname)
            return None

        # ensure result is non-null
        if len(addr_info) > 0:
            return addr_info[0][4][0]
        else:
            self.Q_logs.put('DNS ERROR: skipping ' + hostname)
            return None

    # primary routine WITH INTERNAL LOOP for maintenance threads
    # routine is: get cleanup task --> delete old hq after wait --> fill from overflow
    # routine is looped so as not to get stuck in an impasse situation
    def clean_and_fill_loop(self):
        hqs_to_make = 0

        # primary loop- must loop so as not to get stuck in impasse situation
        while self.active:

            # get queue to delete & time to delete at; if no hqs to make then block
            get_block = (hqs_to_make == 0)
            try:
                time_to_delete, host_addr = self.Q_hq_cleanup.get(get_block)

                # wait till safe to delete, then delete
                wait_time = time_to_delete - datetime.datetime.now()
                time.sleep(max(0, wait_time.total_seconds()))
                del self.hqs[host_addr]
                hqs_to_make += 1

            # if there are still hqs to make, then don't block on getting more cleanup tasks
            except Queue.Empty:
                pass

            # try a bounded number of times to find a url in overflow that doesn't already have an hq
            for i in range(min(OVERFLOW_TRY_MAX,
                               self.Q_overflow_urls.qsize())):

                # get an overflow url tuple
                r = list(self.Q_overflow_urls.get())
                host_addr = r[0]

                # if hq already exists for this host_addr then recycle and continue
                if self.hqs.has_key(host_addr):
                    self.Q_overflow_urls.task_done()
                    self.Q_overflow_urls.put(tuple(r))
                    continue

                # else create a new hq
                else:
                    self.hqs[host_addr] = []

                    # if OVERFLOW_MULTI enabled, try to fill the new hq with multiple consecutive
                    cn = 0
                    while cn < OVERFLOW_MULTI_TRY_L:
                        try:
                            s = list(self.Q_overflow_urls.get(False))

                        # don't block on attempt to fill additional urls from overflow here...
                        except Queue.Empty:
                            break

                        # check if the pulled url belongs in the hq, if not recycle
                        if s[0] == host_addr:
                            self.hqs[host_addr].append(tuple(s[1:]))
                        else:
                            self.Q_overflow_urls.put(tuple(s))
                            cn += 1
                        self.Q_overflow_urls.task_done()

                    # add the original url from overflow to crawl tasks
                    r.insert(0, datetime.datetime.now())
                    self.Q_crawl_tasks.put(tuple(r))
                    hqs_to_make -= 1
                    self.Q_overflow_urls.task_done()
                    self.Q_hq_cleanup.task_done()
                    break

    # primary routine for initialization of url frontier / hqs
    # NOTE: !!! Assumed that this is sole thread running when executed, prior to crawl start
    def initialize(self, urls=[]):
        now = datetime.datetime.now()

        # initialize all hqs as either full & tasked or empty & to be deleted
        i = 0
        while len(self.hqs) < HQ_TO_THREAD_RATIO * NUMBER_OF_CTHREADS:
            i += 1

            # expend all given urls
            if len(urls) > 0:
                self._init_add_url(urls.pop())

            # else add empty queues and mark to be cleared & replaced
            else:
                self.hqs[i] = []
                self.Q_hq_cleanup.put((now, i))

        # if there are urls left over, add to appropriate queues
        for url in urls:
            self._init_add_url(url)

    # subroutine for adding url to hq, assuming only one thread running (initialization)
    def _init_add_url(self, url_in):

        # basic cleaning operations on url
        url = re.sub(r'/$', '', url_in)

        # assume unseen and input to seen list, add to active count
        self.seen.add(url)

        # BLOCK certain urls based on manual block rgx
        if re.search(BLOCK_URL_RGX, url) is not None:
            return False

        # get host IP address of url
        url_parts = urlparse.urlsplit(url)
        host_addr = self._get_and_log_addr(url_parts.netloc)

        # if the page is not of a safe type log and do not proceed
        if re.search(SAFE_PATH_RGX, url_parts.path) is None:
            if DEBUG_MODE:
                self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url, ))
            return False

        # if DNS was resolved error already reported, do not proceed any further
        if host_addr is None:
            return False

        # if the page belongs to another node, pass to message sending service
        if DISTR_ON_FULL_URL:
            url_node = hash(url) % NUMBER_OF_NODES
        else:
            url_node = hash(host_addr) % NUMBER_OF_NODES
        if url_node != self.node_n:
            self.Q_to_other_nodes.put((url_node, url, None, 0, None))
            return False

        # add to an existing hq, or create new one & log new crawl task, or add to overflow
        self.Q_active_count.put(True)
        self.total_crawled += 1
        if DEBUG_MODE:
            self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())
        if self.hqs.has_key(host_addr):
            self.hqs[host_addr].append((url, None, 0, None))
        elif len(self.hqs) < HQ_TO_THREAD_RATIO * NUMBER_OF_CTHREADS:
            self.hqs[host_addr] = []
            self.Q_crawl_tasks.put(
                (datetime.datetime.now(), host_addr, url, None, 0, None))
        else:
            self.Q_overflow_urls.put((host_addr, url, None, 0, None))

    # routine called on abort (by user interrupt or by MAX_CRAWLED count being reached) to
    # save current contents of all queues to disk & seen filter flushed for restart
    def dump_for_restart(self):

        # ensure url frontier deactivated
        self.active = False

        # get all urls in Q_crawl_tasks, hqs, or Q_overflow_urls
        # only get urls as these will be re-injected through the initialize method of uf
        with open(RESTART_DUMP, 'w') as f:
            for thead_name, url in self.thread_active.iteritems():
                if url is not None:
                    f.write(url + '\n')

            while not self.Q_crawl_tasks.empty():
                try:
                    r = self.Q_crawl_tasks.get(True, 1)
                    f.write(r[2] + '\n')
                except:
                    continue

            for host_addr, paths in self.hqs.iteritems():
                for path in paths:
                    f.write(path[0] + '\n')

            while not self.Q_to_other_nodes.empty():
                try:
                    r = self.Q_to_other_nodes.get(True, 1)
                    f.write(r[1] + '\n')
                except:
                    continue

            while not self.Q_overflow_urls.empty():
                try:
                    r = self.Q_overflow_urls.get(True, 1)
                    f.write(r[1] + '\n')
                except:
                    continue

        # ensure seen filter file is synced
        self.seen.sync()
Example #47
'''
from core.data.bloomfilter.wrappers import GenericBloomFilter

# This import can't fail, it is pure-python love ;)
from core.data.bloomfilter.seekfile_bloom import FileSeekBloomFilter\
    as FileSeekFilter

try:
    # This might fail since it is a C library that only works in Linux
    from pybloomfilter import BloomFilter as CMmapFilter

    # There were reports of the C mmap filter not working properly in OSX,
    # just in case, I'm testing here...
    temp_file = GenericBloomFilter.get_temp_file()
    try:
        bf = CMmapFilter(1000, 0.01, temp_file)
        bf.add(1)
        assert 1 in bf
        assert 2 not in bf
    except:
        WrappedBloomFilter = FileSeekFilter
    else:
        WrappedBloomFilter = CMmapFilter

except:
    WrappedBloomFilter = FileSeekFilter


class BloomFilter(GenericBloomFilter):
    def __init__(self, capacity, error_rate):
        '''
Example #48
def setup(database: dict,
          password: str,
          bloomfilter_file=None,
          bf_false_positive_rate=BLOOMFILTER_DEFAULT_FALSE_POSITIVE_RATE,
          paralleled=False,
          num_processes=None) -> tuple:
    """
    Setup method of OXT for a database
    :param database: database with id -> list of words
    :param password: password to create keys
    :param bloomfilter_file: file to read/write bloomfilter
    :param bf_false_positive_rate: bloomfilter false positive rate
    :param bool paralleled: should we parallel the process or not
    :param num_processes: number of process used if parallel
    :return: (key, encrypted database)
    """
    global var_dict

    # TODO: generate keys from password
    K_P = random_secure(1)  # key to XOR index

    K_S = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for e
    iv = random_secure(
        CMAC_AES128_KEY_LENGTH_IN_BYTES)  # IV for AES encryption
    K_X = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for xtag
    K_I = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for index
    K_Z = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for Z
    K_T = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for keyword

    pairing = PairingGroup('SS512')

    g = pairing.random(GT)
    assert g.initPP(), "ERROR: Failed to init pre-computation table for g."

    total_pairs = 0
    inverted_index_all_pairs = defaultdict(
        list)  # word -> list of ids containing this word

    if paralleled:
        # parallel processing
        logger.info('Parallel gen_inverted_index')
        pool = multiprocessing.Pool()
        num_docs = len(database)
        inverted_tuples = pool.starmap(
            gen_inverted_index_paralleled,
            list(zip(database.items(), [K_P] * num_docs)))
        for inverted_list in inverted_tuples:
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(rind)
                total_pairs += 1

    else:
        # sequential processing
        logger.info('Seq inverted_index_all_pairs')
        for (ind, words) in database.items():
            inverted_list = gen_inverted_index(ind, words, K_P)

            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(
                    rind)  # rind is now bytes
                total_pairs += 1

    # generate xtags. Each xtag is for a pair (word, index)
    xtags = set()

    if paralleled:
        logger.info('Parallel xtags')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_xtags_parallel,
                                  initargs=(K_X, pairing, K_I, g)) as pool:
            xtags_lists = pool.map(gen_xtags_parallel,
                                   inverted_index_all_pairs.items())

            for xtags_list in xtags_lists:
                xtags.update(xtags_list)

            var_dict = {}
    else:
        logger.info('Seq xtags')
        for word, indices in inverted_index_all_pairs.items():
            xtags.update(gen_xtags(word, indices, K_X, pairing, K_I, g))

    # Create a Bloom filter and bitarray
    if bloomfilter_file is not None:
        bf = BloomFilter(total_pairs, bf_false_positive_rate, bloomfilter_file)
    else:
        bf = BloomFilter(total_pairs, bf_false_positive_rate)
    num_bits = bf.num_bits
    bits = bitarray(num_bits)
    bits.setall(False)

    # compute the positions of each xtag and set them in both structures:
    # we mirror the filter in a separate bitarray because the library does not
    # expose its underlying bits (e.g. to check whether a single bit is set)
    xtag: str
    for xtag in xtags:
        bf.add(xtag)

        # mimic set in bits array
        for hash_seed in bf.hash_seeds:
            pos = bloomfilter_hash(xtag, hash_seed) % num_bits
            bits[pos] = True
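    # Note (illustrative, not in the original): membership can later be checked
    # either directly with `xtag in bf`, or against the mirrored bitarray by
    # recomputing the same positions, e.g.
    #     all(bits[bloomfilter_hash(xtag, seed) % num_bits]
    #         for seed in bf.hash_seeds)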

    # generate encrypted database
    edb1 = dict()
    if paralleled:
        logger.info('Parallel edb1')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_t_set_parallel,
                                  initargs=(K_S, K_I, K_Z, K_T, iv,
                                            pairing)) as pool:
            t_set_dict_lists = pool.map(gen_t_set_parallel,
                                        inverted_index_all_pairs.items())

            for t_set_dict in t_set_dict_lists:
                edb1.update(t_set_dict)

            var_dict = {}
    else:
        logger.info('Seq edb1')

        for word, indices in inverted_index_all_pairs.items():
            edb1.update(
                gen_t_set(word, indices, K_S, K_I, K_Z, K_T, iv, pairing))

    key = (K_P, K_S, K_X, K_I, K_Z, K_T)
    g_serialized = pairing.serialize(g)

    return key, iv, g_serialized, edb1, bf, bits
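Example #49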
 def __init__(self, capacity, error_rate):
     super().__init__()
     self.bloom_filter_1 = BloomFilter(capacity, error_rate)
     self.bloom_filter_2 = BloomFilter(capacity, error_rate)
Example #50
0
1. Domestic - province - destination: gets every city in that region
2. City - attractions: gets every attraction in that city
3. City - community - travel notes: gets every travel note for that city

--
BloomFilter
"""

import os
import requests
import re
from pybloomfilter import BloomFilter


dir_name = 'notes/'
bf = BloomFilter(1024 * 1024 * 16, 0.01)


def find_all_city_pages_url():
    req = requests.get('http://www.mafengwo.cn/mdd/')
    city_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html', req.text)
    return city_pages


def get_city_number(url):
    return url[29:34]


def save_html(file_name, html):
    with open(file_name, 'wb+') as f:
        f.write(html.encode())
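# A hedged sketch of how these helpers plausibly fit together (the driver code
# is not part of the original snippet): find the city pages, skip the ones the
# Bloom filter has already seen, then fetch and save each page under notes/.
def crawl_city_pages():
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    for page in find_all_city_pages_url():
        if page in bf:
            continue
        bf.add(page)
        html = requests.get('http://www.mafengwo.cn' + page).text
        save_html(dir_name + get_city_number(page) + '.html', html)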
Example #51
0
 def __init__(self, path=None):
     self.file = None
     self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')
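 # A hedged sketch of the request_seen() method that typically accompanies such
 # a dupefilter __init__ in Scrapy; only __init__ is shown in the original, so
 # this method and the `from scrapy.utils.request import request_fingerprint`
 # import it relies on are assumptions. BloomFilter.add() returns True when the
 # fingerprint was already present, which is exactly the "seen before" answer
 # Scrapy expects from request_seen().
 def request_seen(self, request):
     fp = request_fingerprint(request)
     return self.fingerprints.add(fp)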
Example #52
0
 def __init__(self):
     self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
     self.f_write = open('visitedsites','w')
     self.si = SearchIndex()
     self.si.SearchInit()
Example #53
0
def dsk_with_bf(file_name, n_iter, n_partition, kmer_size, bf_capacity,
                bf_error, top_count, verbose):
    """
    Disk streaming of kmers with bloom filter.
    :param file_name: File to be processed.
    :param n_iter: Number of iterations used to write kmers to disk.
    :param n_partition: Number of partition files each iteration's kmers are
        split into (read back into memory one partition at a time).
    :param kmer_size: Length of the kmer.
    :param bf_capacity: Capacity of the bloom filter.
    :param bf_error: Probability of false positive in bloom filter.
    :param top_count: Number of kmers to be printed.
    :param verbose: Option to print elapsed time and memory usage.
    :return:
    """
    start_operation = time.time()
    # initialise a min heap
    h = Heap()
    h.populate(top_count)
    for iter_ in range(n_iter):
        start_iter = time.time()
        # initialise files where partitioned data is written.
        files = [open("{}".format(j), "w") for j in range(n_partition)]
        with open(file_name, "r") as file_from_read:
            count = 0
            for line in file_from_read:
                # take the second line to parse kmers.
                if count % 4 == 1:
                    line_length = len(line) - 1
                    for i in range(line_length - kmer_size + 1):
                        kmer = line[i:kmer_size + i]
                        # assign kmers to partitions.
                        hash_result = mmh3.hash(kmer)
                        if hash_result % n_iter == iter_:
                            # assign kmers to files
                            j = int((hash_result / n_iter) % n_partition)
                            files[j].write(kmer + "\n")
                count += 1
        for f in files:
            f.close()
        end = time.time()
        if verbose:
            print("Disk write for iteration {0} done in {1} seconds".format(
                str(iter_), str(end - start_iter)))
        for j in range(n_partition):
            # initialise bloom filter
            bf = BF(bf_capacity, bf_error, "bf_dsk")
            start_partition = time.time()
            kmer_freq = dict()
            with open(str(j), "r") as f:
                for kmer in f:
                    if kmer in bf:
                        if kmer not in kmer_freq:
                            kmer_freq[kmer] = 1
                        kmer_freq[kmer] += 1
                    else:
                        bf.add(kmer)
            end = time.time()
            if verbose:
                print(
                    "Hash table for iteration {0}, partition {1} done in {2} seconds."
                    .format(str(iter_), str(j), str(end - start_partition)))
                print(
                    "Has table size for iteration {0} partition {1} is {2} Mb".
                    format(str(iter_), str(j),
                           str(int(sys.getsizeof(kmer_freq)) / 10**6)))

            start_heap = time.time()
            for kmer, freq in kmer_freq.items():
                if freq > h.min():
                    # h.pop()
                    # h.push((freq, kmer))
                    h.push_pop((freq, kmer))
            end = time.time()
            if verbose:
                print("Heap done in {0} seconds".format(end - start_heap))
            # clean file and bf
            os.remove(str(j))
            os.remove("bf_dsk")
        end_iter = time.time()
        if verbose:
            print("Iteration {0} done in {1} seconds.".format(
                str(iter_), str(end_iter - start_iter)))

    for item in h.nlargest(top_count):
        freq, kmer = item
        print(kmer[:-1], freq)
    end = time.time()
    if verbose:
        print("Process done in {0} seconds.".format(str(end -
                                                        start_operation)))
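# A hypothetical invocation of dsk_with_bf (the file name and parameter values
# below are illustrative assumptions, not taken from the original project):
if __name__ == "__main__":
    dsk_with_bf(file_name="reads.fastq", n_iter=4, n_partition=8,
                kmer_size=21, bf_capacity=10**7, bf_error=0.01,
                top_count=25, verbose=True)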
Example #54
0
#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import csv
import sys
import datetime
from time import time
from pybloomfilter import BloomFilter

reload(sys)
sys.setdefaultencoding('utf-8')

download_bf = BloomFilter(1024*1024*16, 0.01)

def request(url, isFirstPage):
    if url not in download_bf:
        download_bf.add(url)
    else:
        return

    res = requests.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    # print soup.prettify()

    keylist = soup.select('div.key-list > div.item-mod')
    for index, house in enumerate(keylist):
        # if index == 2:
            # print house
Example #55
0
_, dim = T_des.shape

# In[4]:

LSH_random_vectors_set = []
#powers_of_two = 1 << np.arange(LSH_dim-1, -1, -1)

# creating the multiple LSH random vectors
for i in range(L_buckets):
    np.random.seed(i)
    LSH_random_vectors_set.append(np.random.randn(dim, LSH_dim))

# creating the multiple Bloom Filters
BF_set = []
for i in range(L_buckets):
    BF_set.append(BloomFilter(2**(2 * LSH_dim), 0.01, None))

# In[5]:

t0 = time.process_time()

Q_kp, Q_des = detector.detectAndCompute(query_img, None)

t1 = time.process_time()

# We now add each LSH hash result to its dedicated Bloom filter
for i in range(L_buckets):
    Q_reflections = Q_des.dot(LSH_random_vectors_set[i]) >= 0

    for q in np.array(Q_reflections, dtype=int):
        BF_set[i].add(q.tostring(None))
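# A plausible next step, assumed because the notebook is truncated here: hash
# the target descriptors T_des with the same random vectors and count how many
# of them fall into each query-side Bloom filter.
matches = [0] * L_buckets
for i in range(L_buckets):
    T_reflections = T_des.dot(LSH_random_vectors_set[i]) >= 0
    for t in np.array(T_reflections, dtype=int):
        if t.tostring(None) in BF_set[i]:
            matches[i] += 1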
Example #56
0
import requests
import re
import json
from redis import Redis
from rq import Queue
from bs4 import BeautifulSoup
from pybloomfilter import BloomFilter
from utils import get_html,get_proxy,delete_proxy,get_content
from urllib.parse import urlencode

low = Queue('low',connection=Redis(host='localhost',port=6379))

bloom_f = BloomFilter(capacity=100000, error_rate=0.01)


def spider_movie_comment(movie_id):
    # Get Pages
    url = "https://movie.douban.com/subject/"+movie_id+"/reviews?start="
    head = get_html(url+str(0))
    html = BeautifulSoup(head.content,"lxml")
    temp_html = html.select("#content > h1")
    print(temp_html)
    # f = open("index.html","w")
    # f.write(html.prettify())
    # f.close()

    text = temp_html[0].text
    page = int(re.sub(r"\D*","", text))
    data = []

    for page_num in range(page//20+1):
Example #57
0
from pybloomfilter import BloomFilter
import sys, signal
from time import time, sleep
import os
from worker_filter import Filter

st = time()

done_sites_fname = 'done_sites.bin'
if os.path.isfile(done_sites_fname):
    bfdone = BloomFilter.open(done_sites_fname)
else:
    print "no file"
    bfdone = BloomFilter(2**27, 10**(-5), done_sites_fname)  #8M

start = 0

filter = Filter()

f = open('done_urls20160601.txt').read().strip().split('\n')
for url in f:
    bfdone.add(url)
print len(f)
cnt = 0
for url in f:
    if url in bfdone:
        cnt += 1
print cnt
inc = 0

print time() - st
Example #58
0
class PcautoAskSpider(scrapy.Spider):
    name = "pcauto_ask"
    allowed_domains = ["pcauto.com.cn"]
    #start_urls = ['http://k.pcauto.com.cn/question/4035240.html']
    start_urls = ['http://k.pcauto.com.cn/question/k16/p1.html']

    def __init__(self):
        bloomfilterfilename = 'pcauto.filter'
        try:
            self.bf = BloomFilter.open(bloomfilterfilename)
        except:
            logging.info("new filter.bloom")
            self.bf = BloomFilter(50000000, 0.05, bloomfilterfilename)

    def start_requests(self):
        urls = [
            "http://k.pcauto.com.cn/question/k%d/p1.html" % i
            for i in (1, 2, 4, 5, 6)
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_category)

    def parse_category(self, response):
        for element_li in response.xpath(
                '//ul[@id="wtList"]/li[@class!="liTit"]'):
            url = element_li.xpath(
                'i[@class="iTitle"]/a/@href').extract_first()
            num = element_li.xpath('i[@class="iNum"]/text()').extract_first()
            # status: records whether this question already has a best answer
            phase = True if element_li.xpath(
                'i[@class="iPhase"]/span[@class="icon_jj"]') else False
            #print("%s [%s]" % (url, num))
            if phase:
                if (url, phase) not in self.bf:
                    yield scrapy.Request(url, callback=self.parse_askcard)
                    self.bf.add((url, phase))
            else:
                if not num == '0' and (url, phase, num) not in self.bf:
                    yield scrapy.Request(url, callback=self.parse_askcard)
                    self.bf.add((url, phase, num))
        next_url = response.xpath(
            '//div[@class="pcauto_page"]/a[@class="next"]/@href'
        ).extract_first()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.parse_category)

    def parse_askcard(self, response):
        item = PcautoItem()
        item['url'] = response.url
        question_title = response.xpath(
            '//div[@id="question_content"]/div[@class="modInner"]/div[1]//text()'
        ).extract()
        item['question_title'] = parseContentList2Str(question_title)
        item['question'] = response.xpath(
            '//div[@class="modInner"]/p/text()').extract_first()
        ask_time = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sTime"]/text()'
        ).extract_first()
        item['ask_time'] = str2Timestamp(ask_time)
        user_name = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sName"]/a/text()'
        ).extract_first()
        user_url = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sName"]/a/@href'
        ).extract_first()
        item['ask_user'] = {'name': user_name, 'url': user_url}
        element_best_answer = response.xpath(
            '//div[@class="modAnswer modBest mt10"]//div[@class="tb"]')
        item['best_answer'] = self.parse_answer(
            element_best_answer[0]) if element_best_answer else None

        answer_list = list()
        for element in response.xpath(
                '//div[@class="modAnswer mt10 modOut"]/div[@class="modInner"]/div[@class!="th"]'
        ):
            answer_list.append(self.parse_answer(element))
        item['answer_list'] = answer_list
        item['answer_count'] = len(answer_list)
        yield item

    def parse_answer(self, element):
        answer = dict()
        answer['id'] = element.xpath('div[2]/@id').extract_first()
        user_icon = element.xpath('.//img/@src').extract_first()
        element_user = element.xpath('.//i[@class="blue"]') or element.xpath(
            './/div[@class="dTitle"]')
        user_name = element_user[0].xpath('a/text()').extract_first()
        user_url = element_user[0].xpath('a/@href').extract_first()
        answer['user'] = {
            'name': user_name,
            'url': user_url,
            'icon': user_icon
        }
        answer_time = ''.join(
            element.xpath('.//div[@class="gray"]/text()').extract())
        answer['answer_time'] = str2Timestamp(answer_time) or element.xpath(
            './/span[@class="sTime"]/text()').extract_first()
        answer['answer'] = element.xpath(
            './/div[@class="answerCon"]/p/text()').extract_first()
        return answer
Example #59
0
 def __init__(self):
     self.bf = BloomFilter.open('filter.bloom')
     self.f_write = open('jingdong.txt', 'w')
     self.si = SearchIndex()
     self.si.SearchInit()
Example #60
0
#!/usr/bin/env python
# coding:utf-8
# manning  2015-1-27
import time
import os
import urlparse
import hashlib
import sys
#sys.path.append("..")

#from config.config import *
#reload(sys)
#sys.setdefaultencoding("utf-8")
from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01)


def format(url):
    '''
    The strategy is to build a 3-tuple:
    the first item is the netloc of the url,
    the second item is the length of each segment of the path,
    the third item is the name of each query parameter (parameters are sorted
    alphabetically to avoid duplicates caused by different parameter orders).
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'

    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]