예제 #1
0
    def test_insert_then_test(self):
        """Index two CSV fields, then verify per-field filter membership."""
        expected_counts = {
            '/tmp/fake.csv.2.bfindex': 6,
            '/tmp/fake.csv.1.bfindex': 5,
        }
        outcome = create_index(
            '/tmp/fake.csv',  # input filename
            self.test_file,  # file-like object
            0.0001,  # error rate
            1,  # skip lines
            [1, 2],  # fields
            ',',  # delimiter
            False)  # recursive domain
        self.assertEqual(expected_counts, outcome)

        field1_filter = BloomFilter.fromfile(open('/tmp/fake.csv.1.bfindex', 'rb'))
        field2_filter = BloomFilter.fromfile(open('/tmp/fake.csv.2.bfindex', 'rb'))

        # The header row was skipped, so column names are absent.
        self.assertNotIn('FieldA', field1_filter)
        self.assertNotIn('FieldB', field2_filter)

        fruits = ('apple', 'banana', 'orange', 'pear', 'pineapple')
        vegetables = ('carrot', 'potato', 'leek', 'cauliflower', 'bean')

        # Field 1 values live only in filter 1 ...
        for word in fruits:
            self.assertIn(word, field1_filter)
            self.assertNotIn(word, field2_filter)

        # ... and field 2 values only in filter 2.
        for word in vegetables:
            self.assertIn(word, field2_filter)
            self.assertNotIn(word, field1_filter)
예제 #2
0
    def test_insert_then_test(self):
        """Index CSV fields 1 and 2, then verify per-field membership.

        The returned mapping pairs each generated ``.bfindex`` path with
        a count, and each field's values must appear only in that
        field's filter.
        """
        result = create_index(
            '/tmp/fake.csv',  # input filename
            self.test_file,   # file-like object
            0.0001,           # error rate
            1,                # skip lines
            [1, 2],           # fields
            ',',              # delimiter
            False)            # recursive domain
        self.assertEqual(
            {'/tmp/fake.csv.2.bfindex': 6,
             '/tmp/fake.csv.1.bfindex': 5},
            result)
        b1 = BloomFilter.fromfile(open('/tmp/fake.csv.1.bfindex', 'rb'))
        b2 = BloomFilter.fromfile(open('/tmp/fake.csv.2.bfindex', 'rb'))

        # One header line was skipped, so column names are not indexed.
        self.assertEqual(False, 'FieldA' in b1)
        self.assertEqual(False, 'FieldB' in b2)

        # Field-1 values appear only in the field-1 filter.
        for word in ('apple', 'banana', 'orange', 'pear', 'pineapple'):
            self.assertEqual(True, word in b1)
            self.assertEqual(False, word in b2)

        # Field-2 values appear only in the field-2 filter.
        for word in ('carrot', 'potato', 'leek', 'cauliflower', 'bean'):
            self.assertEqual(True, word in b2)
            self.assertEqual(False, word in b1)
예제 #3
0
def jaccard_ind(filename_1, filename_2):
    """Print the Jaccard index between two serialized Bloom filters.

    The index is computed over set bits: |A & B| / |A | B|, where A and
    B are the bit arrays of the filters loaded from `filename_1` and
    `filename_2`. Output format: "<file1> <file2> <index>".
    """
    # The `with` block closes both files; the original also called
    # close() redundantly afterwards and emitted a stray debug print.
    with open(filename_1, 'rb') as f_1, open(filename_2, 'rb') as f_2:
        b_1 = BloomFilter.fromfile(f_1)
        b_2 = BloomFilter.fromfile(f_2)
    b_inter = b_1.intersection(b_2)
    b_union = b_1.union(b_2)
    bits_inter = b_inter.bitarray.count(True)
    bits_union = b_union.bitarray.count(True)
    j_i = float(bits_inter) / float(bits_union)
    print("%s %s %f" % (filename_1, filename_2, j_i))
예제 #4
0
def jaccard_ind(filename_1, filename_2):
    """Print the Jaccard index between two serialized Bloom filters.

    The index is computed over set bits: |A & B| / |A | B| on the
    filters' bit arrays.
    """
    with open(filename_1, 'rb') as f_1:
        with open(filename_2, 'rb') as f_2:
            print(filename_1)
            b_1 = BloomFilter.fromfile(f_1)
            b_2 = BloomFilter.fromfile(f_2)
            b_inter = b_1.intersection(b_2)
            b_union = b_1.union(b_2)
            # Count set bits in the combined filters' bit arrays.
            bits_inter = b_inter.bitarray.count(True)
            bits_union = b_union.bitarray.count(True)
            j_i = float(bits_inter) / float(bits_union)
            #print("%s ~ %s, %f" % filename_1, filename_2, j_i)
            print("%s %s %f" % (filename_1, filename_2, j_i))
        # NOTE(review): redundant -- the `with` blocks above already
        # closed both files.
        f_2.close()
    f_1.close()
예제 #5
0
 def __init__(self):
     """Load the persisted Bloom filter, or start with a fresh one.

     Falls back to an empty filter (capacity 10M, 0.1% error rate) when
     FILTER_FILE cannot be opened.
     """
     try:
         # fromfile() reads raw serialized bytes, so the file must be
         # opened in binary mode ('rb'); the original's text-mode open
         # corrupts the read on Python 3 / Windows.
         with open(FILTER_FILE, 'rb') as f:
             self.f = BloomFilter.fromfile(f)
     except IOError:
         self.f = BloomFilter(capacity=10000000, error_rate=0.001)
     # Counter of processed items, starts at zero for every run.
     self.num = 0
예제 #6
0
파일: dbpedia.py 프로젝트: we1l1n/SQG
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        """Initialize the DBpedia wrapper and load its Bloom filters.

        Loads the one-hop filter when its file exists (else None), and
        for each flag in {True, False} a scalable two-hop filter from a
        'spo2<flag>' file when present (else a fresh LARGE_SET_GROWTH
        filter).
        """
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        if os.path.exists(one_hop_bloom_file):
            # fromfile() reads raw serialized bytes; open in binary mode
            # (the original's text-mode open breaks on Python 3).
            with open(one_hop_bloom_file, 'rb') as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path, 'rb') as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        # Tracks how many entries have been added since the last persist.
        self.two_hop_bloom_counter = 0
예제 #7
0
def main():
    """CLI (Python 2): report whether each given file name is present in
    the NSRL Bloom filter stored in ./nsrl.bloom."""
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v",
                        "--verbose",
                        help="Display verbose output message",
                        action="store_true",
                        required=False)
    parser.add_argument('name',
                        metavar='FILE',
                        type=str,
                        nargs='+',
                        help='a file name to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

        for file_name in args.name:
            if args.verbose:
                if file_name in bf:
                    print "File {} found in NSRL Database.".format(file_name)
                else:
                    print "File {} was NOT found in NSRL Database.".format(
                        file_name)
            else:
                # Non-verbose mode: bare True/False per queried name.
                print file_name in bf
    return
예제 #8
0
def update_bf(request):
    """Replace the stored Bloom filter of the proxy named in `request`.

    `request` is a newline-delimited message: element 1 of the split is
    the peer proxy's IP; elements 2 and 3 carry the serialized filter
    payload. The matching (ip, port, filter) tuple in the global
    `proxies` list is rewritten in place. (Python 2 code.)
    """
    global proxies
    temp_list = request.split('\n')
    #print temp_list
    updated_proxy = temp_list[1] # ip of other proxy
    # write the bloom filter to a file bf_upd
    temp_bf_recv = open('bf_upd', "w")
    temp_bf_recv.write(temp_list[2])
    temp_bf_recv.write('\n')
    temp_bf_recv.write(temp_list[3])
    temp_bf_recv.close()
    temp_bf_recv = open('bf_upd', 'r')
    # de-serialize the bloom filter
    temp_bf = BloomFilter.fromfile(temp_bf_recv)
    temp_bf_recv.close()

    if os.path.isfile('bf_upd'): # remove the file used to hold the bloom filter
        os.remove('bf_upd')

    # Scan for the proxy whose IP matches; `index` records its position.
    index = 0;
    for proxy in proxies:
        if proxy[0] == updated_proxy: # update only the proxy who's bloom filter is updated
            print "UPDATING BF OF PROXY: ", proxy[0]
            curr_port = proxy[1]
            break
        index = index + 1
    proxies[index] = (updated_proxy, curr_port, temp_bf) # update entry in the list
예제 #9
0
def main():
    """CLI (Python 2): report whether each given MD5 hash is present in
    the NSRL Bloom filter stored in ./nsrl.bloom."""
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v",
                        "--verbose",
                        help="Display verbose output message",
                        action="store_true",
                        required=False)
    parser.add_argument('hash',
                        metavar='MD5',
                        type=str,
                        nargs='+',
                        help='a md5 hash to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

        for hash_hex in args.hash:
            # The filter stores raw digests, so decode the hex first.
            hash = binascii.unhexlify(hash_hex)
            if args.verbose:
                if hash in bf:
                    print "Hash {} found in NSRL Database.".format(hash_hex)
                else:
                    print "Hash {} was NOT found in NSRL Database.".format(
                        hash_hex)
            else:
                # Non-verbose mode: bare True/False per queried hash.
                print hash in bf
    return
예제 #10
0
 def open_spider(self, spider):
     """Load the persisted 'mybloom.blm' filter if present, otherwise
     create a fresh filter (capacity 100000, 0.1% error rate).

     :param spider: the spider being opened (unused here).
     """
     brandName = 'mybloom'
     isexists = os.path.exists(brandName + '.blm')
     if isexists:
         # Context manager closes the handle after loading; the original
         # leaked the open file object.
         with open(brandName + '.blm', 'rb') as blm_file:
             self.bf = BloomFilter.fromfile(blm_file)
     else:
         self.bf = BloomFilter(100000, 0.001)
예제 #11
0
def start():
    """Crawl book-list pages from biqukan and spawn pooled downloads,
    tracking seen items in a Bloom filter persisted to `bf_file`.

    The filter is always written back in the `finally` clause, even when
    the crawl fails part-way.
    """
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)

    if os.path.exists(bf_file):
        LOG.info('bs from file')
        # Bloom filters are serialized bytes: open in binary mode and
        # close the handle (the original used text mode and leaked it).
        with open(bf_file, 'rb') as bloom_in:
            bf = BloomFilter.fromfile(bloom_in)
    else:
        LOG.info('init bs')
        bf = BloomFilter(500000)

    try:
        pool = Pool(size=pool_size)
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        book_urls += find_recommend_block(index, u'강力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        for i, url in enumerate(book_urls):
            pool.spawn(download_book, url, bf)
            # download_book(url, bf)
            LOG.info(u'开始下载%s本,剩余%s本', i + 1, book_num - i - 1)

        pool.join()
        LOG.info(u'下载完成')
    except Exception as e:
        LOG.exception(e)
    finally:
        # Persist the (possibly updated) filter in binary mode.
        with open(bf_file, 'wb') as bloom_out:
            bf.tofile(bloom_out)
예제 #12
0
파일: spider.py 프로젝트: ruxtain/spiders
 def __enter__(self):
     """Enter the context: return a Bloom filter, restored from
     `self.bloom_file` when one was persisted by a previous run,
     otherwise freshly created (capacity 10M, 0.1% error rate)."""
     if not os.path.exists(self.bloom_file):
         self.bloom = BloomFilter(capacity=10000000, error_rate=0.001)
     else:
         with open(self.bloom_file, 'rb') as handle:
             self.bloom = BloomFilter.fromfile(handle)
     return self.bloom
예제 #13
0
파일: compile_names.py 프로젝트: Glank/rdp
def test():
    """Interactive loop (Python 2): classify typed names as boy/girl
    names using two pre-built Bloom filters read from disk."""
    with open('blooms/boys', 'r') as f:
        boys = BloomFilter.fromfile(f)
    with open('blooms/girls', 'r') as f:
        girls = BloomFilter.fromfile(f)

    print "Enter a name:"
    while True:
        # Normalize input so lookups match however names were stored.
        name = raw_input().strip().lower()
        if name in boys and name not in girls:
            print "That is a boy's name."
        elif name not in boys and name in girls:
            print "That is a girl's name."
        elif name in boys and name in girls:
            print "That could be either a boy's or a girl's name."
        else:
            print "That doesn't look like a boy's or a girl's name."
예제 #14
0
def add_proxy(request, conn):
    """Register a new proxy from a serialized join request and broadcast
    the updated proxy list to every other proxy. (Python 2 code.)

    `request` is '\n'-delimited: element 1 is the new proxy's IP,
    element 2 its port, elements 3-4 the serialized Bloom filter.
    Filters travel through temp files because BloomFilter only
    (de)serializes via file objects.
    """
    global proxies
    recv_list = request.split('\n')
    #print "RECV LIST: ",recv_list
    new_proxy = recv_list[1]
    new_port = int(recv_list[2])
    temp_bf = open("temp_bf_rec", "w+") # write the bloom filter string to a file
    temp_bf.write(recv_list[3])
    temp_bf.write('\n');
    temp_bf.write(recv_list[4])
    temp_bf.close()
    temp_bf = open("temp_bf_rec", 'r')
    new_bf = BloomFilter.fromfile(temp_bf)     # de-serialize the bloom filter
    temp_bf.close()

    if os.path.isfile('temp_bf_rec'): # remove the temp file
        os.remove('temp_bf_rec')
    print "GOT BLOOM FILTER"

    proxies.append((new_proxy,new_port,new_bf))
    #print "NEW PROXIES: ",proxies
    # Re-serialize the full proxy list in the same wire format:
    # header, then ip / port / filter-bytes groups joined by '\n'.
    temp_list = ['NEW LIST OF PROXIES']
    for proxy in proxies:
        temp_list.append('\n')
        temp_list.append(proxy[0]) # add IP
        temp_list.append('\n')
        temp_list.append(str(proxy[1])) # add port
        temp_list.append('\n')
        temp_bf = open('temp_bf_send', "w") # serialize the bloom filter for sending
        proxy[2].tofile(temp_bf)
        temp_bf.close()
        temp_bf = open('temp_bf_send', "r")

        # create the a string so the bloom filter can be sent over socket connect
        temp2 = ''
        while 1:
            temp = temp_bf.read()
            #print "Reading from file: ", len(temp)
            if len(temp) > 0:
                temp2 = temp2 + temp
            else:
                break

        temp_bf.close()
        temp_list.append(temp2)  # add bloom filter

    if os.path.isfile('temp_bf_send'): # remove temp file
        os.remove('temp_bf_send')

    #print temp_list
    temp_string = ''.join(temp_list) # create a string representation of the list

    # Fan the new list out to every proxy except the bootstrap node.
    for proxy in proxies:
        if proxy[0] != bootstrap_proxy: #only bootstrapping proxy will do this
            temp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            temp_sock.connect((proxy[0],proxy[1])) # connect to each proxy and send the new list of proxies
            temp_sock.send(temp_string)
            temp_sock.close()
예제 #15
0
 def init_bloom_filter(self, spider_name):
     """Load the spider's persisted Bloom filter, or create a new one.

     :param spider_name: base name; the filter file is '<name>.bloom'.
     """
     self.bloom_file = '%s.bloom' % spider_name
     if os.path.exists(self.bloom_file):
         # Binary mode is required -- fromfile() reads raw serialized
         # bytes -- and the context manager closes the handle (the
         # original used text mode and leaked the file object).
         with open(self.bloom_file, 'rb') as bloom_in:
             self.bloom_filter = BloomFilter.fromfile(bloom_in)
     else:
         self.bloom_filter = \
             BloomFilter(capacity=100000000, error_rate=0.001)
예제 #16
0
 def open_spider(self, spider):
     """Load the persisted 'bloomfilter.blm' filter if present,
     otherwise start with an empty one.

     :param spider: the spider being opened (unused here).
     """
     file_name = 'bloomfilter'
     is_exist = os.path.exists(file_name + '.blm')
     if is_exist:
         # Context manager closes the handle; the original leaked it.
         with open('bloomfilter.blm', 'rb') as blm_file:
             self.bf = BloomFilter.fromfile(blm_file)
         print('open blm file success')
     else:
         self.bf = BloomFilter(100000, 0.001)
         print('didn\'t find the blm file')
예제 #17
0
 def __init__(self):
     """Load the four pre-built Bloom filters used by the joint linker;
     exit the process if any file cannot be read. (Python 2
     `except Exception,e` syntax.)"""
     print "Joint Linker initializing"
     try:
         f = open('../data/blooms/bloom1hoppredicate.pickle')
         self.bloom1hoppred = BloomFilter.fromfile(f)
         f.close()
         f = open('../data/blooms/bloom1hopentity.pickle')
         self.bloom1hopentity = BloomFilter.fromfile(f)
         f.close()
         f = open('../data/blooms/bloom2hoppredicate.pickle')
         self.bloom2hoppredicate = BloomFilter.fromfile(f)
         f.close()
         f = open('../data/blooms/bloom2hoptypeofentity.pickle')
         self.bloom2hoptypeofentity = BloomFilter.fromfile(f)
         f.close()
     except Exception,e:
         # The filters are mandatory: bail out rather than run without.
         print e
         sys.exit(1)
예제 #18
0
 def __init__(self, path):
     """Wrap a Bloom filter persisted at `path`, creating a fresh one
     (capacity 100000, 0.1% error rate) when the file does not exist.

     :param path: filesystem location of the serialized filter.
     """
     self.path = path
     self.rfile = None
     self.is_tofile = False
     if not os.path.isfile(path):
         self.bf = BloomFilter(100000, 0.001)
     else:
         # Binary mode: fromfile() reads raw serialized bytes (text-mode
         # 'r' breaks on Python 3). The handle is deliberately kept on
         # self.rfile, matching the original design.
         self.rfile = open(path, 'rb')
         self.bf = BloomFilter.fromfile(self.rfile)
예제 #19
0
파일: dbpedia.py 프로젝트: debayan/SQG
 def __init__(self, endpoint="http://sda-srv01.iai.uni-bonn.de:8164/sparql",
              one_hop_bloom_file="./data/blooms/spo1.bloom"):
     """Initialize the DBpedia endpoint wrapper and, when the file
     exists, load the one-hop Bloom filter (else leave it None).

     NOTE(review): the bloom file is opened in text mode, while
     fromfile() reads serialized bytes -- 'rb' is likely required on
     Python 3; confirm the target interpreter.
     """
     super(DBpedia, self).__init__(endpoint)
     self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
     if os.path.exists(one_hop_bloom_file):
         with open(one_hop_bloom_file) as bloom_file:
             self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
     else:
         # Callers must handle the None sentinel.
         self.one_hop_bloom = None
예제 #20
0
 def _fromfile_(self):
     """Restore the Bloom cache from '../out/filter'; on any failure
     fall back to a fresh empty filter. (Python 2 except syntax.)"""
     try:
         f= open('../out/filter', 'r')
         self.bloom_cache=BloomFilter.fromfile(f)
         self.count=self.bloom_cache.count
         f.close()
     except Exception, ex:
         print(Exception, ex)
         # Missing/corrupt file: start over with an empty filter.
         self.bloom_cache = BloomFilter(capacity=10000000, error_rate=0.00001)
         self.count=0
예제 #21
0
파일: bloomzip.py 프로젝트: stiege/bloomzip
    def __init__(self, name):
        """Create a BloomZip backed by the file `name`.

        When the file exists, its first 4 bytes are a big-endian length
        prefix followed by that many bytes of serialized Bloom filter.
        """
        super(BloomZip, self).__init__()
        # In-memory buffer for data written before compression.
        self.__data = StringIO()
        self._name = name
        self._bf = None

        if os.path.isfile(self._name):
            with open(self._name, 'rb') as f:
                # ">L": 4-byte big-endian unsigned filter length.
                length = struct.unpack(">L", f.read(4))[0]
                self._bf = BloomFilter.fromfile(f, length)
예제 #22
0
파일: config.py 프로젝트: zyq001/pycrawler
def loadBloomFromFile(fileName = bloomDumpCapsName):
    """Load a Bloom filter dump from `fileName`; return None on any
    error. (Python 2 print statements.)"""
    from pybloom import BloomFilter
    try:
        # NOTE(review): the file handle is never closed, and text mode
        # 'r' may mangle the serialized bytes -- verify on Python 3.
        bloom = BloomFilter.fromfile(open(fileName, 'r'))
    except IOError as er:
        print 'load bloom from file fail, return null', er
        return None
    except Exception as e:
        print 'load bloom from file got exception, return null', e
        return None
    return bloom
예제 #23
0
 def get_values_by_key_data(self, token, word_freq, offset, bloom_filter_dump_size):
     """Return (codes, prob_filter, word_freq) for `token`.

     Served from self.cache when possible; otherwise the pickled codes
     are read from self.values_file at `offset`, followed by an optional
     Bloom filter occupying `bloom_filter_dump_size` bytes.
     """
     if word_freq == None:
         # Unknown token: empty codes, no filter, zero frequency.
         return numpy.zeros(0), None, 0
     if token in self.cache:
         # Touch the LRU entry, then return its first three fields.
         self.update_cache(token)
         return self.cache[token][:3]
     self.values_file.seek(offset)
     codes = pickle.load(self.values_file)
     prob_filter = None
     if bloom_filter_dump_size:
         # The filter dump immediately follows the pickled codes.
         prob_filter = BloomFilter.fromfile(self.values_file, bloom_filter_dump_size)
     self.update_cache(token, (codes, prob_filter, word_freq, offset, bloom_filter_dump_size))
     return codes, prob_filter, word_freq
예제 #24
0
def read_redis_bf_from_file():
    """Return the Bloom filter persisted in 'bf_redis', creating and
    persisting a fresh one when the file cannot be read. (Python 2.)"""
    try:
        f_r = open('bf_redis', 'rb')
        bf_redis = BloomFilter.fromfile(f_r)
        f_r.close()
        print "file is exists,ok"
    # NOTE(review): bare except also swallows unrelated errors (e.g. a
    # corrupt dump) and silently replaces the filter.
    except:
        f_w = open('bf_redis', 'wb')
        bf_redis = BloomFilter(capacity=10000000, error_rate=0.001)
        bf_redis.tofile(f_w)
        f_w.close()
        print "file not exists,new"
    return bf_redis
예제 #25
0
 def __init__(self):
     """Set up the URL-dedup Bloom filter and the MongoDB sink.

     Loads a persisted filter named '<MONGODB_DB>.urls' when present,
     otherwise starts empty (capacity 1M, 0.1% error rate).
     """
     self.faillog = open('fail.txt', 'a')
     if os.path.exists(settings['MONGODB_DB'] + '.urls'):
         #print("SEEN FILE EXISTS!!!!!!!!!!!!!!!!!!!!!!!!!")
         self.bloomFilter = BloomFilter.fromfile(
             open(settings['MONGODB_DB'] + '.urls', 'r'))
     else:
         self.bloomFilter = BloomFilter(1000000, 0.001)
     connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                      settings['MONGODB_PORT'])
     db = connection[settings['MONGODB_DB']]
     self.collection = db[settings['MONGODB_COLLECTION']]
     # Unique url index blocks duplicate documents; the crawltime index
     # supports newest-first queries.
     self.collection.ensure_index('url', unique=True)
     self.collection.create_index([("crawltime", DESCENDING)])
예제 #26
0
 def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
     """Pick a Bloom filter backend by platform.

     On Windows ('nt') or when no cachefile is given, use the
     pure-Python `pybloom` package, loading from `cachefile` when
     self.cache() reports one exists. On POSIX use the `pybloomfilter`
     package, which opens the filter directly from its file.
     """
     self.cachefile = cachefile
     if os.name == 'nt' or not cachefile:
         from pybloom import BloomFilter
         if self.cache():
             with open(cachefile, 'r') as fp:
                 self.filter = BloomFilter.fromfile(fp)
         else:
             self.filter = BloomFilter(capacity=capacity, error_rate=error_rate)
     elif os.name == 'posix':
         from pybloomfilter import BloomFilter
         if self.cache():
             # pybloomfilter opens the existing filter file in place.
             self.filter = BloomFilter.open(self.cachefile)
         else:
             self.filter = BloomFilter(capacity, error_rate, cachefile)
예제 #27
0
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        instance = cls()
        # Restore the top-level parameters from the fixed-size header.
        header = f.read(calcsize(cls.FILE_FMT))
        instance._setup(*unpack(cls.FILE_FMT, header))
        # A little-endian int32 gives the number of stored sub-filters.
        nfilters, = unpack("<l", f.read(calcsize("<l")))
        if nfilters <= 0:
            instance.filters = []
        else:
            # A table of uint64 byte lengths precedes the sub-filter
            # payloads, which follow back-to-back in the same order.
            length_fmt = "<" + "Q" * nfilters
            lengths = unpack(length_fmt, f.read(calcsize(length_fmt)))
            for length in lengths:
                instance.filters.append(BloomFilter.fromfile(f, length))
        return instance
예제 #28
0
 def __init__(self):
     """Set up the URL-dedup Bloom filter, the MongoDB sink, and the
     url-to-name mapping.

     Loads a persisted filter named '<DB_DB>.urls' when present,
     otherwise starts empty (capacity 1M, 0.1% error rate).
     """
     self.faillog = open('fail.txt', 'a')
     #self.urls_seen = set()
     if os.path.exists(settings['DB_DB'] + '.urls'):
         self.bloomFilter = BloomFilter.fromfile(
             open(settings['DB_DB'] + '.urls', 'r'))
     else:
         self.bloomFilter = BloomFilter(1000000, 0.001)
     connection = pymongo.MongoClient(settings['DB_SERVER'],
                                      settings['DB_PORT'])
     db = connection[settings['DB_DB']]
     self.collection = db[settings['DB_COLLECTION']]
     # Unique url index blocks duplicate documents; the crawltime index
     # supports newest-first queries.
     self.collection.ensure_index('url', unique=True)
     self.collection.create_index([("crawltime", DESCENDING)])
     self.url2name = self.loadDict(connection)
예제 #29
0
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        # Restore top-level parameters from the fixed-size header.
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        # '<l': little-endian int32 count of stored sub-filters.
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            # A table of uint64 byte lengths precedes the sub-filter
            # payloads, which follow back-to-back in the same order.
            header_fmt = '<' + 'Q' * nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter
예제 #30
0
def waitForBloom(from_ip, print_labels=False, frame=None, print_start=0):
    """Listen on port 10000 of `from_ip`, receive a serialized Bloom
    filter over TCP, and return the deserialized filter. (Python 2.)

    The payload is staged through a temporary file 'bloomFileIn' which
    is removed before returning. When `print_labels` is set, a status
    label is added to the given Tk `frame` at row `print_start`.
    """
    # Create file for bloom filter import
    f = open('bloomFileIn', 'wb')

    # Connect to IP address
    host = from_ip
    port = 10000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Keep trying until a connection is made
    while True:
        try:
            s.bind((host, port))
        except socket.error:
            # Port still busy (e.g. TIME_WAIT); retry the bind.
            pass
        else:
            break

    s.listen(1)
    print "Waiting for data..."
    conn, addr = s.accept()
    print "Connection from " + addr[0]

    # Receive bloom filter in increments
    data = conn.recv(1024)
    size = sys.getsizeof(data)
    while (data):
        f.write(data)
        data = conn.recv(1024)
        size += sys.getsizeof(data)

    print "Received " + str(size / 1000) + " KB"

    # Print to GUI
    if print_labels:
        ttk.Label(frame,
                  text=("Received bloom filter (" + str(size / 1000) +
                        " KB)")).grid(row=print_start, column=0)

    # Cleanup & bloom filter creation
    f.close()
    f = open('bloomFileIn', 'rb')
    bloom = BloomFilter.fromfile(f)
    f.close()
    conn.close()
    s.close()
    os.remove('bloomFileIn')
    return bloom
예제 #31
0
 def __init__(self):
     """Connect to Redis and restore (or create) the URL bloomfilter.
     (Python 2 code.)"""
     self.redis = connects.RedisConnect(**config.redis_config)
     #self.p = self.redis.r.pipeline()
     try:
         print u'正在初始化,请稍后.....'
         f = open('/home/hujun/bloomfilter.txt')  # try to open the file that persists the bloomfilter
     except IOError:
         print 'create a new bloomfilter without file'
         # Open failed, so the file does not exist: build a fresh bloomfilter
         self.bloomfilter = BloomFilter(capacity=1000000,
                                        error_rate=0.00001)
         # After initializing, seed the filter with the first URL to crawl
         self.bloomfilter.add(config.first_url)
     else:
         print 'reload bloomfilter from a file'
         self.bloomfilter = BloomFilter.fromfile(f)
예제 #32
0
    def test_recursive_domains(self):
        """Index field 3 with recursive-domain expansion and verify all
        parent domains were added to the filter."""
        outcome = create_index(
            '/tmp/fake.csv',  # input filename
            self.test_file,  # file-like object
            0.0001,  # error rate
            1,  # skip lines
            [3],  # fields
            ',',  # delimiter
            True)  # recursive domain
        self.assertEqual({'/tmp/fake.csv.3.bfindex': 9}, outcome)

        domain_filter = BloomFilter.fromfile(
            open('/tmp/fake.csv.3.bfindex', 'rb'))

        expected_domains = (
            'subdomain.yahoo.com', 'yahoo.com', 'com',
            'example.domain.com', 'domain.com', 'www.google.co.uk',
            'google.co.uk', 'co.uk', 'uk')
        for domain in expected_domains:
            self.assertIn(domain, domain_filter)
예제 #33
0
def waitForBloom(from_ip, print_labels=False, frame=None, print_start=0):
	"""Listen on port 10000 of `from_ip`, receive a serialized Bloom
	filter over TCP, and return the deserialized filter. (Python 2;
	payload staged through temp file 'bloomFileIn', removed on exit.)"""
	# Create file for bloom filter import
	f = open('bloomFileIn', 'wb')

	# Connect to IP address
	host = from_ip
	port = 10000
	s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

	# Keep trying until a connection is made
	while True:
		try:
			s.bind((host, port))
		except socket.error:
			pass
		else:
			break

	s.listen(1)
	print "Waiting for data..."
	conn, addr = s.accept()
	print "Connection from " + addr[0]

	# Receive bloom filter in increments
	data = conn.recv(1024)
	size = sys.getsizeof(data)
	while(data):
		f.write(data)
		data = conn.recv(1024)
		size += sys.getsizeof(data)

	print "Received " + str(size/1000) + " KB"

	# Print to GUI
	if print_labels:
		ttk.Label(frame, text=("Received bloom filter (" + str(size/1000) + " KB)")).grid(row=print_start,column=0)

	# Cleanup & bloom filter creation
	f.close()
	f = open('bloomFileIn', 'rb')
	bloom = BloomFilter.fromfile(f)
	f.close()
	conn.close()
	s.close()
	os.remove('bloomFileIn')
	return bloom
예제 #34
0
  def fromfiles( cls, name, bufferSize=1024, dataDir="" ):
    """Rebuild a factory from on-disk "<name>-*.bloom" / "<name>-*.data"
    file pairs; return None when no bloom files are found.

    NOTE(review): glob results already include dataDir, so the second
    os.path.join(dataDir, b) double-prefixes the path whenever dataDir
    is non-empty -- confirm against callers.
    """
    bloomFiles = glob.glob( os.path.join( dataDir, "%s-*.bloom" % name ) )
    dataFiles = glob.glob( os.path.join( dataDir, "%s-*.data" % name ) )

    if( len( bloomFiles ) == 0 ):
      return None

    factory = cls( name, bufferSize=bufferSize, dataDir=dataDir )

    for i in range( len( dataFiles ) ):
      b, d = bloomFiles[i], dataFiles[i]
      # Box name: filename stem without extension or directory prefix.
      box = GadgetBox( b.split( "." )[0].split( os.sep )[-1], bufferSize, dataDir=dataDir )
      with open( os.path.join( dataDir, b ), "rb" ) as f:
        box.filter = BloomFilter.fromfile( f )

      factory.boxes.append( box )

    return factory
예제 #35
0
def main():
    """CLI (Python 2): report whether each given file name is present in
    the NSRL Bloom filter stored in ./nsrl.bloom."""
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
    parser.add_argument('name', metavar='FILE', type=str, nargs='+', help='a file name to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

        for file_name in args.name:
            if args.verbose:
                if file_name in bf:
                    print "File {} found in NSRL Database.".format(file_name)
                else:
                    print "File {} was NOT found in NSRL Database.".format(file_name)
            else:
                # Non-verbose mode: bare True/False per queried name.
                print file_name in bf
    return
예제 #36
0
    def test_recursive_domains(self):
        """Index field 3 with recursive-domain expansion enabled and
        verify every parent domain was added to the filter."""
        result = create_index(
            '/tmp/fake.csv',  # input filename
            self.test_file,   # file-like object
            0.0001,           # error rate
            1,                # skip lines
            [3],              # fields
            ',',              # delimiter
            True)             # recursive domain
        self.assertEqual(
            {'/tmp/fake.csv.3.bfindex': 9},
            result)

        b = BloomFilter.fromfile(open('/tmp/fake.csv.3.bfindex', 'rb'))

        # Each input domain plus all of its parent domains must match.
        for word in ('subdomain.yahoo.com', 'yahoo.com', 'com',
                     'example.domain.com', 'domain.com', 'www.google.co.uk',
                     'google.co.uk', 'co.uk', 'uk'):
            self.assertEqual(True, word in b)
예제 #37
0
파일: search.py 프로젝트: kost/docker-kf
def main():
    """CLI (Python 2): report whether each given MD5 hash is present in
    the NSRL Bloom filter stored in ./nsrl.bloom.

    NOTE(review): the verbose messages format the raw digest (`hash`)
    rather than the hex string (`hash_hex`), printing binary bytes.
    """
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
    parser.add_argument('hash', metavar='MD5', type=str, nargs='+', help='a md5 hash to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

        for hash_hex in args.hash:
            # The filter stores raw digests, so decode the hex first.
            hash = binascii.unhexlify(hash_hex)
            if args.verbose:
                if hash in bf:
                    print "Hash {} found in NSRL Database.".format(hash)
                else:
                    print "Hash {} was NOT found in NSRL Database.".format(hash)
            else:
                print hash in bf
    return
예제 #38
0
def get_accurate_case(sql_result,blame_line,func_case_list):
    """Append to `func_case_list` every CASE_ID whose stored Bloom
    filter contains this function's hash, and return the list.

    :param sql_result: row (hash, func name, file path, start, end).
    :param blame_line: line passed through to analyze_git_blame.
    :param func_case_list: accumulator list, mutated and returned.

    Side effects: matching ids are also appended to the global
    `global_accurate_case_list`, and analyze_git_blame runs per match.
    """
    import os
    hash_code = sql_result[0]
    func_name = sql_result[1]
    file_path = sql_result[2]
    func_start_line = sql_result[3]
    func_end_line = sql_result[4]
    sql = "select CASE_ID,BLOOMFILTER from CASE_VERSION"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        case_id = row[0]
        BloomFilter_file = row[1]
        if os.path.exists(BloomFilter_file):
            # Close each filter file promptly; the original leaked one
            # open handle per matching row.
            with open(BloomFilter_file, "rb") as bf_file:
                bf = BloomFilter.fromfile(bf_file)
            if hash_code in bf:
                global_accurate_case_list.append(case_id)
                analyze_git_blame(file_path,func_name,blame_line)
                func_case_list.append(case_id)
    return func_case_list
예제 #39
0
def get_accurate_case(sql_result, blame_line, func_case_list):
    """Append to `func_case_list` every CASE_ID whose stored Bloom
    filter contains this function's hash, and return the list.

    `sql_result` row layout: (hash, func name, file path, start line,
    end line). Matching ids are also appended to the global
    `global_accurate_case_list`, and analyze_git_blame runs per match.
    """
    import os
    hash_code = sql_result[0]
    func_name = sql_result[1]
    file_path = sql_result[2]
    func_start_line = sql_result[3]
    func_end_line = sql_result[4]
    sql = "select CASE_ID,BLOOMFILTER from CASE_VERSION"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        case_id = row[0]
        BloomFilter_file = row[1]
        if os.path.exists(BloomFilter_file):
            # NOTE(review): the filter file handle is never closed.
            bf = BloomFilter.fromfile(open(BloomFilter_file, "rb"))
            if hash_code in bf:
                global_accurate_case_list.append(case_id)
                analyze_git_blame(file_path, func_name, blame_line)
                func_case_list.append(case_id)
    return func_case_list
예제 #40
0
def convert_string_to_list(temp_buff):
    """Rebuild the global `proxies` list from a serialized message.
    (Python 2 code.)

    The message is '\\n'-joined groups of four fields per proxy: ip,
    port, then two lines of serialized Bloom filter, starting at
    element 1 of the split.
    """
    global proxies
    temp_list = temp_buff.split('\n') # break up the string at each \n
    proxies = []
    size = len(temp_list)

    # create a list of proxies with the special tuple
    i = 1;
    while i < size:
        temp_bf_read = open('bf_read', "w") # create new file bf_read
        temp_bf_read.write(temp_list[i+2])  # write the bloom filter to the file
        temp_bf_read.write('\n')
        temp_bf_read.write(temp_list[i+3])
        temp_bf_read.close()
        temp_bf_read = open('bf_read', "r")
        temp_bf = BloomFilter.fromfile(temp_bf_read) # de-serialize the bloom filter
        # add the tuple to the list of proxies
        proxies.append((temp_list[i], int(temp_list[i+1]), temp_bf))
        # Advance to the next ip/port/filter group.
        i = i + 4

        if os.path.isfile('bf_read'): # remove the file used to hold the bloom filter
            os.remove('bf_read')
예제 #41
0
def test():
    """Interactive loop (Python 2): test typed names against three
    membership structures built from the same name corpus."""
    with open('blooms/uni_names', 'r') as f:
        uni_names = BloomFilter.fromfile(f)
    with open('ngpols/uni_names', 'r') as f:
        uni_names_2 = NGPOLFilter.fromfile(f)
    with open('ngpols/uni_names_cluster', 'r') as f:
        uni_names_3 = NGClusterFilter.fromfile(f)
    print "Enter a name:"
    def testname(name, filt):
        # Shared membership check used for all three filter types.
        if name in filt:
            print "Yup, that name was in the owl doc."
        else:
            print "That name wasn't in the owl doc."
    while True:
        name = raw_input().strip().lower()
        print "Bloom:"
        testname(name, uni_names)
        print "NGOPL:"
        testname(name, uni_names_2)
        print "NGCluster:"
        testname(name, uni_names_3)
        print
예제 #42
0
def main():
    """Download new images from a fixed Tieba board, using a persisted
    Bloom filter to skip already-fetched posts. (Python 2 code.)"""
    #url =raw_input('input the tieba you like')
    #the tieba url you want to download the img ,pay attention to the format,'/f?kw=XXXXXXXXX'
    url = 'http://tieba.baidu.com/f?kw=轴心国画室'
    if not os.path.exists('%s' % (url[28:])):
        # First run: create the working directories and seed the record.
        os.makedirs('%s' % (url[28:]))
        os.makedirs('./%s/check' % (url[28:]))
        record(url)
    else:
        numlist = getallnumlist(url)
        #ues the bloomfilter first time create and find the tie that haven't download
        with open('./%s/check/bloomfilter' % (url[28:]), 'rb') as b:
            bloomfilter = BloomFilter.fromfile(b)
        img_no_download = []
        for i in numlist:
            # NOTE(review): `+=` with a string extends the list with its
            # characters; append(i) may be intended -- confirm.
            if not bloomfilter.add(i):
                img_no_download += i

        if not img_no_download:
            print 'nothing update'
        else:
            multiprocessdownload(img_no_download)
예제 #43
0
def test():
    """Interactive loop (Python 2): test typed titles against three
    membership structures built from the actor-name corpus."""
    with open('blooms/actor_names', 'rb') as f:
        bloom = BloomFilter.fromfile(f)
    with open('ngpols/actor_names', 'rb') as f:
        ngpol = NGPOLFilter.fromfile(f)
    with open('probsets/actor_names', 'rb') as f:
        probset = NgramProbSet.fromfile(f)
    print "Enter a title:"
    def testname(name, filt):
        # Shared membership check; probability sets also report a score.
        if name in filt:
            print "Yup, that is an actor's name."
            if isinstance(filt, ProbabilitySet):
                print filt.getProbability(name)
        else:
            print "That wasn't an actor's name."
    while True:
        # Normalize input: uppercase alphanumerics only.
        name = ''.join(re.findall('[A-Z0-9]+',raw_input().upper()))
        print "Bloom:"
        testname(name, bloom)
        print "NGOPL:"
        testname(name, ngpol)
        print "NgramProbSet:"
        testname(name, probset)
예제 #44
0
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)

    if os.path.exists(bf_file):
        LOG.info('bs from file')
        bf = BloomFilter.fromfile(open(bf_file, 'r'))
    else:
        LOG.info('init bs')
        bf = BloomFilter(500000)

    try:
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_urls += find_wanben()
        book_num = len(book_urls)
        start = time.time()
        for i, url in enumerate(book_urls[:10]):
            download_book(url, bf)
            LOG.info(u'已经下载%s本,剩余%s本', i+1, book_num - i -1)
            # time.sleep(30)

        print '%s' % (time.time() - start)
        LOG.info(u'下载完成')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'w'))
def main():
    #url =raw_input('input the tieba you like')
    #the tieba url you want to download the img ,pay attention to the format,'/f?kw=XXXXXXXXX'
    url ='http://tieba.baidu.com/f?kw=轴心国画室'
    if not os.path.exists('%s'  %(url[28:])  ):
        os.makedirs('%s' %(url[28:])  )
        os.makedirs('./%s/check' %(url[28:])  )
        record(url)
    else:
        numlist =getallnumlist(url)        
        #ues the bloomfilter first time create and find the tie that haven't download
        with open('./%s/check/bloomfilter' %(url[28:]) ,'rb') as b:
            bloomfilter =BloomFilter.fromfile(b)
        img_no_download =[]
        for i in numlist:
            if not bloomfilter.add(i):
                img_no_download += i



        if not img_no_download:
            print 'nothing update'
        else:
            multiprocessdownload(img_no_download)
예제 #46
0
파일: actor_example.py 프로젝트: Glank/rdp
#!/usr/bin/env python 
# Example script: build a grammar whose terminals are backed by
# probabilistic inclusion sets (bloom / n-gram filters) of actor and
# film names deserialized from disk.
from rdp import *
from ngrams import *
from pybloom import BloomFilter
from edits import *

#initialize actor inclusion set
with open('blooms/actor_names', 'rb') as f:
    actor_bloom = BloomFilter.fromfile(f)
with open('ngpols/actor_names', 'rb') as f:
    actor_ngpol = NGPOLFilter.fromfile(f)
# Fuzzy wrapper around the bloom set; presumably BloomFSS(filter,
# max_edits) allows 1 edit of slack — TODO confirm against edits.py.
actor_fuzzy = BloomFSS(actor_bloom, 1)
# A name is accepted if either the n-gram filter or the fuzzy bloom
# set accepts it.
actor_filt = OrSet([actor_ngpol, actor_fuzzy])

#initialize movie name inclusion set
with open('ngpols/film_titles', 'rb') as f:
    titles_filt = NGClusterFilter.fromfile(f)

#setup the grammar
S = Symbol('S')

# Terminal matching several forms of "to be"/"does".
is_ = InclusionSetTerminal(
    'IS',
    set(['BE','IS','WAS','WERE','ARE','DOES'])
);
will_ = InclusionSetTerminal(
    'WILL',
    set(['WILL', 'ARE'])
);

# Actor-name terminal: matches up to 3 consecutive words against the
# combined actor inclusion set.
actor = InclusionSetTerminal('actor', actor_filt , max_words=3)
예제 #47
0
class UidQueue():
    """
    Uid queue, include queue and bloom filter
    """
    def __init__(self, max_count=200000, error_rate=0.001):
        """
        Initialize
        @param max_count: capacity of bloom filter
        @param error_rate: error_rate of bloom filter
        @return: None
        """
        self.queue   = Queue()
        self.bloom   = BloomFilter(capacity=max_count, error_rate=error_rate)
        self.crawled = 0
    
    @staticmethod
    def _remove_duplicate(list_in):
        """
        remove duplicated item in list
        @param list_in: list
        @return: None
        """
        return list(set(list_in))
    
    def dump(self, path, encoding):
        """
        Dump data to file
        @param path: path prefix
        @param encoding: file encoding
        @return: None 
        """
        try:
            print "Saving ... "
            with codecs.open(path+'-queue.bak', 'wb', encoding) as wf:
                tmp = {'queue': list(set(list(self.queue.queue))), 'count': self.crawled}
                json.dump(tmp, wf)
            with codecs.open(path+'-bloom.bak', 'wb') as wf:
                self.bloom.tofile(wf)
        except Exception as e:
            print "Dump Uid Queue Failed"
            print e
    
    def restore(self, path, encoding):
        """
        Restore data from file
        @param path: path prefix
        @param encoding: file encoding
        @return: None 
        """
        try:
            with codecs.open(path+'-bloom.bak', 'rb') as rf:
                self.bloom.fromfile(rf)
            with codecs.open(path+'-queue.bak', 'rb', encoding) as rf:
                tmp = json.load(rf)
                [self.queue.put(uid) for uid in tmp['queue']]
                self.crawled = tmp['count']       
            # set encoding=utf-8 is wrong, only deal with ascii
            
        except Exception as e:
            print "Restore Uid Queue Failed: ", e
    
    def _put_all(self, list_in, block, timeout):
        """
        Put all item in list to queue while item not in bloom
        @param list_in: Item from
        @param block: Is block for put open
        @param timeout: Timeout of put
        @return: None
        """
        [self.queue.put(uid, block, timeout) for uid in list_in]
    
    def extend(self, container, block=True, timeout=3):
        """
        Extend uid Queue, remove duplicated and put all in container to queue
        @param container: Where items in
        @param block: Is block for put open
        @param timeout: Timeout of put
        @return: Error return WRONG_TYPE
        """
        # TODO: seem like can not remove duplicate using set
        tmp = []
        for uid in container:
            if uid not in self.bloom and uid not in self.queue.queue:
                tmp.append(uid)
        self._put_all(list_in=tmp, block=block, timeout=timeout)
        
    
    def get(self, block=True, timeout=3):
        """
        Get uid from queue, and add it into bloom filter 
        @param block: Is block for put open
        @param timeout: Timeout of put
        @return: uid
        """
        uid = self.queue.get(block=block, timeout=timeout)
        self.bloom.add(uid)
        self.crawled += 1
        return uid
    
    def __len__(self):
        """
        Length of uid queue
        """
        return self.queue.qsize()
예제 #48
0
def load_bloomfilter(flname):
    """Deserialize and return the BloomFilter stored at *flname*.

    BUG FIX: the dump is raw bytes, so the file must be opened in
    binary mode ('rb'); the default text mode corrupts the payload on
    some platforms.
    """
    with open(flname, 'rb') as fl:
        bfilter = BloomFilter.fromfile(fl)
    return bfilter
예제 #49
0
from django.shortcuts import render
from django.http import HttpResponse
from django.template import loader
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from kafka import KafkaProducer
import random
import redis 

from pybloom import BloomFilter
bloom_filter = BloomFilter(capacity=1000, error_rate=0.001)
try :
    with open("bloom","r") as bloom_file:
        bloom_filter = bloom_filter.fromfile(bloom_file)
except :
    print "File not found"
_redis = redis.StrictRedis(password='******')
producer = KafkaProducer(bootstrap_servers='localhost:9092')

def validate_url(request):
    """Return JSON reporting whether the submitted long url is taken."""
    submitted = request.GET.get('long_url', None)
    # Only 'google.com' counts as taken in this demo endpoint.
    return JsonResponse({'is_taken': submitted == 'google.com'})

def manage_url(request, short_url):

    # check in bloom filter
    global bloom_filter
    try :
예제 #50
0
def main():
    """Command-line lookup of hashes against the NSRL bloom filter.

    Reads metadata from /nsrl/nsrl.conf, takes hashes from argv or
    stdin, and prints hit/miss status per hash, honoring the
    -0/--no-hits, -1/--no-misses and -v/--verbose output flags.
    """
    default_config_file = '/nsrl/nsrl.conf'
    config = configparser.ConfigParser()
    config.read(default_config_file)
    #add commandline options
    hash_type = config.get('config', 'hashfile_type')

    parser = argparse.ArgumentParser(prog='nsrl')
    parser.add_argument("-v",
                        "--verbose",
                        help="Display verbose output message",
                        action="store_true",
                        required=False)
    parser.add_argument("-0",
                        "--no-hits",
                        help="Suppress Output of matching hashes",
                        action="store_true",
                        required=False)
    parser.add_argument("-1",
                        "--no-misses",
                        help="Suppress Output of mismatching hashes",
                        action="store_true",
                        required=False)
    # Hashes come either as positionals or from stdin, never both.
    inputs = parser.add_mutually_exclusive_group(required=True)
    inputs.add_argument('hash',
                        metavar='<{}>'.format(hash_type),
                        type=str,
                        nargs='*',
                        default=[],
                        help='{} hash to search for.'.format(hash_type))
    inputs.add_argument('-s',
                        '--stdin',
                        help="Read hashes from stdin",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        print("Version INFO: {}".format(config.get('config', "rds_version")))
        print("Error Rate: {}".format(config.get('config', "error_rate")))
        print("Build Date: {}".format(config.get('config', "build_date")))
        print("Filename: {}".format(config.get('config', "hashfile_name")))
        print("Hashcount: {}".format(config.get('config', "hash_count")))

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

        if args.stdin:
            # Renamed loop variable from `hash`, which shadowed the builtin.
            hashlist = [line.strip() for line in sys.stdin.readlines()]
        else:
            hashlist = args.hash
        for hash_hex in hashlist:
            # The filter stores raw digest bytes, not hex strings.
            digest = binascii.unhexlify(hash_hex)
            output = ""

            # Print a line only when the hit/miss status is not suppressed
            # by the corresponding flag.
            hash_is_a_match = (digest in bf)
            if (hash_is_a_match
                    and not args.no_hits) or (not hash_is_a_match
                                              and not args.no_misses):
                #output
                if args.verbose:
                    output = "{}:{}".format(hash_hex, hash_is_a_match)
                elif args.no_hits != args.no_misses:
                    # Only one class is shown, so the hash alone is enough.
                    output = "{}".format(hash_hex)
                else:
                    output = "{}:{}".format("+" if hash_is_a_match else "-",
                                            hash_hex)
                print(output)
    return
예제 #51
0
def get_bf_by_case(case_file_path):
    """Load the bloom filter recorded for a test-case file.

    Looks up the filter's filename in CASE_VERSION by path and
    deserializes it from disk.
    """
    # SECURITY FIX: the path was concatenated straight into the SQL text
    # (and without quotes, which could not even parse as SQL); bind it
    # as a parameter instead.
    # NOTE(review): '?' is the sqlite3 paramstyle — adjust to '%s' if
    # this cursor comes from a driver with a different paramstyle.
    sql = "select BLOOMFILTER from CASE_VERSION where PATH=?"
    cursor.execute(sql, (case_file_path,))
    row = cursor.fetchone()
    # NOTE(review): DB-API fetchone() returns a row tuple; the old code
    # passed the row object itself to open() — confirm against the driver.
    bf_filename = row[0]
    with open(bf_filename, "rb") as f:
        return BloomFilter.fromfile(f)
예제 #52
0
파일: edits.py 프로젝트: Glank/rdp
 def fromfile(f):
     """Deserialize a BloomFFS from the open binary file *f*.

     Reads max_edits and alphabet (pickled) followed by the embedded
     BloomFilter — presumably the order the matching tofile() writes
     them in; TODO confirm.
     """
     inst = BloomFFS(None, 0)
     inst.max_edits = pickle.load(f)
     inst.alphabet = pickle.load(f)
     inst.bloom = BloomFilter.fromfile(f)
     # BUG FIX: the constructed instance was never returned, so every
     # caller of this alternate constructor got None.
     return inst
예제 #53
0
파일: ikantxt2.py 프로젝트: Gzure/spider
features = 'lxml'

url = 'https://www.biqukan.com'
wan_ben_url = 'https://www.biqukan.com/wanben'
bf_file = 'ikantxt2'
base_dir = u'/downloads/小说'
# base_dir = u'小说'
# Matches chapter headings up to and including the "正文卷" (main text) marker.
content_f = re.compile(u'.*正文卷')

# Fetch and parse the site's index page once at import time.
req = requests.get(url=url, timeout=10)
html = req.text
index = BeautifulSoup(html, features=features)

if os.path.exists(bf_file):
    LOG.info('bs from file')
    # BUG FIX: the serialized bloom filter is binary — open with 'rb'
    # and close the handle instead of leaking it.
    with open(bf_file, 'rb') as f:
        bf = BloomFilter.fromfile(f)
else:
    LOG.info('init bs')
    bf = BloomFilter(500000)


def find_title(name):
    """Return the <h2> element of the parsed index page whose text is *name*."""
    heading = index.find('h2', text=name)
    return heading


def find_container(name):
    """Return the element immediately following the <h2> whose text is *name*."""
    heading = index.find('h2', text=name)
    return heading.find_next()


def find_wanben():
    book_urls = []
예제 #54
0
def _load_filter():
    """Deserialize and return the bloom filter persisted at _BLOOM_DUMP.

    BUG FIX: open in binary mode ('rb') — the dump is raw bytes — and
    close the handle via `with` instead of leaking it.
    """
    with open(_BLOOM_DUMP, 'rb') as f:
        return BloomFilter.fromfile(f)
예제 #55
0
def get_bf_by_case(case_file_path):
    """Load the bloom filter recorded for a test-case file.

    Looks up the filter's filename in CASE_VERSION by path and
    deserializes it from disk.
    """
    # SECURITY FIX: the path was concatenated straight into the SQL text
    # (and without quotes, which could not even parse as SQL); bind it
    # as a parameter instead.
    # NOTE(review): '?' is the sqlite3 paramstyle — adjust to '%s' if
    # this cursor comes from a driver with a different paramstyle.
    sql = "select BLOOMFILTER from CASE_VERSION where PATH=?"
    cursor.execute(sql, (case_file_path,))
    row = cursor.fetchone()
    # NOTE(review): DB-API fetchone() returns a row tuple; the old code
    # passed the row object itself to open() — confirm against the driver.
    bf_filename = row[0]
    with open(bf_filename, "rb") as f:
        return BloomFilter.fromfile(f)
예제 #56
0
파일: urlset.py 프로젝트: fikgol/vulcan
#! /usr/bin/env python
import os 
import sys
from pybloom import BloomFilter
from unittest import TestSuite
# Check the last tab-separated field of every *.log record in the
# directory given as argv[1] against the bloom filter serialized in
# ./oom, printing records the filter does not contain.
# NOTE(review): indentation below mixes 2 spaces and tabs — legal in
# Python 2 only; do not run under Python 3 as-is.
if __name__=="__main__":
  ser=open("oom","rb")
  ser.seek(0)
  oom=BloomFilter.fromfile(ser)
  for f in os.listdir(sys.argv[1]):
	if f.find(".log") == -1:
		continue
	for line in open(os.path.join(sys.argv[1],f)):
		ss=line.strip().split("\t")
		# Print records whose last field is missing from the filter.
		if not (ss[len(ss)-1] in oom):
			print ss[len(ss)-1]


# This module-level exit always runs on import/execution, so everything
# below it is dead code — presumably the original filter-building pass,
# kept for reference.
sys.exit(0)

oom = BloomFilter(capacity=1000*1000*200,error_rate=0.0001)
for f in os.listdir(sys.argv[1]):
	if f.find(".log") == -1:
		continue
	for line in open(os.path.join(sys.argv[1],f)):
		ss=line.strip().split("\t")
		oom.add(ss[len(ss)-1])
		# Sanity check: an item just added should always test present.
		if not (ss[len(ss)-1] in oom ):
			print ss[len(ss)-1]
ser=open("oom","wb")
oom.tofile(ser)