Example #1
def all_surface(
    uri_pagetitle_file: Path, surfaceformscore_file: Path, redirect_file: Path = None,
):
    """Collect all surfaceforms, print as (URI, surfaceform-json) TSV

    Args:
        uri_pagetitle_file: TSV of (uri, page title) pairs
        surfaceformscore_file: surfaceFormsScore file
        redirect_file: Redirect file
    """
    import sys, tqdm, json

    assert uri_pagetitle_file.exists()
    assert surfaceformscore_file.exists()

    def load_synonyms(lines: typing.Collection[str]):
        import urllib.parse as ul

        for line in lines:
            line = ul.unquote_plus(line.strip())
            try:
                a, b = line.split("\t", 1)
                a, b = a.strip(), b.strip()
                if a and b and (a != b):
                    yield b, a
                    yield a, b
            except:
                pass

    syn = {}
    check_syn: typing.Container = ()
    if redirect_file:
        print("Loading synonyms from", redirect_file, file=sys.stderr)
        t = get_file_lines(redirect_file)
        syn = dict(load_synonyms(tqdm.tqdm(redirect_file.open(), total=t)))

        try:
            from pybloomfilter import BloomFilter # type: ignore

            print(f"Making Bloom filter", file=sys.stderr)
            bf = BloomFilter(len(syn), 0.1, "/tmp/filter.bloom")
            bf.update(syn)
            check_syn = bf
        except:
            check_syn = syn

    print(f"Using {len(syn)} synonyms", file=sys.stderr)

    def get_synonyms(s, path=()):
        if s and (s not in path):
            yield s
            if s in check_syn:
                yield from get_synonyms(syn.get(s), path + (s,))

    ent_surface_scores: typing.Dict = {}
    if surfaceformscore_file:
        import urllib.parse as ul

        t = get_file_lines(surfaceformscore_file)
        print(f"Loading surface forms from {surfaceformscore_file}", file=sys.stderr)
        with Path(surfaceformscore_file).open() as fo:
            for line in tqdm.tqdm(fo, total=t):
                try:
                    line = ul.unquote_plus(line.strip())
                    ent, surface, score = line.split("\t")
                    score = float(score)
                    if "\\u" in surface:
                        surface = surface.encode("utf8").decode("unicode-escape")
                    for val in get_synonyms(surface):
                        ss = ent_surface_scores.setdefault(ent, {})
                        ss[val] = max(ss.get(val, 0), score)
                except Exception as e:
                    log.error(e)

    t = get_file_lines(uri_pagetitle_file)
    for line in tqdm.tqdm(open(uri_pagetitle_file), total=t):
        try:
            uri, pagetitle = line.strip().split(None, 1)
            import urllib.parse as ul

            pagetitle = ul.unquote_plus(pagetitle)
            surface_score = ent_surface_scores.get(pagetitle)
            if surface_score:
                top = max(surface_score.values())
                surface_score = {
                    sur: round((score / top) if score != 1.0 else 1.0, 5)
                    for sur, score in surface_score.items()
                }
                print(uri, json.dumps(surface_score), sep="\t")
        except Exception as e:
            log.error(e)
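The function above treats the synonym dict as the source of truth and uses the Bloom filter only as a cheap membership pre-check, since a Bloom filter never gives false negatives. A minimal, self-contained sketch of that prefilter pattern (the names and the /tmp path are illustrative, not from the original project):

from pybloomfilter import BloomFilter

synonyms = {"NYC": "New_York_City", "New_York_City": "NYC"}
prefilter = BloomFilter(len(synonyms), 0.1, "/tmp/syn_prefilter.bloom")
prefilter.update(synonyms)  # iterating a dict adds its keys

def lookup(surface):
    # a hit may be a false positive, so the dict lookup still decides
    if surface in prefilter:
        return synonyms.get(surface)
    return None

print(lookup("NYC"))      # New_York_City
print(lookup("Chicago"))  # None, and the dict is usually not consulted at all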
Example #2
 def load_bf(self, filename, capacity, error_rate):
     bf = BloomFilter(capacity=capacity, error_rate=error_rate)
     with open(filename) as f:
         for line in f:
             bf.add(line.split('\t')[0].strip())
     return bf
Example #3
from pybloomfilter import BloomFilter
import sys, signal
from time import time, sleep
import os
from worker_filter import Filter

st = time()

done_sites_fname = 'done_sites.bin'
if os.path.isfile(done_sites_fname):
    bfdone = BloomFilter.open(done_sites_fname)
else:
    print "no file"
    bfdone = BloomFilter(2**27, 10**(-5), done_sites_fname)  #8M

start = 0

filter = Filter()

f = open('done_urls20160601.txt').read().strip().split('\n')
for url in f:
    bfdone.add(url)
print len(f)
cnt = 0
for url in f:
    if url in bfdone:
        cnt += 1
print cnt
inc = 0

print time() - st
Example #4
_, dim = T_des.shape

# In[4]:

LSH_random_vectors_set = []
#powers_of_two = 1 << np.arange(LSH_dim-1, -1, -1)

# creating the multiple LSH random vectors
for i in range(L_buckets):
    np.random.seed(i)
    LSH_random_vectors_set.append(np.random.randn(dim, LSH_dim))

# creating the multiple Bloom Filters
BF_set = []
for i in range(L_buckets):
    BF_set.append(BloomFilter(2**(2 * LSH_dim), 0.01, None))

# In[5]:

t0 = time.process_time()

Q_kp, Q_des = detector.detectAndCompute(query_img, None)

t1 = time.process_time()

# We now add each LSH hash result to their dedicated Bloom Filter
for i in range(L_buckets):
    Q_reflections = Q_des.dot(LSH_random_vectors_set[i]) >= 0

    for q in np.array(Q_reflections, dtype=int):
        BF_set[i].add(q.tostring(None))
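The fragment above only fills the filters; querying is not shown. A hedged sketch of one plausible follow-up, checking each train descriptor's signature against all L filters and taking a majority vote (the voting threshold is an assumption, not from the original notebook):

matches = 0
for t in T_des:
    votes = 0
    for i in range(L_buckets):
        sig = np.array(t.dot(LSH_random_vectors_set[i]) >= 0, dtype=int)
        if sig.tostring(None) in BF_set[i]:
            votes += 1
    if votes > L_buckets // 2:  # simple majority over the L projection sets
        matches += 1
print(matches, "of", len(T_des), "train descriptors matched")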
Example #5
#!/usr/bin/env python
# coding:utf-8
# manning  2015-1-27
import time
import os
import urlparse
import hashlib
import sys
#sys.path.append("..")

#from config.config import *
#reload(sys)
#sys.setdefaultencoding("utf-8")
from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01)


def format(url):
    '''
    The strategy is to build a 3-tuple:
    the first item is the url's netloc;
    the second item is the split length of each path segment;
    the third item is the name of each query parameter (parameters are sorted
    alphabetically so that a different ordering does not cause duplicates)
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'

    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
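The snippet is cut off before the triple described in the docstring is built and checked against bf. A hedged Python 3 sketch of what that canonical signature could look like (the original uses Python 2's urlparse; url_signature is an illustrative name):

import urllib.parse

def url_signature(url):
    parts = urllib.parse.urlparse(url)
    path = parts.path or '/'
    path_lengths = tuple(len(seg) for seg in path.split('/') if seg)
    query_keys = tuple(sorted(urllib.parse.parse_qs(parts.query).keys()))
    return (parts.netloc, path_lengths, query_keys)

sig = url_signature('http://example.com/a/bb?z=1&a=2')
print(sig)              # ('example.com', (1, 2), ('a', 'z'))
if str(sig) not in bf:  # bf is the module-level filter created above
    bf.add(str(sig))    # only urls with an unseen signature get crawled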
Example #6
1. Domestic - province - destination: lists every city in that region
2. City - scenic spot: lists every scenic spot in that city
3. City - community - travel notes: lists every travel note for that city

--
BloomFilter
"""

import os
import requests
import re
from pybloomfilter import BloomFilter


dir_name = 'notes/'
bf = BloomFilter(1024 * 1024 * 16, 0.01)


def find_all_city_pages_url():
    req = requests.get('http://www.mafengwo.cn/mdd/')
    city_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html', req.text)
    return city_pages


def get_city_number(url):
    return url[29:34]


def save_html(file_name, html):
    with open(file_name, 'wb+') as f:
        f.write(html.encode())
Example #7
def setup(database: dict,
          password: str,
          bloomfilter_file=None,
          bf_false_positive_rate=BLOOMFILTER_DEFAULT_FALSE_POSITIVE_RATE,
          paralleled=False,
          num_processes=None) -> tuple:
    """
    Setup method of OXT for a database
    :param database: database with id -> list of words
    :param password: password to create keys
    :param bloomfilter_file: file to read/write bloomfilter
    :param bf_false_positive_rate: bloomfilter false positive rate
    :param bool paralleled: should we parallel the process or not
    :param num_processes: number of process used if parallel
    :return: (key, encrypted database)
    """
    global var_dict

    # TODO: generate keys from password
    K_P = random_secure(1)  # key to XOR index

    K_S = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for e
    iv = random_secure(
        CMAC_AES128_KEY_LENGTH_IN_BYTES)  # IV for AES encryption
    K_X = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for xtag
    K_I = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for index
    K_Z = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for Z
    K_T = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for keyword

    pairing = PairingGroup('SS512')

    g = pairing.random(GT)
    assert g.initPP(), "ERROR: Failed to init pre-computation table for g."

    total_pairs = 0
    inverted_index_all_pairs = defaultdict(
        list)  # word -> list of ids containing this word

    if paralleled:
        # parallel processing
        logger.info('Parallel gen_inverted_index')
        pool = multiprocessing.Pool()
        num_docs = len(database)
        inverted_tuples = pool.starmap(
            gen_inverted_index_paralleled,
            list(zip(database.items(), [K_P] * num_docs)))
        for inverted_list in inverted_tuples:
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(rind)
                total_pairs += 1

    else:
        # sequential processing
        logger.info('Seq inverted_index_all_pairs')
        for (ind, words) in database.items():
            inverted_list = gen_inverted_index(ind, words, K_P)

            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(
                    rind)  # rind is now bytes
                total_pairs += 1

    # generate xtags. Each xtag is for a pair (word, index)
    xtags = set()

    if paralleled:
        logger.info('Parallel xtags')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_xtags_parallel,
                                  initargs=(K_X, pairing, K_I, g)) as pool:
            xtags_lists = pool.map(gen_xtags_parallel,
                                   inverted_index_all_pairs.items())

            for xtags_list in xtags_lists:
                xtags.update(xtags_list)

            var_dict = {}
    else:
        logger.info('Seq xtags')
        for word, indices in inverted_index_all_pairs.items():
            xtags.update(gen_xtags(word, indices, K_X, pairing, K_I, g))

    # Create a Bloom filter and bitarray
    if bloomfilter_file is not None:
        bf = BloomFilter(total_pairs, bf_false_positive_rate, bloomfilter_file)
    else:
        bf = BloomFilter(total_pairs, bf_false_positive_rate)
    num_bits = bf.num_bits
    bits = bitarray(num_bits)
    bits.setall(False)

    # compute the positions of each xtag and set it
    # the reason we need to use bits array because the library doesn't expose bits. e.g. check if a bit is set or not
    xtag: str
    for xtag in xtags:
        bf.add(xtag)

        # mimic set in bits array
        for hash_seed in bf.hash_seeds:
            pos = bloomfilter_hash(xtag, hash_seed) % num_bits
            bits[pos] = True

    # generate encrypted database
    edb1 = dict()
    if paralleled:
        logger.info('Parallel edb1')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_t_set_parallel,
                                  initargs=(K_S, K_I, K_Z, K_T, iv,
                                            pairing)) as pool:
            t_set_dict_lists = pool.map(gen_t_set_parallel,
                                        inverted_index_all_pairs.items())

            for t_set_dict in t_set_dict_lists:
                edb1.update(t_set_dict)

            var_dict = {}
    else:
        logger.info('Seq edb1')

        for word, indices in inverted_index_all_pairs.items():
            edb1.update(
                gen_t_set(word, indices, K_S, K_I, K_Z, K_T, iv, pairing))

    key = (K_P, K_S, K_X, K_I, K_Z, K_T)
    g_serialized = pairing.serialize(g)

    return key, iv, g_serialized, edb1, bf, bits
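The comment in the middle of setup() explains that the bitarray mirrors the Bloom filter only because pybloomfilter does not expose its individual bits. A self-contained toy sketch of that mirroring idea (toy_hash is a stand-in for the project's bloomfilter_hash helper and is not pybloomfilter's internal hash):

import hashlib
from bitarray import bitarray

def toy_hash(item, seed):
    # a stand-in seeded hash; insertion and lookup only need to agree on it
    return int.from_bytes(hashlib.sha256(("%d:%s" % (seed, item)).encode()).digest()[:8], "big")

num_bits, seeds = 64, (1, 2, 3)
bits = bitarray(num_bits)
bits.setall(False)

def set_xtag(xtag):
    for s in seeds:
        bits[toy_hash(xtag, s) % num_bits] = True

def maybe_contains(xtag):
    return all(bits[toy_hash(xtag, s) % num_bits] for s in seeds)

set_xtag("xtag-1")
print(maybe_contains("xtag-1"))  # True
print(maybe_contains("xtag-2"))  # False with high probability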
Example #8
# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)

# In[4]:

_, dim = T_des.shape
LSH_dim = 16
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)

# In[5]:

bf = BloomFilter(10**(LSH_dim / 4), 0.01, None)

# We maximize the efficiency by utilizing matrix operations
# for the crude LSH implementation

t0 = time.process_time()

Q_kp, Q_des = detector.detectAndCompute(query_img, None)

t1 = time.process_time()

Q_reflections = Q_des.dot(LSH_random_vectors) >= 0
#Q_bin = Q_reflections.dot(powers_of_two)

# And we remove duplicates to ensure uniqueness of features
for q in np.array(Q_reflections, dtype=int):
Example #9
 def __init__(self):
     self.filename = ROOTDIR + '/factory/cfg/filter.bloom'
     self.bf = self.__getbloomfilter()
     if self.bf == -1:
         self.bf = BloomFilter(100000, 0.001, self.filename)
     pass
Example #10
 def __init__(self):
     try:
         self.bf = BloomFilter.open('tuniu.filter')
     except:
         logging.info("new filter.bloom")
         self.bf = BloomFilter(100000000, 0.05, 'tuniu.filter')
Example #11
 def forwards(self, orm):
     for poll in orm.Poll.objects.all():
         poll.seen_ips = BloomFilter(1000, 0.01,
                                     '/tmp/test.bloom').to_base64()
         poll.save()
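to_base64() serializes the whole filter into a string that fits in the seen_ips field. Assuming the installed pybloomfilter build also exposes the matching BloomFilter.from_base64(filename, string) classmethod (pybloomfiltermmap documents one), the stored value can be rehydrated later; a hedged round-trip sketch:

from pybloomfilter import BloomFilter

serialized = BloomFilter(1000, 0.01, '/tmp/test.bloom').to_base64()
restored = BloomFilter.from_base64('/tmp/restored.bloom', serialized)
restored.add('10.0.0.1')
print('10.0.0.1' in restored)  # True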
Example #12
def analyse_fpr(matrix, df, i, j, correl_data_struct, target_fpr, block_size):
    num_blocks = math.floor(len(matrix) / block_size)

    print("num blocks:", num_blocks)

    many_many_elements = set(correl_data_struct.exception_list_0)
    one_many_elements = set(correl_data_struct.exception_list_not_one)

    size_correl = 0.0
    size_normal = 0.0

    block_bloom_list_0_normal = []
    block_bloom_list_0_correl = []
    block_bloom_list_1 = []

    block_set_0 = []
    block_set_1 = []

    for t in range(0, num_blocks):
        block_set_0.append(set([]))
        block_set_1.append(set([]))

    for t in range(0, int(block_size * num_blocks)):
        ind = math.floor(t / block_size)
        block_set_0[ind].add(matrix[t][i])
        block_set_1[ind].add(matrix[t][j])

    for t in range(0, num_blocks):

        count_to_add = 0

        for item in block_set_0[t]:
            if item in one_many_elements:
                count_to_add += 1

        block_bloom_list_0_correl.append(BloomFilter(count_to_add, target_fpr))
        block_bloom_list_0_normal.append(
            BloomFilter(len(block_set_0[t]), target_fpr))
        block_bloom_list_1.append(BloomFilter(len(block_set_1[t]), target_fpr))

        for item in block_set_0[t]:
            block_bloom_list_0_normal[-1].add(item)
            if item in one_many_elements:
                block_bloom_list_0_correl[-1].add(item)

        # print("perecentage used:",count_to_add*1.00/len(block_set_0[t]))

        for item in block_set_1[t]:
            block_bloom_list_1[-1].add(item)

        size_normal += 1.44 * math.log(1.00 / target_fpr, 2) * len(
            block_set_0[t])
        size_correl += 1.44 * math.log(1.00 / target_fpr, 2) * count_to_add

    print("Size Ratio:", size_correl * 1.00 / size_normal)
    # correl_bf=BloomFilter(len(correl_data_struct.exception_list_0), 0.01)
    # for item in correl_data_struct.exception_list_0:
    #   correl_bf.add(item)
    #   # print(item)

    # correl_bf_not_one=BloomFilter(len(correl_data_struct.exception_list_not_one), 0.01)
    # for item in correl_data_struct.exception_list_not_one:
    #   correl_bf_not_one.add(item)

    # size_correl=size_normal
    # size_correl+=1.44*math.log(1.00/0.01,2)*len(correl_data_struct.exception_list_0)
    # size_correl+=1.44*math.log(1.00/0.01,2)*len(correl_data_struct.exception_list_not_one)

    num_queries_per_block = 1000

    total_negatives = 0
    total_false_positives_normal = 0
    total_false_positives_correl = 0

    for curr_block in tqdm(range(0, num_blocks)):
        rand_list = np.random.uniform(0, 1.0, num_queries_per_block)

        for t in range(0, num_queries_per_block):
            ind = math.floor(rand_list[t] * num_blocks * block_size)

            # If true positive, continue
            if matrix[ind][i] in block_set_0[curr_block]:
                if matrix[ind][i] not in many_many_elements:
                    val = math.floor(matrix[ind][i] /
                                     correl_data_struct.factor_0_to_1)
                    # This will give an error if the factor is too small
                    if val not in block_bloom_list_1[
                            curr_block] or val not in block_set_1[curr_block]:
                        print("ERROR", val, matrix[ind][i], matrix[ind][j])
                        sys.exit(1)
                continue

            total_negatives += 1

            if matrix[ind][i] in block_bloom_list_0_normal[curr_block]:
                total_false_positives_normal += 1

            if matrix[ind][i] in many_many_elements:
                if matrix[ind][i] in block_bloom_list_0_correl[curr_block]:
                    total_false_positives_correl += 1
            else:
                val = math.floor(matrix[ind][i] /
                                 correl_data_struct.factor_0_to_1)
                if matrix[ind][i] in one_many_elements:
                    if matrix[ind][i] in block_bloom_list_0_correl[
                            curr_block] and val in block_bloom_list_1[
                                curr_block]:
                        total_false_positives_correl += 1
                else:
                    if val in block_bloom_list_1[curr_block]:
                        total_false_positives_correl += 1

    fpr_correl = total_false_positives_correl * 1.00 / total_negatives
    fpr_normal = total_false_positives_normal * 1.00 / total_negatives
    print("Normal False positive rate:", fpr_normal)
    print("Correl False positive rate:", fpr_correl)

    print("\n\n")

    return fpr_correl, size_correl, fpr_normal, size_normal
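The size accounting above relies on the standard result that an optimally configured Bloom filter needs roughly 1.44 * log2(1/fpr) bits per stored element; a quick check of the constant for a few target rates:

import math

for fpr in (0.1, 0.01, 0.001):
    bits_per_element = 1.44 * math.log(1.0 / fpr, 2)
    print("fpr=%g -> ~%.2f bits per element" % (fpr, bits_per_element))
# fpr=0.1   -> ~4.78
# fpr=0.01  -> ~9.57
# fpr=0.001 -> ~14.35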
Example #13
# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)

# In[4]:

_, dim = T_des.shape
LSH_dim = 16
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)

# In[5]:

bf = BloomFilter(2**(LSH_dim + 1), 0.01, None)

# We maximize the efficiency by utilizing matrix operations
# for the crude LSH implementation
t1 = time.process_time()

Q_kp, Q_des = detector.detectAndCompute(query_img, None)

Q_reflections = Q_des.dot(LSH_random_vectors) >= 0
Q_bin = Q_reflections.dot(powers_of_two)

# And we remove duplicates to ensure uniqueness of features
for q in list(set(Q_bin)):
    # needs to insert here a method for re-hashing or
    # transforming the array list of descriptors to a bit array
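    # A possible body for this loop (a sketch, not the original continuation):
    # each Q_bin entry is already a plain integer signature, so its byte form
    # can be added directly; Examples #4 and #8 instead serialize the raw
    # reflection arrays with tostring().
    bf.add(int(q).to_bytes((LSH_dim + 7) // 8, "big"))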
Example #14
def R_2_A(index_url, url_tail, site_name, level, is_sege):
    if not is_sege:
        if level == 0:

            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 1:
            if site_name == "qq_copyright":
                temp = []
                # url_tail holds all the album ids; deduplicate them here, then fetch every album's info
                bloomname = "albummid_filter"
                isexists = os.path.exists(bloomname + ".bloom")
                if isexists:
                    # the file exists, so just open it
                    bf = BloomFilter.open(bloomname + ".bloom")
                else:
                    # it does not exist, so create it
                    bf = BloomFilter(10000000, 0.001, bloomname + ".bloom")
                for token in url_tail:
                    if not bf.add(token):
                        temp.append(token)
                        #url_tail.remove(token)
                        #print "重复id,丢弃",token
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://c.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid={aid}&g_tk=5381&format=jsonp"
                        .format(aid=i)), temp)
                return res_urls

                return general_func.Relative_to_Absolute(index_url, url_tail)
            elif level == 2:

                return general_func.Relative_to_Absolute(index_url, url_tail)
            elif level == 3:

                return general_func.Relative_to_Absolute(index_url, url_tail)
            elif level == 4:

                return general_func.Relative_to_Absolute(index_url, url_tail)
    else:
        if level == 0:
            if site_name == "qq_music":
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://y.qq.com/portal/singer/{aid}.html".format(
                            aid=i)), url_tail)
                return res_urls
            if site_name == "qq_copyright":
                # here we take the singer ids and call the all-tracks endpoint below; every track carries an albummid, which we then deduplicate
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?g_tk=5381&format=jsonp&singermid={aid}&begin=0&num=900"
                        .format(aid=i)), url_tail)
                return res_urls

            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 1:

            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 2:

            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 3:

            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 4:

            return general_func.Relative_to_Absolute(index_url, url_tail)
Example #15
                                "(install with `pip install pydeep`)")
try:
    import magic
except ImportError:
    raise MaliceDependencyError("Unable to import magic "
                                "(install with `pip install magic`)")
try:
    from pybloomfilter import BloomFilter
except ImportError:
    raise MaliceDependencyError("Unable to import pybloomfilter "
                                "(install with `pip install pybloomfilter`)")

if os.path.isfile('filter.bloom'):
    bf = BloomFilter.open('filter.bloom')
else:
    bf = BloomFilter(10000000, 0.01, 'filter.bloom')

# csrf = CsrfProtect(app)

sm = ScanManager()

github = 'https://github.com/blacktop/malice'  #current_app.config['GITHUB']


# open connection before each request
@malice.before_request
def before_request():
    try:
        g.rdb_conn = r.connect(host='localhost', port=28015, db='file')
        g.rdb_sess_conn = r.connect(host='localhost', port=28015, db='session')
        g.rdb_sample_conn = r.connect(host='localhost',
Example #16
#!/usr/bin/env python
from pybloomfilter import BloomFilter
import os.path
import sys

# cat <values_file> | ./ingest.py <bloom_file> <max_items> <error_rate>

bloomFilePath = sys.argv[1]
if os.path.isfile(bloomFilePath):
    bf = BloomFilter.open(bloomFilePath)
else:
    maxItems = int(sys.argv[2])
    errorRate = float(sys.argv[3])
    bf = BloomFilter(maxItems, errorRate, bloomFilePath)

valuesBuffer = []
for line in iter(sys.stdin.readline, ''):
    valuesBuffer.append(unicode(line.rstrip('\n')))
    if len(valuesBuffer) > 100000:
        bf.update(valuesBuffer)
        valuesBuffer = []

bf.update(valuesBuffer)
bf.sync()
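A companion sketch (not part of the original script) that reuses the ingested file for lookups; values arriving on stdin are echoed back only when the filter may contain them:

#!/usr/bin/env python
# cat <values_file> | ./check.py <bloom_file>
from pybloomfilter import BloomFilter
import sys

bf = BloomFilter.open(sys.argv[1])
for line in sys.stdin:
    value = line.rstrip('\n')
    if value in bf:
        print(value)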
Example #17
train_data = add_padding_idx(train_data)
test_data = add_padding_idx(test_data)

# Note that, no matter how many node types are here, make sure the
# hyperedge (N1,N2,N3,...) has id, N1 < N2 < N3...

compress = True
# Note that, no matter how many node types are here, make sure the
# hyperedge (N1,N2,N3,...) has id, N1 < N2 < N3...
if not dynamic_dict:
	test_dict = build_hash(dict_data, compress=compress, max_size=max_size,
								 min_size=min_size, fname="test")
	train_dict = test_dict
	# train_dict = build_hash(train_data, compress = compress, max_size=max_size, min_size = min_size, fname="test")
else:
	train_dict = [BloomFilter(1e8, 1e-3) for i in range(max_size + 1)]
	test_dict = [BloomFilter(1e8, 1e-3) for i in range(max_size + 1)]
print("dict_size", len(train_dict), len(test_dict))



print("after weight filter", train_data.shape, test_data.shape, dict_data.shape)
print(train_weight, np.min(train_weight), np.max(train_weight))
train_weight_mean = np.mean(train_weight)
train_weight = train_weight / train_weight_mean * neg_num
test_weight = test_weight / train_weight_mean * neg_num
dict_weight = dict_weight / train_weight_mean * neg_num
print("train data amount", len(train_data))


if args.feature == 'walk':
Example #18
sys.setdefaultencoding('utf-8')  # set the system output encoding to utf-8
sys.setrecursionlimit(1000000)  # raise the recursion depth limit

urlTest = 'http://www.my089.com/ConsumerInfo1.aspx?uid=0C7C8143B7536149'
urlStart = urlTest

filedirectory = getConfig()
#test()
if login():
    print('Login success!')
    #test()

    strtime = str(time.strftime('%Y%m%d%H%M', time.localtime(time.time())))

    createFolder('log')
    bf = BloomFilter(100000000, 0.001, 'log/' + strtime + 'filter' + '.bloom')
    print "num_bits: " + str(bf.num_bits)
    print "num_hashes: " + str(bf.num_hashes)
    #bf.clear_all()

    #orderCount = 0
    #allCount = 0

    logf1 = open('log/' + strtime + 'log1' + '.log', 'wb')  # log of processed pages
    logf2 = open('log/' + strtime + 'log2' + '.log', 'wb')  # log of processed pages
    logAll = open('log/' + strtime + 'all' + '.log', 'wb')  # log of every link found
    aList.append(urlDefault)
    aList.append(urlSucceed)

    allCount += len(aList)
    for item in aList:
Example #19
#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
from JobCrawler import JobCrawler
from pybloomfilter import BloomFilter
from time import time

company_bf = BloomFilter(1024 * 1024 * 16, 0.01)
total_page = 1


def get_company_info(url, page=1):
    if page > total_page:
        return

    wbdata = requests.get(url).content
    soup = BeautifulSoup(wbdata, 'lxml')

    # print soup.prettify()

    company_list = soup.select('div.el > span.t2')
    # print type(company_list), '\ncompany_list :', company_list
    for index, company in enumerate(company_list):
        if index != 0:
            company_result = company.find_all(name='a')
            company_link = company_result[0].attrs['href']
            company_name = company_result[0].attrs['title']
            print company_name, ' - ', company_link
Example #20
class MostFollowTopicsSpider(scrapy.spiders.Spider):
    name = "MostFollowTopicsSpider"
    allowed_domains = ["zhihu.com"]
    start_urls = ['https://www.zhihu.com/topic/19776749/organize/entire']

    topic_bloom_filter = BloomFilter(500000, 0.001, 'topic.bloom')

    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    }

    def parse(self, response):
        url = response.request.url
        if response.request.method == 'GET':
            index = len('https://www.zhihu.com/topic/')
            try:
                topic_id = int(response.url[index:index + 8])
            except:
                raw_input('wrong...wait!')
                yield scrapy.Request(url=response.url, callback=self.parse)
            topic_name = response.css(
                'h1.zm-editable-content::text').extract_first()
            followers = int(
                response.css('div.zm-topic-side-followers-info strong::text').
                extract_first()) or 0

            self.topic_bloom_filter.add(topic_id)
            yield {
                'topic_id': topic_id,
                'topic_name': topic_name,
                'followers': followers,
            }
            print('[%s] %s: %s' % (topic_id, topic_name, followers))

            self.xsrf = response.css(
                'input[name=_xsrf]::attr(value)').extract_first()
            yield scrapy.FormRequest(response.url,
                                     formdata={'_xsrf': self.xsrf},
                                     callback=self.parse)
        elif response.request.method == 'POST':
            js = json.loads(response.text)
            for topic_object_list in js['msg'][1]:
                topic_object = topic_object_list[0]
                if topic_object[0] == 'topic':
                    if topic_object[2] not in self.topic_bloom_filter:
                        yield scrapy.Request(
                            'https://www.zhihu.com/topic/%s/organize/entire' %
                            topic_object[2],
                            callback=self.parse)
                    else:
                        print('repeat')
                elif topic_object[0] == 'load':
                    print('more!!')
                    url = urlparse.urlparse(
                        response.url).path + '?child=%s&parent=%s' % (
                            topic_object[2], topic_object[3])
                    url = response.urljoin(url)
                    yield scrapy.FormRequest(url,
                                             formdata={'_xsrf': self.xsrf},
                                             callback=self.parse)

    def start_requests(self):
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [
            scrapy.Request(url=captcha_url,
                           headers=self.header,
                           callback=self.parser_captcha)
        ]

    def parser_captcha(self, response):
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
            f.close()
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            print(u'Please open captcha.jpg under %s and enter it manually' %
                  os.path.abspath('captcha.jpg'))
        captcha = raw_input("please input the captcha\n>")
        return scrapy.FormRequest(url='https://www.zhihu.com/#signin',
                                  headers=self.header,
                                  callback=self.login,
                                  meta={'captcha': captcha})

    def login(self, response):
        xsrf = response.xpath("//input[@name='_xsrf']/@value").extract_first()
        if xsrf is None:
            return ''
        post_url = 'https://www.zhihu.com/login/phone_num'
        post_data = {
            "_xsrf": xsrf,
            "phone_num": '13987654321',
            "password": '******',
            "captcha": response.meta['captcha']
        }
        return [
            scrapy.FormRequest(url=post_url,
                               formdata=post_data,
                               headers=self.header,
                               callback=self.check_login)
        ]

    # check whether the login response indicates success
    def check_login(self, response):
        js = json.loads(response.text)
        if 'msg' in js and js['msg'] == u'登录成功':  # '登录成功' means 'login succeeded'
            for url in self.start_urls:
                yield scrapy.Request(url=url,
                                     headers=self.header,
                                     dont_filter=True)
        else:
            print('login failed')
            print(js['msg'])
Example #21
import requests
import re
import json
from redis import Redis
from rq import Queue
from bs4 import BeautifulSoup
from pybloomfilter import BloomFilter
from utils import get_html,get_proxy,delete_proxy,get_content
from urllib.parse import urlencode

low = Queue('low',connection=Redis(host='localhost',port=6379))

bloom_f = BloomFilter(capacity=100000, error_rate=0.01)


def spider_movie_comment(movie_id):
    # Get Pages
    url = "https://movie.douban.com/subject/"+movie_id+"/reviews?start="
    head = get_html(url+str(0))
    html = BeautifulSoup(head.content,"lxml")
    temp_html = html.select("#content > h1")
    print(temp_html)
    # f = open("index.html","w")
    # f.write(html.prettify())
    # f.close()

    text = temp_html[0].text
    page = int(re.sub(r"\D*","", text))
    data = []

    for page_num in range(page//20+1):
Example #22
class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    download_bf = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.download_bf.add(urlmd5[:-2])
        except IOError:
            print "File not found"
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        self.child_queue.append(url)

    def dequeuUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            self.cur_level += 1
            if self.cur_level == self.max_level:
                return None
            if len(self.child_queue) == 0:
                return None
            self.cur_queue = self.child_queue
            self.child_queue = deque()
            return self.dequeuUrl()

    def getpagecontent(self, cur_url):
        print "downloading %s at level %d" % (cur_url, self.cur_level)
        try:
            req = urllib2.Request(cur_url, headers=self.request_headers)
            response = urllib2.urlopen(req)
            html_page = response.read()
            filename = cur_url[7:].replace('/', '_')
            fo = open("%s%s.html" % (self.dir_name, filename), 'wb+')
            fo.write(html_page)
            fo.close()
        except urllib2.HTTPError, Arguments:
            print Arguments
            return
        except httplib.BadStatusLine:
            print 'BadStatusLine'
            return
Example #23
#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import csv
import sys
import datetime
from time import time
from pybloomfilter import BloomFilter

reload(sys)
sys.setdefaultencoding('utf-8')

download_bf = BloomFilter(1024*1024*16, 0.01)

def request(url, isFirstPage):
    if url not in download_bf:
        download_bf.add(url)
    else:
        return

    res = requests.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    # print soup.prettify()

    keylist = soup.select('div.key-list > div.item-mod')
    for index, house in enumerate(keylist):
        # if index == 2:
            # print house
Example #24
                    break
                continue

            PST._set_p_hash_kind("md5")

            # Assignate the correct redis connexion
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]

            # Creating the bloom filter name: bloomyyyymm
            filebloompath = os.path.join(
                bloompath, 'bloom' + PST.p_date.year + PST.p_date.month)

            if os.path.exists(filebloompath):
                bloom = BloomFilter.open(filebloompath)
            else:
                bloom = BloomFilter(100000000, 0.01, filebloompath)
                bloop_path_set.add(filebloompath)

            # UNIQUE INDEX HASHS TABLE
            r_serv0 = dico_redis["201300"]
            r_serv0.incr("current_index")
            index = r_serv0.get("current_index") + str(PST.p_date)
            # HASHTABLES PER MONTH (because of r_serv1 changing db)
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)

            # For each bloom filter
            opened_bloom = []
            for bloo in bloop_path_set:
                # Opening blooms
                opened_bloom.append(BloomFilter.open(bloo))
Example #25
 def __init__(self, capacity, error_rate):
     super().__init__()
     self.bloom_filter_1 = BloomFilter(capacity, error_rate)
     self.bloom_filter_2 = BloomFilter(capacity, error_rate)
Example #26
class Crawling:

    request_headers = {
        'host': dest_url[7:],
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    level = 0
    max_level = 3

    dir_name = "download"

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    curr_queue = deque()
    next_queue = deque()

    urls_bloom = BloomFilter(1024 * 1024 * 16, 0.01)
    down_bloom = BloomFilter(1024 * 1024 * 16, 0.01)

    def __init__(self, url):
        self.curr_queue.append(url)

        # only records which pages have been downloaded; nothing else uses it for now
        self.down_file = open(self.dir_name + "/history.txt", 'a+')
        # record the md5 hash of every downloaded url
        self.hash_file = open(self.dir_name + "/history_md5.txt", "a+")
        # load the hashes into the bloom filter so nothing is downloaded twice
        for md5 in self.hash_file.readlines():
            self.down_bloom.add(md5[:-2])

    def enqueue(self, url):
        ''' urls_bloom records only hashes to keep the file record small; the url itself could be recorded instead. Is the original approach more accurate?
        '''
        hash = hashlib.md5(url.encode('utf8')).hexdigest()
        if url not in self.urls_bloom and hash not in self.down_bloom:
            self.urls_bloom.add(url)
            self.next_queue.append(url)
            print("enqueue: [{}]".format(url))

    def dequeue(self):
        try:
            url = self.curr_queue.popleft()
            return url
        except IndexError:
            return None

    def complete(self, url):
        hash = hashlib.md5(url.encode('utf8')).hexdigest()
        self.down_bloom.add(hash)
        self.hash_file.write(hash + "\r\n")
        self.down_file.write(url + "\r\n")
        #self.down_file.flush()

    def next_level(self):
        self.curr_queue = self.next_queue
        self.next_queue = deque()

    def close(self):
        self.down_file.close()
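A minimal driver sketch (assumed, not from the original module) showing how the queue and Bloom-filter methods above fit together; dest_url and the actual page download live elsewhere in the original:

crawler = Crawling(dest_url)
while crawler.level < crawler.max_level:
    url = crawler.dequeue()
    if url is None:             # current level exhausted, descend one level
        crawler.next_level()
        crawler.level += 1
        continue
    # ... download url, save the page, extract links ...
    # for link in extracted_links:
    #     crawler.enqueue(link)  # enqueue() itself skips urls already seen
    crawler.complete(url)
crawler.close()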
Example #27
 def __init__(self):
     self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
     self.f_write = open('visitedsites','w')
     self.si = SearchIndex()
     self.si.SearchInit()
Example #28
TIMEOUT = 20
GITHUB_IMG = '//img[contains(@class, "avatar width-full height-full")]'
GITHUB_REPO_TITLE = '//a[@itemprop="name codeRepository"]'
GITHUB_LANGUAGE = '//span[@itemprop="programmingLanguage"]'
GITHUB_USERNAME_TITLE = '//span[@class="link-gray pl-1"]'

PATH_PREFIX_DEFAULT = 'data/users_order_'

options = webdriver.ChromeOptions()
options.add_argument('--incognito')

browser = webdriver.Chrome(
    executable_path=r'/home/kevin/Downloads/chromedriver', options=options)

seen_usernames = BloomFilter(10000, .03)


def make_url(user, tab):
    return f'https://github.com/{user}?tab={tab}'


def is_loaded(url):
    browser.get(url)
    try:
        WebDriverWait(browser, TIMEOUT).until(
            EC.visibility_of_element_located((By.XPATH, GITHUB_IMG)))

    except TimeoutException:
        print(f'Timed out waiting for {url} to load')
        return False
Example #29
 def __init__(self, path=None):
     self.file = None
     self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')
Example #30
# Date 13-Nov-2017: In this version, we are just using plain features
# and adding them to the Bloom. Specifically, we just convert each
# the 128-element array of SIFT features to their byte representation
# and add them to the Bloom; thus, each

from pybloomfilter import BloomFilter
import numpy as np
import cv2
import sys

# In[2]:

detector = cv2.xfeatures2d.SIFT_create()

bf = BloomFilter(10000000, 0.01, None)

train_img = cv2.imread('train.jpg', 0)
query_img = cv2.imread('raw.png', 0)

# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)
Q_kp, Q_des = detector.detectAndCompute(query_img, None)

_, dim = Q_des.shape
LSH_dim = 64
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)
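The fragment stops before the step its header comment describes; a hedged sketch of adding the byte representation of each 128-element SIFT descriptor and then probing with the query descriptors:

# add every train descriptor's raw bytes to the filter
for d in T_des:
    bf.add(d.tobytes())

# probe with the query descriptors; only exact byte-for-byte duplicates can hit
hits = sum(1 for d in Q_des if d.tobytes() in bf)
print(hits, "of", len(Q_des), "query descriptors matched")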