Exemplo n.º 1
0
# Experiment parameters and bookkeeping.
offset = 123456789   # first value probed against the filter
t = 5 * 1048576      # how many consecutive values are probed
a = []               # probed values the filter reports as absent
b = []
i = 0


# Draw a random value and use its decimal string as the filter's key name.
test_element = randint(ielements, offset - 1)
filter_name = str(test_element)


# Connect to the server and create the Cuckoo filter, sized by `cfsize`.
r = Client()
r.cfCreate(filter_name, cfsize)


# Add the values 1 .. ielements - 2 to the filter, each as a string.
for x in range(1, ielements - 1):
    r.cfAdd(filter_name, str(x))

# Probe `t` consecutive values starting at `offset` and remember every one
# for which the existence check returns 0.
for x in range(offset, offset + t):
    pos = r.cfExists(filter_name, str(x))
    if pos == 0:
        a.append(x)

# Report how many probed values came back as absent.
# NOTE(review): all inserted values are below `offset`, so none of the probed
# values was ever added; the number of false positives would therefore be
# t - len(a). This script prints only len(a), not a false-positive rate.
print("The length of list A is: ", len(a))
# Input location, given relative to the current working directory.
# NOTE(review): presumably the directory holding the JSON articles that are
# fed to parse_json_body_text; it is not used in the visible code, so confirm.
datapath = Path('../input')


def parse_json_body_text(json_filename):
    """Yield the text of every body paragraph stored in a JSON article file.

    This is a generator: nothing is printed or read until the first item is
    requested.

    Args:
        json_filename: ``pathlib.Path``-like object pointing at a JSON
            document. Its ``.stem`` is used for the progress message. The
            document must contain a ``body_text`` list whose entries each
            have a ``text`` field.

    Yields:
        The ``text`` value of each ``body_text`` entry, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if ``body_text`` or an entry's ``text`` key is missing.
    """
    print("Processing ..", json_filename.stem)
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    with open(json_filename, encoding="utf-8") as json_data:
        data = json.load(json_data)
    # The file is closed before the first yield so that a suspended or
    # abandoned generator never keeps the handle open.
    for body_text in data['body_text']:
        yield body_text['text']


# Create the 'processed_documents' Cuckoo filter (created with 40000 as its
# size argument). If the server answers with a ResponseError, print it and
# carry on instead of aborting the script.
try:
    redisbloomclient.cfCreate('processed_documents', 40000)
except ResponseError as err:
    print("Error:", f"{err!r}")


#process document return sentences and entities 
def process_file(f,redisbloomclient=redisbloomclient, rediscluster_client=rediscluster_client):
    pid = 0
    article_id=f.stem
    print("Processing article_id ", article_id)
    if redisbloomclient.cfExists('processed_documents', article_id):
        print("already processed ", article_id)
        return article_id
    for para in parse_json_body_text(f):
        rediscluster_client.setnx(f"paragraphs:{article_id}:pid:{pid}",para)
        pid+= 1