import glob
import json
import pickle

import utils  # project helper module providing fopen


def pickle_vt(self):
    """Read every VT JSON file matched by self.json_path and pickle all docs into one file."""
    files = glob.glob(self.json_path)
    docs = []
    count = 0
    print("\nThis may take some time")
    for file in files:
        count += 1
        with utils.fopen(file) as f:
            # Each line of a file is one JSON document
            for line in f:
                doc = json.loads(line)
                docs.append(doc)
        print("\nJSON DOC #", str(count))
        print("DOC NAME: ", file)
    # Dump the accumulated documents into a single pickle
    with open(self.pickle_path, "wb") as f:
        print("Starting pickling")
        pickle.dump(docs, f)
    print("Done pickling all VT JSON into one doc")
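
For reference, a minimal sketch of loading the combined pickle back into memory; the path literal here is a hypothetical stand-in for whatever self.pickle_path points at:

import pickle

# "vt_docs.pkl" is a hypothetical path; use the same value as self.pickle_path
with open("vt_docs.pkl", "rb") as f:
    docs = pickle.load(f)
print("Loaded", len(docs), "documents")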
import glob
import json
import sys

import utils  # project helper module providing fopen
from json_flattener import JsonFlattener
from json_vectorizer import JsonVectorizer  # assumed project module, by analogy with json_flattener

# Glob the directories holding the compact JSON strings
data_dir = "../../../data/static_vt/compact/*/*.json"
files = glob.glob(data_dir)

# Initialize the flattener and the document list
flattener = JsonFlattener()
docs = []
total_files = len(files)

print("\nReading all JSON into memory")
for file in files:
    # Read each globbed file, flatten every document, and append it to docs;
    # a progress bar shows the percentage completed
    with utils.fopen(file) as f:
        for line in f:
            doc = json.loads(line)
            flat_doc = flattener.flatten_json_iterative_solution(doc)
            docs.append(flat_doc)
            done = int(50 * len(docs) / total_files)
            sys.stdout.write("\r[{}{}] {}%".format("█" * done, "." * (50 - done),
                                                   int(100 * len(docs) / total_files)))
            sys.stdout.flush()

# Initialize the JsonVectorizer
vectorizer = JsonVectorizer()

# Extend the vectorizer with the flattened docs
processed = 0
print("\nExtending with docs")
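
The project's JsonFlattener implementation is not shown here. As a rough illustration of what an iterative flattening pass like flatten_json_iterative_solution typically does, here is a minimal, self-contained sketch; the key separator and list handling are assumptions, not the project's actual behavior:

def flatten_json_iterative(doc, sep="."):
    """Flatten nested dicts/lists into one flat dict using an explicit stack."""
    out = {}
    stack = [("", doc)]
    while stack:
        prefix, node = stack.pop()
        if isinstance(node, dict):
            for key, value in node.items():
                stack.append((f"{prefix}{sep}{key}" if prefix else str(key), value))
        elif isinstance(node, list):
            for i, value in enumerate(node):
                stack.append((f"{prefix}{sep}{i}" if prefix else str(i), value))
        else:
            out[prefix] = node  # leaf value keyed by its full path
    return out

# Example: {"a": {"b": 1}, "c": [2, 3]} -> {"a.b": 1, "c.0": 2, "c.1": 3}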
import json
import os.path as osp

from utils import fopen  # assumed source of fopen, as in pickle_vt above

# DATA_PATH, the *_idx_list key paths, and the *_dict accumulators are assumed
# to be defined earlier in the script.


def generate_dict(src, idx_list, target_dict, cnt):
    """Walk src along the key path in idx_list; if the walk succeeds and the
    value reached matches a key of target_dict, record cnt under that key."""
    cur_level = src
    flag = True
    for item in idx_list:
        # Guard against non-dict intermediate values before indexing
        if isinstance(cur_level, dict) and item in cur_level:
            cur_level = cur_level[item]
        else:
            flag = False
            break  # bail out as soon as the path is broken
    if flag:
        for idx in target_dict.keys():
            if idx == cur_level:
                target_dict[idx].append(cnt)


cnt = 0  # document counter (initialization/increment not shown in the original)
with fopen(osp.join(DATA_PATH, 'sample10000.json')) as f:
    for line in f:
        doc = json.loads(line)
        generate_dict(doc, location_idx_list, location_dict, cnt)
        generate_dict(doc, certificate_idx_list, certificate_dict, cnt)
        # Inline traversal for the server path (same walk as generate_dict)
        cur_level = doc
        flag = True
        for item in server_idx_list:
            if isinstance(cur_level, dict) and item in cur_level:
                cur_level = cur_level[item]
            else:
                flag = False
                break
        if flag:
            # (truncated in the original; presumably records cnt under the
            # matching server value, mirroring generate_dict)
            pass
        cnt += 1
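
To make the matching behavior concrete, a small example of generate_dict in use; the key path and country codes are hypothetical, invented purely for illustration:

# Hypothetical data, for illustration only
location_idx_list = ["data", "location", "country"]
location_dict = {"US": [], "DE": []}

doc = {"data": {"location": {"country": "US"}}}
generate_dict(doc, location_idx_list, location_dict, cnt=0)
print(location_dict)  # {'US': [0], 'DE': []}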