def command_parser(args):
    if len(args) > 1:
        if args[1] == 'fix':
            # repair the malformed json files under the given path
            print "args path: {}".format(args[2])
            TP = TwitterParser(args[2], args[2])
            if TP.get_all_json_file():
                TP.fixer_json()
                print "Done !"
            else:
                print "[Error] with the path: {}".format(args[2])
            exit(0)
        if args[1] == 'sort':
            # write sorted copies of the json files into <path>/sorted
            print "args path: {}".format(args[2])
            sorted_dir = ht.mkdir_system(args[2], 'sorted')
            TP = TwitterParser(args[2], sorted_dir)
            if TP.get_all_json_file():
                TP.sort_all()
                print "Done !"
            else:
                print "[Error] with the path: {}".format(args[2])
        if args[1] == 'mkbig':
            # merge the sorted json files toward one big file
            TP = TwitterParser(args[2], args[3])
            big_dir = ht.mkdir_system(args[3], 'big')
            if TP.get_all_json_file():
                sort_to_big(TP.all_json, big_dir)
        if args[1] == 'build':
            # build the reply trees from the big (sorted) file
            print "path: {}".format(args[2])
            build_trees(args[2])
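# A minimal invocation sketch for command_parser; it is assumed the caller passes sys.argv
# straight in, and the script name and paths below are only illustrative placeholders:
#
#   command_parser(['twitter_parser.py', 'fix',   '/data/raw_json'])             # repair malformed json files
#   command_parser(['twitter_parser.py', 'sort',  '/data/raw_json'])             # write sorted copies into <path>/sorted
#   command_parser(['twitter_parser.py', 'mkbig', '/data/sorted', '/data/out'])  # merge the sorted files
#   command_parser(['twitter_parser.py', 'build', '/data/out/big/big.json'])     # build the reply trees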
def tweet_id_replay_id_file_wrapper(dir_path, out_path):
    '''
    wrapper for the function tweet_id_replay_id_file, helps manage the directories and files
    :param dir_path: where the json files are
    :param out_path: where to write the output
    :return: None
    '''
    list_files = ht.walk_rec(dir_path, [], '.json')
    out_logger = ht.mkdir_system(out_path, 'loging')
    tuples_out = ht.mkdir_system(out_path, 'tuples')
    for p_path in list_files:
        tweet_id_replay_id_file(p_path, tuples_out, out_logger)
def build_trees(f_name_big, long=5):
    """
    building the trees by iterating over each record in the big file,
    this function also creates a lookup file for merging nodes in the tree
    :param f_name_big: the big file (sorted file)
    :param long: passed through to _look_up and flush_tree
    :return: trees files
    """
    ram_big = ["1", "1", [], 0, 0]
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    lookup_dir = ht.mkdir_system(out_dir, 'lookup')
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    not_done = True
    big_file = open('{}'.format(f_name_big), 'r')
    dico_data = {}
    start = True
    while not_done:
        if start:
            # skip the first line of the big file on the first pass
            # for x in range(30000):
            #     cur_line = big_file.readline()
            cur_line = big_file.readline()
            start = False
        cur_line = big_file.readline()
        if cur_line is None or len(cur_line) < 1:
            break
        end_not = True
        while end_not:
            arr_split_data = str(cur_line).split('@#@')
            id_line = arr_split_data[0]
            with open('{}/log.txt'.format(log_dir), 'a') as log_f:
                log_f.write("{}\n".format(id_line))
            json_line = arr_split_data[1]
            dico_data[id_line] = cur_line
            ans = _look_up(id_line, lookup_dir, long)
            if ans is not None:
                # the id already appears in a lookup file: merge into the existing tree
                flush_tree(dico_data, lookup_dir, trees_dir, long, ans)
                break
            replay = get_replay(json_line)
            if replay is None:
                # the record is not a reply, so the current chain ends here
                end_not = False
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                continue
            print "{}->{}".format(id_line, replay)
            print "len: {} {}".format(len(id_line), len(replay))
            replay_data = binary_search(replay, f_name_big, ram_big)
            if replay_data is None:
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                break
            print "found !!!"
            cur_line = replay_data
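# Sketch of the record layout build_trees expects in the big file: one record per line, the
# tweet id and its json payload joined by the '@#@' separator (the sample values are invented):
#
#   sample = '950285427245678592@#@{"id": "...:950285427245678592", "inReplyTo": ".../950285000000000000"}'
#   tweet_id, json_payload = sample.split('@#@')[0], sample.split('@#@')[1]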
def ram_bulider(f_name_big, tuple_path=None):
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    if tuple_path is not None:
        d_mem = loader(tuple_path, True)
    else:
        d_mem = loader(f_name_big)
    list_val = d_mem.values()
    str_list = [str(x) for x in list_val]
    str_list.sort(key=len, reverse=True)
    loger(str_list, log_dir, 'mem_d.txt', False, True)
    d_json = get_hash_json(f_name_big)
    flush_to_files(d_mem, d_json, trees_dir)
def build_tree_ram(f_name_big, long=5):
    """
    same as build_trees, but the big file is first read into memory (ram_big[2])
    and binary_search is run against that in-RAM copy
    :param f_name_big: the big file (sorted file)
    :param long: passed through to _look_up and flush_tree
    :return: trees files
    """
    ram_big = ["1", "1", [], 0, 0]
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    lookup_dir = ht.mkdir_system(out_dir, 'lookup')
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    not_done = True
    dico_data = {}
    with open(f_name_big, 'r+') as f_big:
        # keep the whole big file in memory so binary_search can work on the RAM copy
        ram_big[2] = f_big.readlines()
    cur_index = 0
    while not_done:
        if cur_index >= len(ram_big[2]):
            break
        cur_line = ram_big[2][cur_index]
        cur_index += 1
        if cur_line is None or len(cur_line) < 1:
            break
        end_not = True
        while end_not:
            arr_split_data = str(cur_line).split('@#@')
            id_line = arr_split_data[0]
            with open('{}/log.txt'.format(log_dir), 'a') as log_f:
                log_f.write("{}\n".format(id_line))
            json_line = arr_split_data[1]
            dico_data[id_line] = cur_line
            ans = _look_up(id_line, lookup_dir, long)
            if ans is not None:
                flush_tree(dico_data, lookup_dir, trees_dir, long, ans)
                break
            replay = get_json_data(json_line)
            if replay is None:
                end_not = False
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                continue
            print "{}->{}".format(id_line, replay)
            print "len: {} {}".format(len(id_line), len(replay))
            replay_data = binary_search(replay, f_name_big, ram_big, True)
            if replay_data is None:
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                break
            print "found !!!"
            cur_line = replay_data
def make_big_json(self, path_sorted, path_out):
    """
    building the big file (merging all the sorted files into one big file)
    :param path_sorted: directory that holds the sorted json files
    :param path_out: directory under which the 'big' folder is created
    :return: None
    """
    big_dir = ht.mkdir_system(path_out, 'big')
    self.get_all_files(path_sorted)
    self.sort_to_big(self.files, big_dir)
def parser_command(arg=None):
    if arg is None:
        arg = sys.argv
    if len(arg) < 2:
        print 'no path was given'
        print "python parser_twitter [path_zip] [path_out] [ram_size=10M]"
        return
    else:
        if arg[1] == 'big':
            rel_path = '/'.join(str(arg[2]).split('/')[:-1])
            out = ht.mkdir_system(rel_path, 'out')
            p_pars = Parser(arg[2], out)
            p_pars.full_process()
            print "done processing all data"
            return
        if arg[1] == 'ram':
            if len(arg) == 3:
                ram_bulider(arg[2])
            else:
                ram_bulider(arg[2], arg[3])
            return
        if arg[1] == 'ana':
            analysis(arg[2])
            return
        if arg[1] == 'size':
            size_file = _get_size_file(arg[2])
            print "SIZE : {}".format(size_file)
            return
        if arg[1] == 'tuple':
            tweet_id_replay_id_file_wrapper(arg[2], arg[3])
        if arg[1] == 'full':
            # run the whole pipeline: unpack + fix + sort + merge, then build trees and analyze
            rel_path = '/'.join(str(arg[2]).split('/')[:-1])
            out = ht.mkdir_system(rel_path, 'out')
            p_pars = Parser(arg[2], out)
            p_pars.full_process()
            big_path = '{}/big/big.json'.format(out)
            ram_bulider(big_path)
            analysis('{}/big/trees'.format(out))
            print "done processing all data"
            print "-----------" * 10
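# Hedged usage sketch for parser_command, following the usage string printed above; the
# script name and the paths are illustrative placeholders, not taken from the repository:
#
#   python parser_twitter.py big   /data/tweets.zip          # unpack, fix and sort into <dir>/out
#   python parser_twitter.py ram   /data/out/big/big.json    # build the trees from the in-memory dict
#   python parser_twitter.py ana   /data/out/big/trees       # run the analysis step on the tree files
#   python parser_twitter.py tuple /data/json_dir /data/out  # dump the (tweet_id, reply_id) tuple files
#   python parser_twitter.py full  /data/tweets.zip          # the whole pipeline in one call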
def flush_all(self, map_dict):
    # hash the first few json file names to get a pseudo-random output directory name
    input_hash = ''
    ctr = 0
    end = 5
    for item_i in self.all_json:
        input_hash = '{}{}'.format(input_hash, item_i)
        if ctr > end:
            break
        else:
            ctr += 1
    random_seq = int(hashlib.sha1(input_hash).hexdigest(), 16) % (10 ** 8)
    out_path_dir = ht.mkdir_system(self.output_path, random_seq)
    for val in map_dict.keys():
        flush(map_dict[val], "{}_{}".format(val, len(map_dict[val])), out_path_dir)
def constract_fix_json_dir(self):
    '''
    decompress all the input archives, fix each json file and
    write sorted copies into a 'sorted' directory under out_dir
    :return: None
    '''
    self.get_all_files()
    for file_gz in self.files:
        self.decompress_file(file_gz, self.out_dir)
    self.get_all_files(self.out_dir)
    for file_i in self.files:
        self.fix_json(file_i)
    self.get_all_files(self.out_dir)
    p_path = ht.mkdir_system(self.out_dir, 'sorted')
    for x in self.files:
        self.sort_file(x, p_path)
    print "done !"
def get_IDs_files(self):
    ou_dir = ht.mkdir_system(self.output_path, 'ids')
    for file_i in self.all_json:
        map_dict = {}
        data_stream = json.load(open(file_i))
        for entry in data_stream['arr_tweets']:
            id_i = long((entry['id']).split(':')[-1])
            node_i = TweetNode(id_i, entry, None)
            if id_i in map_dict:
                # duplicate tweet id inside the same file
                print "[Error]"
            else:
                map_dict[id_i] = None
            if 'inReplyTo' in entry:
                replay_id = str(entry['inReplyTo']).split('/')[-1][:-2]
                node_i.in_replay_to = replay_id
            map_dict[id_i] = node_i
        name_file = str(file_i).split('/')[-1]
        list_sorted = sorted(map_dict.keys())
        with open('{}/{}.txt'.format(ou_dir, name_file), 'w') as file:
            for ky in list_sorted:
                file.write(repr(map_dict[ky]))
                file.write('\n')
    return map_dict