Example #1
def command_parser(args):
    if len(args) > 1:
        if args[1] == 'fix':
            print "args path: {}".format(args[2])
            TP = TwitterParser(args[2], args[2])
            if TP.get_all_json_file():
                TP.fixer_json()
                print "Done !"
            else:
                print "[Error] with the path: {} ".format(args[2])
            exit(0)
        if args[1] == 'sort':
            print "args path: {}".format(args[2])
            sorted_dir = ht.mkdir_system(args[2], 'sorted')
            TP = TwitterParser(args[2], sorted_dir)
            if TP.get_all_json_file():
                TP.sort_all()
                print "Done !"
            else:
                print "[Error] with the path: {} ".format(args[2])
        if args[1] == 'mkbig':
            TP = TwitterParser(args[2], args[3])
            big_dir = ht.mkdir_system(args[3], 'big')
            if TP.get_all_json_file():
                sort_to_big(TP.all_json, big_dir)
        if args[1] == 'build':
            print "path: {}".format(args[2])
            build_trees(args[2])
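For context, command_parser is presumably driven straight by sys.argv; a minimal driver sketch (the script name twitter_parser.py and the example paths are assumptions, not taken from the source):

import sys

if __name__ == '__main__':
    # e.g. python twitter_parser.py fix /data/tweets
    #      python twitter_parser.py sort /data/tweets
    #      python twitter_parser.py mkbig /data/tweets/sorted /data/tweets/out
    command_parser(sys.argv)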
Example #2
def tweet_id_replay_id_file_wrapper(dir_path, out_path):
    '''
    Wrapper for tweet_id_replay_id_file; handles the directory and file bookkeeping.
    :param dir_path: directory containing the JSON files
    :param out_path: directory where the output is written
    :return: None
    '''
    list_files = ht.walk_rec(dir_path, [], '.json')
    out_logger = ht.mkdir_system(out_path, 'loging')
    tuples_out = ht.mkdir_system(out_path, 'tuples')
    for p_path in list_files:
        tweet_id_replay_id_file(p_path, tuples_out, out_logger)
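The ht helper module is used throughout but never shown; a minimal sketch of what ht.walk_rec and ht.mkdir_system appear to do, inferred only from how they are called here (the exact signatures and behavior are assumptions):

import os

def walk_rec(dir_path, acc, suffix):
    # recursively collect the paths of files under dir_path that end with suffix
    for root, _, files in os.walk(dir_path):
        for name in files:
            if name.endswith(suffix):
                acc.append(os.path.join(root, name))
    return acc

def mkdir_system(base_path, name):
    # create (if needed) a sub-directory of base_path and return its path
    new_dir = os.path.join(base_path, str(name))
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    return new_dir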
Example #3
def build_trees(f_name_big, long=5):
    """
    building the trees, by iterate over each record in the big file,
    this function also crate a lookup file for merging nodes in the tree
    :param f_name_big: the big file (sorted file)
    :return: trees files
    """
    ram_big = ["1", "1", [], 0, 0]
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    lookup_dir = ht.mkdir_system(out_dir, 'lookup')
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    not_done = True
    big_file = open('{}'.format(f_name_big), 'r')
    dico_data = {}
    start = True
    while not_done:
        if start:
            # for x in range(30000):
            #    cur_line = big_file.readline()
            cur_line = big_file.readline()
            start = False
        cur_line = big_file.readline()
        if cur_line is None or len(cur_line) < 1:
            break
        end_not = True
        while end_not:
            arr_split_data = str(cur_line).split('@#@')
            id_line = arr_split_data[0]
            with open('{}/log.txt'.format(log_dir), 'a') as log_f:
                log_f.write("{}\n".format(id_line))
            json_line = arr_split_data[1]
            dico_data[id_line] = cur_line
            ans = _look_up(id_line, lookup_dir, long)
            if ans is not None:
                flush_tree(dico_data, lookup_dir, trees_dir, long, ans)
                break
            replay = get_replay(json_line)
            if replay is None:
                end_not = False
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                continue
            print "{}->{}".format(id_line, replay)
            print "len: {} {}".format(len(id_line), len(replay))
            replay_data = binary_search(replay, f_name_big, ram_big)
            if replay_data is None:
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                break
            print "found !!!"
            cur_line = replay_data
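The big file is assumed to hold one record per line, with the tweet id and its JSON payload separated by '@#@'; a small illustration of how such a line is split (the sample values are made up):

sample_line = '1234567890@#@{"id": "1234567890", "text": "hello"}'
arr_split_data = sample_line.split('@#@')
id_line = arr_split_data[0]    # '1234567890'
json_line = arr_split_data[1]  # the raw JSON payload of that tweet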
Example #4
def ram_bulider(f_name_big, tuple_path=None):
    """
    Loads the big file (or the tuples file, when given) into memory, logs the
    in-memory records, and flushes them to the 'trees' directory.
    :param f_name_big: path to the big (sorted) file
    :param tuple_path: optional path to a tuples file to load instead
    :return: None
    """
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    if tuple_path is not None:
        d_mem = loader(tuple_path, True)
    else:
        d_mem = loader(f_name_big)
    list_val = d_mem.values()
    str_list = [str(x) for x in list_val]
    str_list.sort(key=len, reverse=True)
    loger(str_list, log_dir, 'mem_d.txt', False, True)
    d_json = get_hash_json(f_name_big)
    flush_to_files(d_mem, d_json, trees_dir)
Example #5
def build_tree_ram(f_name_big, long=5):
    """
    Variant of build_trees that first loads the whole big file into memory
    (ram_big[2]) and walks the records from there.
    :param f_name_big: path to the big (sorted) file
    :param long: value passed to the lookup helpers (_look_up, flush_tree)
    :return: tree files written to disk
    """
    ram_big = ["1", "1", [], 0, 0]
    out_dir = '/'.join(str(f_name_big).split('/')[:-1])
    lookup_dir = ht.mkdir_system(out_dir, 'lookup')
    trees_dir = ht.mkdir_system(out_dir, 'trees')
    log_dir = ht.mkdir_system(out_dir, 'log')
    not_done = True
    dico_data = {}
    with open(f_name_big, 'r') as f_big:
        ram_big[2] = f_big.readlines()

    cur_index = 0
    while not_done:
        if cur_index >= len(ram_big[2]):
            break
        cur_line = ram_big[2][cur_index]
        cur_index += 1
        if cur_line is None or len(cur_line) < 1:
            break
        end_not = True
        while end_not:
            arr_split_data = str(cur_line).split('@#@')
            id_line = arr_split_data[0]
            with open('{}/log.txt'.format(log_dir), 'a') as log_f:
                log_f.write("{}\n".format(id_line))
            json_line = arr_split_data[1]
            dico_data[id_line] = cur_line
            ans = _look_up(id_line, lookup_dir, long)
            if ans is not None:
                flush_tree(dico_data, lookup_dir, trees_dir, long, ans)
                break
            replay = get_json_data(json_line)
            if replay is None:
                end_not = False
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                continue
            print "{}->{}".format(id_line, replay)
            print "len: {} {}".format(len(id_line), len(replay))
            replay_data = binary_search(replay, f_name_big, ram_big, True)
            if replay_data is None:
                flush_tree(dico_data, lookup_dir, trees_dir, long)
                dico_data = {}
                break
            print "found !!!"
            cur_line = replay_data
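binary_search itself is not included; a minimal sketch of how a lookup over the in-memory copy of the big file might work, assuming ram_big[2] holds the sorted lines and each line starts with the id followed by '@#@' (the real signature and the meaning of the other ram_big slots are assumptions):

def binary_search_ram_sketch(target_id, lines):
    # classic binary search over sorted record lines, keyed by the id prefix
    lo, hi = 0, len(lines) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        mid_id = lines[mid].split('@#@')[0]
        if mid_id == target_id:
            return lines[mid]  # the full record line for that id
        if mid_id < target_id:
            lo = mid + 1
        else:
            hi = mid - 1
    return None  # the id is not present in the big file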
Example #6
 def make_big_json(self, path_sorted, path_out):
     """
     building the big file (merge all the sorted file into one big file)
     :param path_sorted:
     :param path_out:
     :return:
     """
     big_dir = ht.mkdir_system(path_out, 'big')
     self.get_all_files(path_sorted)
     self.sort_to_big(self.files, big_dir)
Example #7
def parser_command(arg=None):
    if arg is None:
        arg = sys.argv
    if len(arg) < 2:
        print 'no path was given'
        print "python parser_twitter [path_zip] [path_out] [ram_size=10M]"
        return
    else:
        if arg[1] == 'big':
            rel_path = '/'.join(str(arg[2]).split('/')[:-1])
            out = ht.mkdir_system(rel_path, 'out')
            p_pars = Parser(arg[2], out)
            p_pars.full_process()
            print "done process all data"
            return
        if arg[1] == 'ram':
            if len(arg) == 3:
                ram_bulider(arg[2])
            else:
                ram_bulider(arg[2], arg[3])
            return
        if arg[1] == 'ana':
            analysis(arg[2])
            return
        if arg[1] == 'size':
            size_file = _get_size_file(arg[2])
            print "SIZE : {}".format(size_file)
            return
        if arg[1] == 'tuple':
            tweet_id_replay_id_file_wrapper(arg[2], arg[3])
        if arg[1] == 'full':
            rel_path = '/'.join(str(arg[2]).split('/')[:-1])
            out = ht.mkdir_system(rel_path, 'out')
            p_pars = Parser(arg[2], out)
            p_pars.full_process()
            big_path = '{}/big/big.json'.format(out)
            ram_bulider(big_path)
            analysis('{}/big/trees'.format(out))
            print "done process all data"
        print "-----------" * 10
Example #8
 def flush_all(self, map_dict):
     """
     Flushes every list in map_dict to its own file, under a directory whose
     name is derived from a SHA-1 hash of the first few JSON file paths.
     :param map_dict: mapping from a key to the list of records to write
     :return: None
     """
     input_hash = ''
     ctr = 0
     end = 5
     for item_i in self.all_json:
         input_hash = '{}{}'.format(input_hash, item_i)
         if ctr > end:
             break
         else:
             ctr += 1
     random_seq = int(hashlib.sha1(input_hash).hexdigest(), 16) % (10**8)
     out_path_dir = ht.mkdir_system(self.output_path, random_seq)
     for val in map_dict.keys():
         flush(map_dict[val], "{}_{}".format(val, len(map_dict[val])),
               out_path_dir)
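flush itself is not shown; a minimal sketch of what it appears to do, writing one record per line into a file named after the key inside the output directory (the file extension and record formatting are assumptions):

def flush(records, name, out_dir):
    # write every record of the list as one line of the output file
    with open('{}/{}.txt'.format(out_dir, name), 'w') as out_f:
        for rec in records:
            out_f.write('{}\n'.format(rec))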
Example #9
 def constract_fix_json_dir(self):
     '''
     Decompresses the gzip archives, fixes the resulting JSON files, and then
     sorts each fixed file into a 'sorted' directory under the output directory.
     :return: None
     '''
     self.get_all_files()
     for file_gz in self.files:
         self.decompress_file(file_gz, self.out_dir)
     self.get_all_files(self.out_dir)
     for file_i in self.files:
         self.fix_json(file_i)
     self.get_all_files(self.out_dir)
     p_path = ht.mkdir_system(self.out_dir, 'sorted')
     for x in self.files:
         self.sort_file(x, p_path)
     print "done !"
Example #10
 def get_IDs_files(self):
     """
     For every JSON file in self.all_json, extracts the tweet ids (and the id
     each tweet replies to, when present), sorts them, and writes one id file
     per input file into an 'ids' directory.
     :return: the id -> TweetNode map of the last processed file
     """
     ou_dir = ht.mkdir_system(self.output_path, 'ids')
     for file_i in self.all_json:
         map_dict = {}
         data_stream = json.load(open(file_i))
         for entry in data_stream['arr_tweets']:
             id_i = long((entry['id']).split(':')[-1])
             node_i = TweetNode(id_i, entry, None)
             if id_i in map_dict:
                 print "[Error]"
             else:
                 map_dict[id_i] = None
             if 'inReplyTo' in entry:
                 replay_id = str(entry['inReplyTo']).split('/')[-1][:-2]
                 node_i.in_replay_to = replay_id
             map_dict[id_i] = node_i
         name_file = str(file_i).split('/')[-1]
         list_sorted = sorted(map_dict.keys())
         with open('{}/{}.txt'.format(ou_dir, name_file), 'w') as out_f:
             for ky in list_sorted:
                 out_f.write(repr(map_dict[ky]))
                 out_f.write('\n')
     return map_dict
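TweetNode is referenced but not defined here; a minimal sketch consistent with how it is used above (constructed with an id, the raw entry and a parent placeholder, given an in_replay_to field, and written to disk via repr); the remaining field names and the repr format are assumptions:

class TweetNode(object):
    def __init__(self, tweet_id, entry, parent):
        self.tweet_id = tweet_id   # numeric id of the tweet
        self.entry = entry         # the raw JSON entry of the tweet
        self.parent = parent       # parent node in the reply tree, if any
        self.in_replay_to = None   # id of the tweet this one replies to

    def __repr__(self):
        return '{} -> {}'.format(self.tweet_id, self.in_replay_to)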