def main(argv, stop_after_init=False, preset_set_of_users=None):
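    # Gzip-compressed cache files for the intermediate results computed below, so the expensive steps can be
    # reloaded from disk instead of being recomputed on every run.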
    pickle_path_lptq                    = '/tmp/process_log_times_pq.bin.gz'
    pickle_path_clicks                  = '/tmp/process_log_clicks.bin.gz'
    pickle_path_clusters                = '/tmp/process_log_clusters.dict.txt.gz'
    pickle_path_removed_queries         = '/tmp/process_log_removed_queries.lst.txt.gz'
    pickle_path_big_queries_set         = '/tmp/process_log_big_queries_set.lst.txt.gz'
    pickle_path_users                   = '/tmp/process_log_usets_set.lst.txt.gz'

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print "Usage: %s"  % argv[0], ' '.join(CLI_ARGS)
        print "Currently missing parameters arguments:", ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    global mdb_host
    mdb_host                = argv[1].strip()
    filter_queries_file     = argv[2].strip()
    allowed_users_file      = argv[3].strip()
    log_filepath            = argv[4].strip()
    clusters_file           = argv[5].strip()
    queries_to_ids_file     = argv[6].strip()

    t_init = time()

    # print "Starting... compute_everything()"
    # t0 = time()
    # everything = compute_everything(mdb_host, filter_queries_file, allowed_users_file)
    # removed_queries = everything['removed_queries']
    # print "Done ", time()-t0


    ########################################################################################################################
    # We are going to do a lot of "is this user in the allowed users?" membership tests, so we need a set, not a list
    print "Loading users..."
    # users_set = set([int(line.strip()) for line in univ_open(allowed_users_file, mode='r')])
    # We use compute_everything because it gets rid of the null-clusters queries before retrieving the list
    # of users, thus reducing the dataset overall, as queries are then retrieved from the users set
    t0 = time()
    global users_set
    if preset_set_of_users is not None:
        users_set = preset_set_of_users
    else:
        try:
            print "Trying to pickle from disk...", pickle_path_users
            with gzopen(pickle_path_users, 'r') as f:
                print "File", pickle_path_users, "was found!"
                users_set = set(load_pickled_list(f))
            pickled = True
        except Exception as err:
            print "Error for", pickle_path_users, "was:", err
            # if not isinstance(err, IOError):
            print "No pickled files or error loading it, recomputing..."
            pickled = False
            # Note: we use compute_everything() here because it loads the clusters OF THE INITIAL (seed) QUERIES only,
            # removes the ones that have null clusterings, and then generates the list of users who queried the pruned
            # list of queries.
            # We do not directly reuse its clusters, nor its queries, because we still have to remove the OTHER queries
            # that have null clustering vectors. By "other queries" we mean not the seed queries used to select the
            # users/data, but any query that is part of the profile of one of the allowed users (the ones who queried
            # the seed query list). That bigger queries set is generated by load_big_query_set() in this file.
            users_set = set(compute_everything(mdb_host, filter_queries_file, allowed_users_file)['users'])
        print "Done ", time()-t0
        print "Total number of users that will be analyzed:", len(users_set)
        pickle_ask(pickled, pickle_path_users, users_set, dump_f=pickle_list)
        print "Done ", time()-t0
        # everything = None  # We are not using it afterwards, so, this should help the GC

    ####################################################################################################################

    # import itertoolsmodule as iter
    print "Computing the set of allowed queries..."
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_big_queries_set
        with gzopen(pickle_path_big_queries_set, 'r') as f:
            big_queries_set = set(load_pickled_list(f))
        pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_big_queries_set, "was:", err
        print "No pickled files or error loading it, recomputing..."
        pickled = False
        big_queries_set = load_big_query_set(log_filepath, users_set)
    print "Done ", time()-t0
    print "Total number of queries that will be analyzed:", len(big_queries_set)
    pickle_ask(pickled, pickle_path_big_queries_set, big_queries_set, dump_f=pickle_list)
    
    ####################################################################################################################

    global clusters
    print "Pre-initializing clusters dict..."
    t0 = time()
    clusters = dict.fromkeys(big_queries_set)
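    # dict.fromkeys() maps every query id to None for now; the actual cluster vectors are filled in once the clusters
    # are loaded from disk or recomputed (see join_clusters() below).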
    print "clusters now has", len(clusters), "keys"
    print "Done ", time()-t0

    print "Retrieving big list of clusters for the", len(big_queries_set), "queries..."
    t0 = time()

    global clusters_loaded
    clusters_loaded = False
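    # Load the pickled clusters in a background process so the (potentially slow) disk read overlaps with the work
    # below; join_clusters() blocks only when the clusters are actually needed.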
    p_clusters, mapres_clusters = run_in_bg_process(do_process_clusters_pickle, (pickle_path_clusters,))
    
    def join_clusters():
        p_clusters.join()
        global clusters, clusters_loaded
        if clusters_loaded:
            return clusters
        result = mapres_clusters.get()[0]
        if result is False:
            # Loading the pickle from disk did not work, so recompute in place (join_clusters() is called when the
            # clusters are NEEDED, so we cannot defer this to an async task)
            print "Error while loading pickled clusters from disk", pickle_path_clusters, ", recomputing..."
            t0 = time()
            result = do_process_clusters_recompute(big_queries_set, clusters_file, queries_to_ids_file, clusters)
            print "Done do_process_clusters_recompute()", time()-t0
            # Any user input needs to be on the main thread, pickle ask will by itself send the pickling task to a bg
            # worker process if the user answers yes
            pickle_ask(False, pickle_path_clusters, result, dump_f=pickle_dict)
        clusters_loaded = True
        clusters = result
        return clusters

    ########################################################################################################################
    removed_queries = compute_removed_queries_because_of_null_clustering(pickle_path_removed_queries, clusters, join_clusters)
    print "Removed", len(removed_queries), "out of", len(big_queries_set)
    ########################################################################################################################
    t1 = time()
    print "Launching process_log_clicks computation in a separated process"
    p_lpc, lpc_mapres = run_in_bg_process(process_log_clicks, (log_filepath, users_set, removed_queries))
    p_lpc.close()
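    # close() only marks the pool as accepting no more work; the result is collected via lpc_mapres.get() further
    # below, after process_log_times_pq() has run in the meantime.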
    ########################################################################################################################
    ########################################################################################################################

    print "Starting... process_log_times_pq()"
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_lptq
        lptpq = pload(gzopen(pickle_path_lptq, 'rb'))
        pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_lptq, "was:", err
        print "No pickled files or error loading it, recomputing..."
        pickled = False
        lptpq = process_log_times_pq(log_filepath, users_set, removed_queries)
    print "Done process_log_times_pq() in", time()-t0
    pickle_ask(pickled, pickle_path_lptq, lptpq)

    print "Starting... process_log_clicks()"
    t0 = time()
    # Note: Disabled the pickling as, for some reason, it does not work
    # and there is only ~15s difference between recomputation and pickling from disk anyway...
    # try:
    #     print "Trying to pickle from disk..."
    #     lpc = pload(open(pickle_path_clicks, 'rb'))
    #     pickled = True
    # except Exception as err:
    #     if not isinstance(err, IOError):
    #         print "Error was:", err
    #     print "No pickled files or error loading it, recomputing..."
    #     pickled = False
    #     lpc = process_log_clicks(log_filepath, users_set, removed_queries)
    ########################################################################################################################
    ########################################################################################################################
    print "waiting for the pool to finish, if not finished yet..."
    p_lpc.join()
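    # get() returns the list of results produced by the background run; there is a single task, hence [0]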
    lpc = lpc_mapres.get()[0]
    print "Took a total time of", time()-t1, "or less"
    ########################################################################################################################
    ########################################################################################################################
    print "Done ", time()-t0
    # pickle_ask(pickled, pickle_path_clicks, lpc)

    print "Some reprocessing..."
    # We need the clusters from now on, so let us wait for the child process to finish and the data to be
    # transferred back to us
    join_clusters()
    print "Removing null-vectors clusters queries from `clusters`..."
    t0 = time()
    for qid in removed_queries:
        try:
            del clusters[qid]
        except KeyError:
            pass  # If it was already not there, that's perfect
    print "Done ", time()-t0


    t0 = time()
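    # Strip the MongoDB '_id' field from each per-user document so that only query ids remain as keys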
    for user_queries_dic in lpc.user_clicks_number:
        if user_queries_dic is None:
            continue
        del user_queries_dic['_id']
    for user_queries_dic in lptpq.user_queries_number:
        if user_queries_dic is None:
            continue
        del user_queries_dic['_id']
    print "Done ", time()-t0

    # Deprecated for now, but we might switch back to it, so keep it for the time being
    print "Computing number of users who issued the query, per query..."
    t0 = time()
    number_of_users_who_queried = dict.fromkeys(big_queries_set - removed_queries, 0)
    for query_dict in lptpq.user_queries_number:
        if query_dict is None:
            continue
        for qid in query_dict:
            number_of_users_who_queried[qid] += 1
    print "Done ", time()-t0

    print "Computing number of users who clicked, per query..."
    t0 = time()
    number_of_users_who_clicked = dict.fromkeys(big_queries_set - removed_queries, 0)
    for query_dict in lpc.user_clicks_number:
        if query_dict is None:
            continue
        for qid in query_dict:
            number_of_users_who_clicked[qid] += 1
    print "Done ", time()-t0

    # Help the GC: drop references to data we no longer need
    big_queries_set = None
    removed_queries = None

    # print "Some reprocessing..."
    # t0 = time()
    # for user_queries_dic in lpc.user_clicks_number:
    #     if user_queries_dic is None:
    #         continue
    #     del user_queries_dic['_id']
    #     for q in removed_queries:
    #         try:
    #             del user_queries_dic[q]
    #         except KeyError:
    #             # key was not there? fine, we did not need to delete it then
    #             pass
    # for user_queries_dic in lptpq.user_queries_number:
    #     if user_queries_dic is None:
    #         continue
    #     del user_queries_dic['_id']
    #     for q in removed_queries:
    #         try:
    #             del user_queries_dic[q]
    #         except KeyError:
    #             # key was not there? fine, we did not need to delete it then
    #             pass
    print "Done ", time()-t0

    print "Starting..."
    t0 = time()
    us.init(
        lpc.user_clicks_number,
        lptpq.user_queries_number,
        clusters,
        users_set,
        number_of_users_who_queried,
        number_of_users_who_clicked
    )
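    # us.init() presumably stores these structures as module-level state in the user-similarity module, which
    # compute_user_sim_batch() (run by the worker pool below) relies on.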
    print "Done ", time()-t0
    # Note: At this point in the main() execution, the script takes ~2.5G of RAM.

    print "Total initialization phase time:", time()-t_init

    if stop_after_init:
        return

    print "Initializing users similarity computation phase..."
    # Similarity computation benchmark:
    t0 = time()
    i = 0
    global DATA_SET_SIZE
    DATA_SET_SIZE = len(users_set)
    # Note: too small a batch size wastes time respawning processes and re-generating the user_sim module cache,
    # but too large a batch size will overwhelm MongoDB and the computer's RAM (as one batch-size unit is one user
    # computed by the process, and the process commits everything at once)
    print "Generating sorted users set..."

    print "Generating workers pool..."
    p = Pool(processes=POOL_SIZE)
    start_values = range(0, DATA_SET_SIZE, BATCH_SIZE)
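    # Each start value is the offset of one batch: a single compute_user_sim_batch() call handles the next
    # BATCH_SIZE users starting at that offset.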
    print "Mapping (launching) pool to", len(start_values), "different start_values", start_values
    t0 = time()
    p.map(compute_user_sim_batch, start_values)
    p.close()
    p.join()
    print "Workers finished in %.3f." % (time()-t0)

    # for u in users_set:
    #     for u2 in users_set:
    #         i += 1
    #         try:
    #             us.sim(u, u2)
    #         except KeyError as err:
    #             print err
    #             key = err.args[0]
    #             print key, "in big_queries_set?", key in big_queries_set
    #             print key, "in removed_queries?", key in removed_queries
    #             print key, "in clusters?", key in clusters
    #             res = False
    #             for u_dict in lpc.user_clicks_number:
    #                 if u_dict is not None:
    #                     res |= (key in u_dict)
    #             print key, "in clicks?", res
    #             res = False
    #             for u_dict in lptpq.user_queries_number:
    #                 if u_dict is not None:
    #                     res |= (key in u_dict)
    #             print key, "in user_queries_number?", res
    #         if i % 10000 == 0:
    #             print i+1, "\t\tsim() calls in\t\t", time()-t0, "\t\taverage\t\t", (time()-t0)/float(i+1)

    raw_input("Now what?")


def top_k_queries(k, host, queries_filter_file, allowed_users_file, ids_mapping_file):
    queries_ids = compute_everything(host, queries_filter_file, allowed_users_file, k)['q_list']
    queries_strings_indexed_by_id = [l.strip() for l in univ_open(ids_mapping_file)]
    queries_strings = [queries_strings_indexed_by_id[i] for i in queries_ids]
    return queries_strings
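

# A minimal entry-point sketch, in case this module is meant to be run directly; this is an assumption, and the
# original codebase may already provide its own launcher elsewhere, so it is left commented out.
# if __name__ == '__main__':
#     import sys
#     main(sys.argv)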