def tortoise(pure=False,
             force_matrix_creation=False,
             skip_matrix_creation=False,
             last_run=None):
    assert not (force_matrix_creation and skip_matrix_creation), \
        "Cannot both force and skip matrix creation"
    # Pure results require the matrices to be recomputed,
    # so 'pure' implies forcing their creation.
    force_matrix_creation = force_matrix_creation or pure

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")
        exit_statuses = schedule_create_matrix(
            clusters,
            sizes,
            force=force_matrix_creation)
        assert len(exit_statuses) == len(clusters)
        assert all(stat == os.EX_OK for stat in exit_statuses)

    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(
        clusters,
        sizes)
    assert len(exit_statuses) == len(clusters)
    assert all(stat == os.EX_OK for stat in exit_statuses)
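A minimal usage sketch (hypothetical calls, not from the original source), showing how the flags interact:

# Incremental run; matrices cached by a previous run are reused.
tortoise()
# 'pure' implies force_matrix_creation, so every matrix is rebuilt.
tortoise(pure=True)
# Skip straight to disambiguation, reusing matrices built earlier.
tortoise(skip_matrix_creation=True)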
Example #3
def tortoise_from_scratch():
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")
    exit_statuses = schedule_create_matrix(cluster_sets, sizes, force=True)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)

    empty_results_table()

    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(cluster_sets, sizes)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)
# Requires (Python 2): import operator
#                      from itertools import izip, repeat, starmap
def approximate(xs, ys, power):
    assert len(xs) == len(ys)

    matrix_size = power + 1
    variables = 2 * power + 1

    xs = map(float, xs)
    ys = map(float, ys)

    # Rows are the element-wise powers xs**0, xs**1, ..., xs**(variables-1).
    xs = reduce(lambda x, y: x + [list(starmap(operator.mul, izip(x[-1], y)))],
                repeat(xs, variables - 1), [[1] * len(xs)])
    assert len(xs) == variables

    s = map(sum, xs)
    assert s[0] == len(ys)

    b = [sum(starmap(operator.mul, izip(ys, x))) for x in xs[:matrix_size]]
    a = [s[i:i + matrix_size] for i in xrange(matrix_size)]

    # So, we have a*x = b and we are looking for x
    matr = [ai + [bi] for ai, bi in izip(a, b)]

    def unify_row(i, j):
        matr[i] = [cell / matr[i][j] for cell in matr[i]]
        assert matr[i][j] == 1

    def subtract_row(i, j, row):
        assert matr[i][j] == 1

        matr[row] = [matr[row][k] - matr[i][k] * matr[row][j]
                     for k in xrange(len(matr[i]))]

        assert matr[row][j] == 0

# NOTE: Example for matrix_size = 3
#    unify_row(0, 0)
#    subtract_row(0, 0, 1)
#    subtract_row(0, 0, 2)
#    unify_row(1, 1)
#    subtract_row(1, 1, 2)
#    unify_row(2, 2)
#    subtract_row(2, 2, 1)
#    subtract_row(2, 2, 0)
#    subtract_row(1, 1, 0)

    for i in xrange(matrix_size):
        unify_row(i, i)
        for j in xrange(matrix_size - i - 1):
            subtract_row(i, i, i + j + 1)

    for i in xrange(matrix_size):
        for j in xrange(matrix_size - i - 1):
            subtract_row(matrix_size - i - 1, matrix_size - i - 1, j)

    assert all(matr[i][:matrix_size] == ([0] * i) + [1] +
               ([0] * (matrix_size - 1 - i)) for i in xrange(matrix_size))

    ret = map(operator.itemgetter(matrix_size), matr)

    return ret
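approximate() fits a polynomial of degree 'power' by least squares: it accumulates the power sums into the normal equations a*x = b and solves them by Gauss-Jordan elimination, returning the coefficients of x**0 through x**power. A small sanity check (hypothetical data, not from the original source):

xs = [0, 1, 2, 3, 4]
ys = [x * x + x + 1 for x in xs]   # exact quadratic, no noise
coeffs = approximate(xs, ys, 2)
# coeffs is approximately [1.0, 1.0, 1.0], i.e. 1 + x + x**2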
Example #6
def __init__(self, personid_records):
    '''
    @param personid_records:
        A list of tuples: (personid, bibrefrec, flag).
        Note that all bibrefrecs should be the same,
        since the Blob represents only one bibrefrec.
    '''
    self.bib = personid_records[0][1]
    assert all(p[1] == self.bib for p in personid_records), \
        "All cluster sets should share the bibrefrec"
    self.claimed = set()
    self.assigned = set()
    self.rejected = set()
    for pid, _, flag in personid_records:
        if flag > 1:        # user-claimed
            self.claimed.add(pid)
        elif flag >= -1:    # algorithmically assigned
            self.assigned.add(pid)
        else:               # user-rejected
            self.rejected.add(pid)
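A hypothetical construction (assuming this is the Blob initializer named in the docstring; the bibrefrec value and personids below are illustrative only):

records = [(7, 'bibrefrec-1', 2),    # flag > 1        -> claimed
           (8, 'bibrefrec-1', 0),    # -1 <= flag <= 1 -> assigned
           (9, 'bibrefrec-1', -2)]   # flag < -1       -> rejected
blob = Blob(records)
# blob.claimed == set([7]), blob.assigned == set([8]), blob.rejected == set([9])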
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    # av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert 0 < cur_clustnumber < cur_maxlen, \
                "Error, found log with strange clustnumber! %s %s %s %s" % \
                (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster without coefficients? %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(0 <= x <= 1 for x in cur_coeffs), \
                    "Error, a coefficient is out of range! Check me! %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta, '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                # number of points
                coeff_stats[cur_coef][0] = avi + 1
                # average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                # min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                # max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                # avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                # normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

                update_status(status + 0.6 * quanta, '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1] * avi + cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x]))
                                        for x in cluster_stats.iterkeys()),
                  'coeff_stats': dict(coeff_stats)}, f)
        f.close()
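Every six-slot vector is updated incrementally: slot 0 holds the sample count and the averaged slots (1, 4, 5) use the running-average identity new = (old * n + x) / (n + 1). A standalone check of that identity (illustrative only):

vals = [0.2, 0.5, 0.8]
avg, n = 0., 0.
for v in vals:
    avg = (avg * n + v) / (n + 1)
    n += 1
assert abs(avg - sum(vals) / len(vals)) < 1e-12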
Example #10
def schedule(jobs, sizs, estimator, memfile_path=None):
    if bconfig.DEBUG_PROCESS_PEAK_MEMORY and memfile_path:

        def register_memory_usage():
            pid = os.getpid()
            peak = get_peak_mem()
            fp = open(memfile_path, 'a')
            print_tortoise_memory_log(
                {
                    'pid': pid,
                    'peak1': peak[0],
                    'peak2': peak[1],
                    'est': sizs[idx],
                    'bibs': bibs[idx]
                }, fp)
            fp.close()
    else:

        def register_memory_usage():
            pass

    def run_job(idx):
        try:
            sys.stdout = output_killer
            jobs[idx]()
            register_memory_usage()
            os._exit(os.EX_OK)
        except Exception as e:
            f = open('/tmp/exception-%s' % str(os.getpid()), "w")
            f.write(str(e) + '\n')
            f.close()
            os._exit(os.EX_SOFTWARE)

    max_workers = get_cores_count()
    pid_2_idx = {}
    # free = get_free_memory()
    initial = get_total_memory()
    free = initial
    output_killer = open(os.devnull, 'w')

    ret_status = [None] * len(jobs)
    bibs = sizs                    # keep the raw sizes for the memory log
    sizs = map(estimator, sizs)    # translate sizes to estimated memory usage
    free_idxs = range(len(jobs))
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len(
        free_idxs)

    done = 0.
    total = sum(sizs)
    biggest = max(sizs)

    logger.update_status(0., "0 / %d" % len(jobs))
    too_big = [idx for idx in free_idxs if sizs[idx] > free]
    for idx in too_big:
        pid = os.fork()
        if pid == 0:  # child
            run_job(idx)
        else:  # parent
            done += sizs[idx]
            del free_idxs[idx]
            cpid, status = os.wait()
            logger.update_status(
                done / total,
                "%d / %d" % (len(jobs) - len(free_idxs), len(jobs)))
            ret_status[idx] = status
            assert cpid == pid

    while free_idxs or pid_2_idx:
        while len(pid_2_idx) < max_workers:
            idx = get_biggest_job_below(free, (sizs[idx] for idx in free_idxs))
            if idx != -1:
                job_idx = free_idxs[idx]
                pid = os.fork()
                if pid == 0:  # child
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else:  # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        logger.update_status(
            done / total, "%d / %d" %
            (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    logger.update_status_final("%d / %d" % (len(jobs), len(jobs)))
    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat is not None for stat in ret_status)

    return ret_status
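A hedged usage sketch: each job is an argument-less callable run in a forked child, and the estimator maps an entry of sizs (kept as bibs for the memory log) to an estimated memory footprint used to pack workers. The jobs and estimator below are stand-ins, not from the original source:

def make_job(n):
    def job():
        sum(xrange(n))   # stand-in for the real matrix work
    return job

sizes = [10, 1000, 100000]
jobs = [make_job(n) for n in sizes]
statuses = schedule(jobs, sizes, lambda n: 1024 * n)
assert all(stat == os.EX_OK for stat in statuses)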
Example #14
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()

    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())

        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except ValueError:  # no data points
            wscf = 0
        w = [float(i) / wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]

        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except ValueError:  # no data points
            ml = 1
        lengs = [k / ml for k in lengs]

        normalengs = [data[k][5] for k in x]

        ax.plot(x, y, '-o', label='avg')
        ax.plot(x, maxi, '-o', label='max')
        ax.plot(x, mini, '-o', label='min')
        ax.plot(x, w, '-x', label='norm %s' % str(wscf))
        ax.plot(x, lengs, '-o', label='acl %s' % str(int(ml)))
        ax.plot(x, normalengs, '-o', label='ncl')
        plt.ylim(ymax=1., ymin=-0.01)
        plt.xlim(xmax=1., xmin=-0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=6,
                  mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    # av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = filehandler.open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert 0 < cur_clustnumber < cur_maxlen, \
                "Error, found log with strange clustnumber! %s %s %s %s" % \
                (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster without coefficients? %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(0 <= x <= 1 for x in cur_coeffs), \
                    "Error, a coefficient is out of range! Check me! %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta, '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                # number of points
                coeff_stats[cur_coef][0] = avi + 1
                # average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                # min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                # max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                # avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                # normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

                update_status(status + 0.6 * quanta, '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1] * avi + cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x]))
                                        for x in cluster_stats.iterkeys()),
                  'coeff_stats': dict(coeff_stats)}, f)
        f.close()
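A hypothetical invocation (paths as hard-coded above): read the reports under /tmp/baistats/, regenerate the graphs under /tmp/graphs/, and pickle the aggregated statistics for later inspection.

tortoise_coefficient_statistics(pickle_output='/tmp/coeff_stats.pickle',
                                generate_graphs=True)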