def tortoise(pure=False,
             force_matrix_creation=False,
             skip_matrix_creation=False,
             last_run=None):
    assert not force_matrix_creation or not skip_matrix_creation

    # The computation must be forced in case we want
    # to compute pure results.
    force_matrix_creation = force_matrix_creation or pure

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")
        exit_statuses = schedule_create_matrix(clusters, sizes,
                                               force=force_matrix_creation)
        assert len(exit_statuses) == len(clusters)
        assert all(stat == os.EX_OK for stat in exit_statuses)

    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(clusters, sizes)
    assert len(exit_statuses) == len(clusters)
    assert all(stat == os.EX_OK for stat in exit_statuses)


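# A quick reference for the flag interplay in tortoise() above, derived
# directly from the assert and the `force_matrix_creation or pure` line:
#
#   force  skip   pure   -> effect on the matrix phase
#   False  False  False  -> build only the missing matrices (force=False)
#   True   False  any    -> rebuild all matrices
#   False  False  True   -> rebuild all matrices (pure implies force)
#   False  True   False  -> skip the matrix phase entirely
#   True   True   any    -> AssertionError (mutually exclusive flags)
#
# (pure=True together with skip_matrix_creation=True passes the assert;
# the implied force is then moot because the matrix phase is skipped.)

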
def tortoise_from_scratch():
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")
    exit_statuses = schedule_create_matrix(cluster_sets, sizes, force=True)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)

    empty_results_table()

    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(cluster_sets, sizes)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)


import operator
from itertools import izip, repeat, starmap


def approximate(xs, ys, power):
    '''
    Least-squares fit of a polynomial of degree `power` to the points
    (xs, ys): build the normal equations and solve them in place with
    Gauss-Jordan elimination. Returns the coefficients lowest power
    first, i.e. ret[k] is the coefficient of x**k.
    '''
    assert len(xs) == len(ys)
    matrix_size = power + 1
    variables = 2 * power + 1
    xs = map(float, xs)
    ys = map(float, ys)
    # xs becomes a table of powers: xs[k][i] == original_xs[i] ** k,
    # for k = 0 .. 2 * power.
    xs = reduce(lambda x, y: x + [list(starmap(operator.mul, izip(x[-1], y)))],
                repeat(xs, variables - 1),
                [[1] * len(xs)])
    assert len(xs) == variables
    s = map(sum, xs)
    assert s[0] == len(ys)
    b = [sum(starmap(operator.mul, izip(ys, x))) for x in xs[:matrix_size]]
    a = [s[i:i + matrix_size] for i in xrange(matrix_size)]

    # So, we have a*x = b and we are looking for x.
    matr = [ai + [bi] for ai, bi in izip(a, b)]

    def unify_row(i, j):
        matr[i] = [cell / matr[i][j] for cell in matr[i]]
        assert matr[i][j] == 1

    def subtract_row(i, j, row):
        assert matr[i][j] == 1
        matr[row] = [matr[row][k] - matr[i][k] * matr[row][j]
                     for k in xrange(len(matr[i]))]
        assert matr[row][j] == 0

    # NOTE: Example for matrix_size = 3
    # unify_row(0, 0)
    # subtract_row(0, 0, 1)
    # subtract_row(0, 0, 2)
    # unify_row(1, 1)
    # subtract_row(1, 1, 2)
    # unify_row(2, 2)
    # subtract_row(2, 2, 1)
    # subtract_row(2, 2, 0)
    # subtract_row(1, 1, 0)
    for i in xrange(matrix_size):
        unify_row(i, i)
        for j in xrange(matrix_size - i - 1):
            subtract_row(i, i, i + j + 1)

    for i in xrange(matrix_size):
        for j in xrange(matrix_size - i - 1):
            subtract_row(matrix_size - i - 1, matrix_size - i - 1, j)

    assert all(matr[i][:matrix_size] ==
               ([0] * i) + [1] + ([0] * (matrix_size - 1 - i))
               for i in xrange(matrix_size))

    ret = map(operator.itemgetter(matrix_size), matr)
    return ret


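# A minimal usage sketch for approximate() (assumes Python 2, matching the
# izip/xrange idioms above). Points sampled exactly from y = 1 + 2x + 3x**2
# should give back the coefficients [1.0, 2.0, 3.0], lowest power first,
# up to floating-point error. The sample points are made up.
def _approximate_demo():
    xs = [0.0, 1.0, 2.0, 3.0, 4.0]
    ys = [1.0 + 2.0 * x + 3.0 * x ** 2 for x in xs]
    coeffs = approximate(xs, ys, 2)
    assert all(abs(c - e) < 1e-6 for c, e in zip(coeffs, [1.0, 2.0, 3.0]))
    return coeffs

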
def __init__(self, personid_records):
    '''
    @param personid_records: A list of tuples: (personid, bibrefrec, flag).
        Notice that all bibrefrecs should be the same, since the Blob
        represents only one bibrefrec.
    '''
    self.bib = personid_records[0][1]
    assert all(p[1] == self.bib for p in personid_records), \
        "All cluster sets should share the bibrefrec"
    self.claimed = set()
    self.assigned = set()
    self.rejected = set()
    # The flag encodes the curation status of the (person, paper) pair:
    # > 1 means claimed, < -1 rejected, and anything in [-1, 1] an
    # (automatic) assignment.
    for pid, _, flag in personid_records:
        if flag > 1:
            self.claimed.add(pid)
        elif flag >= -1:
            self.assigned.add(pid)
        else:
            self.rejected.add(pid)


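# A minimal usage sketch for the constructor above; it assumes the Blob
# class it belongs to is in scope. The bibrefrec value and the personids
# are made-up illustrative data; only the flag thresholds come from the
# code (> 1 claimed, in [-1, 1] assigned, < -1 rejected).
def _blob_demo():
    bibrefrec = (100, 1, 1)          # hypothetical bibrefrec value
    records = [(1, bibrefrec, 2),    # flag > 1        -> claimed by person 1
               (2, bibrefrec, 0),    # -1 <= flag <= 1 -> assigned to person 2
               (3, bibrefrec, -2)]   # flag < -1       -> rejected for person 3
    blob = Blob(records)
    assert blob.claimed == set([1])
    assert blob.assigned == set([2])
    assert blob.rejected == set([3])

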
def schedule(jobs, sizs, estimator, memfile_path=None):
    '''
    Fork one child process per job, keeping at most one child per core
    alive and never over-committing the estimated memory budget. Jobs
    whose estimate exceeds the whole budget are run up front, one at a
    time. Returns the list of exit statuses, one per job.
    '''
    if bconfig.DEBUG_PROCESS_PEAK_MEMORY and memfile_path:
        def register_memory_usage(idx):
            # The job index is passed in explicitly so that the logged
            # estimate refers to the job that actually ran.
            pid = os.getpid()
            peak = get_peak_mem()
            fp = open(memfile_path, 'a')
            print_tortoise_memory_log(
                {'pid': pid,
                 'peak1': peak[0],
                 'peak2': peak[1],
                 'est': sizs[idx],
                 'bibs': bibs[idx]},
                fp)
            fp.close()
    else:
        def register_memory_usage(idx):
            pass

    def run_job(idx):
        try:
            sys.stdout = output_killer
            jobs[idx]()
            register_memory_usage(idx)
            os._exit(os.EX_OK)
        except Exception as e:
            f = open('/tmp/exception-%s' % str(os.getpid()), "w")
            f.write(str(e) + '\n')
            f.close()
            os._exit(os.EX_SOFTWARE)

    max_workers = get_cores_count()
    pid_2_idx = {}
    # free = get_free_memory()
    initial = get_total_memory()
    free = initial

    output_killer = open(os.devnull, 'w')

    ret_status = [None] * len(jobs)
    bibs = sizs  # keep the raw sizes for logging
    sizs = map(estimator, sizs)  # per-job memory estimates
    free_idxs = range(len(jobs))
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len(free_idxs)

    done = 0.
    total = sum(sizs)
    biggest = max(sizs)

    logger.update_status(0., "0 / %d" % len(jobs))

    # Jobs too big to ever fit in the budget are run serially:
    # fork, then immediately wait for the child to finish.
    too_big = [idx for idx in free_idxs if sizs[idx] > free]
    for idx in too_big:
        pid = os.fork()
        if pid == 0:  # child
            run_job(idx)
        else:  # parent
            done += sizs[idx]
            free_idxs.remove(idx)  # remove by value: positions shift after deletions
            cpid, status = os.wait()
            logger.update_status(done / total, "%d / %d"
                                 % (len(jobs) - len(free_idxs), len(jobs)))
            ret_status[idx] = status
            assert cpid == pid

    while free_idxs or pid_2_idx:
        # Fill the worker slots with the biggest jobs that still fit
        # into the remaining memory budget.
        while len(pid_2_idx) < max_workers:
            idx = get_biggest_job_below(free, (sizs[idx] for idx in free_idxs))
            if idx != -1:
                job_idx = free_idxs[idx]
                pid = os.fork()
                if pid == 0:  # child
                    # Nice the child proportionally to its estimated size
                    # (sizs[job_idx]: idx is only a position in free_idxs).
                    os.nice(int(float(sizs[job_idx]) * 20.0 / biggest))
                    run_job(job_idx)
                else:  # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        logger.update_status(done / total, "%d / %d"
                             % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    logger.update_status_final("%d / %d" % (len(jobs), len(jobs)))

    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs

    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat is not None for stat in ret_status)

    return ret_status


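# A minimal usage sketch for schedule(). It is illustrative rather than
# standalone: schedule() relies on module-level helpers (bconfig, logger,
# get_cores_count, get_total_memory, get_biggest_job_below, is_eq) that
# are assumed to be importable. The jobs and sizes below are made up; the
# estimator maps a job's raw size to an estimated memory footprint.
def _schedule_demo():
    jobs = [lambda: None, lambda: None, lambda: None]  # trivial no-op jobs
    bib_counts = [10, 20, 30]                          # made-up raw job sizes
    statuses = schedule(jobs, bib_counts, estimator=lambda n: n * 1024)
    assert all(stat == os.EX_OK for stat in statuses)

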
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()

    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())
        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except ValueError:
            wscf = 0
        w = [float(i) / wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]
        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except ValueError:
            ml = 1
        lengs = [k / ml for k in lengs]
        normalengs = [data[k][5] for k in x]
        ax.plot(x, y, '-o', label='avg')
        ax.plot(x, maxi, '-o', label='max')
        ax.plot(x, mini, '-o', label='min')
        ax.plot(x, w, '-x', label='norm %s' % str(wscf))
        ax.plot(x, lengs, '-o', label='acl %s' % str(int(ml)))
        ax.plot(x, normalengs, '-o', label='ncl')
        plt.ylim(ymax=1., ymin=-0.01)
        plt.xlim(xmax=1., xmin=-0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=6,
                  mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    # Per-key stats: av_counter, avg, min, max, nclus, normalized_avg.
    # The min slot starts at 1. (coefficients are asserted to lie in
    # [0, 1]); starting it at 0. would make the min() update a no-op.
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 1., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 1., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = filehandler.open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, ' Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set(x[0] for x in contents[2])))

            assert 0 < cur_clustnumber < cur_maxlen, \
                "Error, found log with strange clustnumber! %s %s %s %s" % \
                (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster without stuff? %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(0 <= x <= 1 for x in cur_coeffs), \
                    "Error, a coefficient is wrong here! Check me!\n%s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta, ' cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                # number of points
                coeff_stats[cur_coef][0] = avi + 1
                # average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                # min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                # max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                # avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi +
                                            cur_clustnumber) / (avi + 1)
                # normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi +
                                            cur_clustnumber / cur_maxlen) / (avi + 1)

                update_status(status + 0.6 * quanta, ' cumulative per cluster per coeff...')

                # Same running statistics, but keyed per cluster and coefficient.
                cs = cluster_stats[cur_clust][cur_coef]
                avi = cs[0]
                cs[0] = avi + 1
                cs[1] = (cs[1] * avi + cur_avg) / (avi + 1)
                cs[2] = min(cs[2], cur_min)
                cs[3] = max(cs[3], cur_max)
                cs[4] = (cs[4] * avi + cur_clustnumber) / (avi + 1)
                cs[5] = (cs[5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x]))
                                        for x in cluster_stats.iterkeys()),
                  'coeff_stats': dict(coeff_stats)}, f)
        f.close()


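# A small self-contained check of the incremental-average update used for
# coeff_stats and cluster_stats above: with n samples already folded into
# avg, adding a new sample x as (avg * n + x) / (n + 1) reproduces the
# batch mean. The sample values are made up.
def _running_average_demo():
    samples = [0.2, 0.4, 0.9, 0.1]
    avg, n = 0., 0.
    for x in samples:
        avg = (avg * n + x) / (n + 1)
        n += 1
    assert abs(avg - sum(samples) / len(samples)) < 1e-12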