def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    """Aggregate the per-run wedge coefficient reports from /tmp/baistats/.

    Reads every 'cluster_status_report_pid*' file, accumulates running
    statistics per coefficient and per (cluster, coefficient) pair, and
    optionally plots and/or pickles the result.

    @param pickle_output: optional file path; when set, the aggregated
        statistics are serialized there with SER.dump.
    @param generate_graphs: when True, plots are refreshed via _gen_plot
        every 1000 input files and once more at the end.
    """
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    # Per-key stat vector layout:
    # [av_counter, avg, min, max, nclus, normalized_avg]
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        # The synthetic per-coefficient plot is cheap and always refreshed;
        # the per-cluster plots (one file each) only on a full pass.
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs and i % 1000 == 0:
            gen_graphs(True)

        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        # BUGFIX: close the handle even if deserialization raises.
        f = open(fi, 'r')
        try:
            contents = SER.load(f)
        finally:
            f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, ' Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, \
                "Error, found log with strange clustnumber! %s %s %s %s" % \
                (str(cur_clust), str(cur_coef), str(cur_maxlen),
                 str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster witohut stuff? %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), \
                    "Error, a coefficient is wrong here! Check me! %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta, ' comulative per coeff...')
                avi = coeff_stats[cur_coef][0]
                # number of points
                coeff_stats[cur_coef][0] = avi + 1
                # running average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                # min coeff -- BUGFIX: the vector is seeded with 0.0 and the
                # coefficients are asserted into [0, 1], so min(0.0, cur_min)
                # could never record a real minimum.  Seed with the first
                # observed value instead.
                if avi:
                    coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                else:
                    coeff_stats[cur_coef][2] = cur_min
                # max coeff (0.0 seed is a safe identity for max here)
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                # avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                # normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

                update_status(status + 0.6 * quanta, ' comulative per cluster per coeff...')
                # Alias the (mutable) stat vector to avoid repeating the
                # double-keyed lookup six times.
                cstats = cluster_stats[cur_clust][cur_coef]
                avi = cstats[0]
                cstats[0] = avi + 1
                cstats[1] = (cstats[1] * avi + cur_avg) / (avi + 1)
                # Same min-seeding fix as for coeff_stats above.
                if avi:
                    cstats[2] = min(cstats[2], cur_min)
                else:
                    cstats[2] = cur_min
                cstats[3] = max(cstats[3], cur_max)
                cstats[4] = (cstats[4] * avi + cur_clustnumber) / (avi + 1)
                cstats[5] = (cstats[5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        try:
            # Convert the nested defaultdicts to plain dicts so the pickle
            # does not capture the (unpicklable) lambda default factories.
            SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x]))
                                            for x in cluster_stats.iterkeys()),
                      'coeff_stats': dict(coeff_stats)}, f)
        finally:
            f.close()
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    """Aggregate wedge coefficient reports from /tmp/baistats/ into
    per-coefficient and per-(cluster, coefficient) running statistics;
    optionally plot them and/or pickle them to ``pickle_output``.

    NOTE(review): this module appears to contain several definitions of
    this same function; in Python only the last one bound wins.
    """
    override_stdout_config(stdout=True)
    files = [
        '/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
        if x.startswith('cluster_status_report_pid')
    ]
    fnum = float(len(files))
    # Each input file advances the progress bar by 1/fnum; quanta is a
    # tenth of that, used for the sub-steps below.
    quanta = .1 / fnum
    total_stats = 0
    used_coeffs = set()
    used_clusters = set()
    # Stat vector layout per key:
    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(
        lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        # Refresh the cheap aggregate plot always; the per-cluster plots
        # (one image per cluster name) only on a full pass.
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            # Periodically refresh the synthetic plot while loading.
            if i % 1000 == 0:
                gen_graphs(True)
        f = open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        # SER is the module-level serializer (defined elsewhere in this
        # file); contents layout assumed below:
        # [0] coefficient, [1] cluster name, [2] rows of
        # (cluster_id, ?, coefficient), [3] max cluster length.
        contents = SER.load(f)
        f.close()
        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])
        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)
            update_status(status + 0.2 * quanta, ' Computing averages...')
            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))
            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (
                str(cur_clust), str(cur_coef), str(cur_maxlen),
                str(cur_clustnumber))
            if cur_coeffs:
                assert len(
                    cur_coeffs
                ) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s" % (
                    str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(
                    [x >= 0 and x <= 1 for x in cur_coeffs]
                ), "Error, a coefficient is wrong here! Check me! %s %s %s" % (
                    str(cur_clust), str(cur_coef), str(cur_coeffs))
                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen
                update_status(status + 0.4 * quanta,
                              ' comulative per coeff...')
                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi + 1
                #average of coefficients (incremental running mean)
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi +
                                            cur_avg) / (avi + 1)
                #min coeff
                # NOTE(review): the vector is seeded with 0.0 and the
                # coefficients are asserted into [0, 1], so this min can
                # never rise above 0.0 -- looks like a latent bug; confirm.
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2],
                                               cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3],
                                               cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi +
                                            cur_clustnumber) / (avi + 1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi +
                                            cur_clustnumber / cur_maxlen) / (
                                                avi + 1)
                update_status(status + 0.6 * quanta,
                              ' comulative per cluster per coeff...')
                # Same six running stats, but keyed per (cluster, coeff).
                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (
                    cluster_stats[cur_clust][cur_coef][1] * avi +
                    cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(
                    cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(
                    cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (
                    cluster_stats[cur_clust][cur_coef][4] * avi +
                    cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (
                    cluster_stats[cur_clust][cur_coef][5] * avi +
                    cur_clustnumber / cur_maxlen) / (avi + 1)
    update_status_final('Done!')
    if generate_graphs:
        gen_graphs()
    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        # Plain dicts are dumped instead of the defaultdicts so the
        # serializer does not need to handle the lambda default factories.
        SER.dump(
            {
                'cluster_stats':
                dict((x, dict(cluster_stats[x]))
                     for x in cluster_stats.iterkeys()),
                'coeff_stats':
                dict((coeff_stats))
            }, f)
        f.close()
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    """Aggregate wedge coefficient reports from /tmp/baistats/ and plot
    them with matplotlib; optionally pickle the aggregates.

    This variant bundles its own plotting helper (_gen_plot) and reads
    input files through ``filehandler`` (defined elsewhere in the file).
    """
    # Imported lazily so the module does not require matplotlib unless
    # this statistics entry point is actually invoked.
    import matplotlib.pyplot as plt
    plt.ioff()  # disable interactive mode; we only save figures to disk

    def _gen_plot(data, filename):
        # data: mapping key -> [counter, avg, min, max, nclus, norm_avg]
        # (the stat-vector layout built below).
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())
        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            # max() raised -- presumably an empty data set; fall back to 0.
            # NOTE(review): wscf == 0 would make the division below raise,
            # but then w is empty so the comprehension never divides.
            wscf = 0
        w = [float(i) / wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]
        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k / ml for k in lengs]
        normalengs = [data[k][5] for k in x]
        # One curve per statistic; the normalized series carry their scale
        # factor in the legend label.
        ax.plot(x, y, '-o', label='avg')
        ax.plot(x, maxi, '-o', label='max')
        ax.plot(x, mini, '-o', label='min')
        ax.plot(x, w, '-x', label='norm %s' % str(wscf))
        ax.plot(x, lengs, '-o', label='acl %s' % str(int(ml)))
        ax.plot(x, normalengs, '-o', label='ncl')
        plt.ylim(ymax=1., ymin=-0.01)
        plt.xlim(xmax=1., xmin=-0.01)
        # Legend laid out in a single row above the axes.
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=6,
                  mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)
    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    # Fraction of the progress bar devoted to each sub-step of one file.
    quanta = .1 / fnum
    total_stats = 0
    used_coeffs = set()
    used_clusters = set()
    # Stat vector layout per key:
    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        # Aggregate plot always; per-cluster plots only on a full pass.
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            # Periodic refresh of the synthetic plot while loading.
            if i % 1000 == 0:
                gen_graphs(True)
        # filehandler is provided elsewhere in this module -- presumably an
        # open()-compatible wrapper; verify against its definition.
        f = filehandler.open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        # Expected contents layout: [0] coefficient, [1] cluster name,
        # [2] rows of (cluster_id, ?, coefficient), [3] max cluster length.
        contents = SER.load(f)
        f.close()
        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])
        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)
            update_status(status + 0.2 * quanta, ' Computing averages...')
            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))
            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))
            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen
                update_status(status + 0.4 * quanta, ' comulative per coeff...')
                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi + 1
                #average of coefficients (incremental running mean)
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                #min coeff
                # NOTE(review): seeded with 0.0 while coefficients are in
                # [0, 1], so this min can never rise above 0.0 -- confirm.
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)
                update_status(status + 0.6 * quanta, ' comulative per cluster per coeff...')
                # Same six running statistics, keyed per (cluster, coeff).
                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1] * avi + cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)
    update_status_final('Done!')
    if generate_graphs:
        gen_graphs()
    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        # Plain dicts are dumped so the serializer never sees the
        # defaultdicts' lambda default factories.
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x])) for x in cluster_stats.iterkeys()),
                  'coeff_stats': dict((coeff_stats))}, f)
        f.close()
import gc
import numpy as np

#This is supposed to defeat a bit of the python vm performance losses:
# (Python 2 only -- sys.setcheckinterval was removed in Python 3.)
import sys
sys.setcheckinterval(1000000)

# collections.defaultdict exists since Python 2.5; fall back to the
# project's own implementation on older interpreters.
try:
    from collections import defaultdict
except:
    from invenio.containerutils import defaultdict

from itertools import groupby, chain, repeat

# BUGFIX: override_stdout_config was imported twice in the same statement;
# the duplicate name has been removed.
from invenio.bibauthorid_general_utils import update_status, \
    update_status_final, override_stdout_config

# Side effect at import time: route status output to file, not stdout.
override_stdout_config(fileout=True, stdout=False)

from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_marktables
from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_personid
from invenio.bibauthorid_wedge import wedge
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_backinterface import empty_tortoise_results_table
from invenio.bibauthorid_backinterface import remove_clusters_by_name
from invenio.bibauthorid_general_utils import bibauthor_print
# NOTE(review): "prepare_matirx" looks like a typo for "prepare_matrix",
# but the name must match the defining module -- do not rename here.
from invenio.bibauthorid_prob_matrix import prepare_matirx
#Scheduler is [temporarily] deprecated in favour of the much simpler schedule_workers
#from invenio.bibauthorid_scheduler import schedule, matrix_coefs
from invenio.bibauthorid_least_squares import to_function as create_approx_func
from invenio.bibauthorid_general_utils import schedule_workers