def _report_af_quality_(path, records_filter=allow_all_filter): """Prints report about authors' identities quality.""" afs_len = [] afs_ok_len = [] for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') afs_len.append(len(afs)) afs_ok_len.append(len(afs_ok)) afs_ok_frac = list( float(m) / float(l) for m, l in zip(afs_ok_len, afs_len)) print max(afs_len), "\n", round(avg(afs_len), 2), "\n", round(std(afs_len), 2) print max(afs_ok_len), "\n", round(avg(afs_ok_len), 2), "\n", round(std(afs_ok_len), 2) print round(max(afs_ok_frac), 2), "\n", round(avg(afs_ok_frac), 2), "\n", round(std(afs_ok_frac), 2)
def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.

    Reads the file twice: first to collect the set of ids citations may
    point at, then to gather per-record citation statistics, which are
    printed and shown as a histogram of matched-citation counts.

    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) - should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record
    """
    # first pass: load the set of ids that citations may point at
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i,record in enumerate( zbl_io.read_zbl_records(open(path)) ):
        if i%100000 == 0: print i," records considered" #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])
    print "Done.", len(dst_records_ids), " records loaded."
    # second pass: statistics
    cis_len = [] # number of citations per admitted record
    cis_matched = [] # number of citations that hit the destination set
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("ci") or not records_filter(record): continue
        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        # ids of citations that carry the unique-id field:
        identified_ci_ids = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name))
        # records matched to citations and present in the destination set:
        filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids)
        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))
    # per-record fraction of citations that matched
    cis_matched_div_len = list( float(m)/float(l) for m,l in zip(cis_matched, cis_len) )
    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len),0), "\t", round(min(cis_len),0), "\t", round(avg(cis_len),2), "\t", round(max(cis_len),0), "\t", round(std(cis_len),2)
    print "-Matching citations:", "\t", round(sum(cis_matched),0), "\t", round(min(cis_matched),0), "\t", round(avg(cis_matched),2), "\t", round(max(cis_matched),0), "\t", round(std(cis_matched),2)
    print "-Fraction of matching citations: - ", "\t", round(min(cis_matched_div_len),3), "\t", round(avg(cis_matched_div_len),3), "\t", round(max(cis_matched_div_len),3), "\t", round(std(cis_matched_div_len),3)
    print "-Total Number of citations/Matching citations:", "\t", round(float(sum(cis_matched))/sum(cis_len),3)
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
          round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
          round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3)
    # histogram: value -> how many records matched exactly that many citations
    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched)+1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c==i)
    print "Histogram:", cis_matched_hist
    n, bins, patches = plt.hist(sorted(cis_matched), bins = max(cis_matched), normed=False, alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()
def _report_counts_(counts, label = None): """Prints report on count statistics (counts = dictionary{name:count}/list of numbers ).""" try: counts = counts.values() except: pass if not label is None: print label print " Max:", max(counts) print " Min:", min(counts) try: print " Avg:", round(avg(counts),2) except: print " Avg: -" try: print " Std:", round(std(counts),2) except: print " Std: -" print " Total num of categories:", len(counts) #ile mamy kategorii ktora wystepuje tyle a tyle razy print " <5:", sum(1 for c in counts if c<5) print " <10:", sum(1 for c in counts if c<10) print " <25:", sum(1 for c in counts if c<25) print " <50:", sum(1 for c in counts if c<50) print " <100:", sum(1 for c in counts if c<100) print " <1000:", sum(1 for c in counts if c<1000) print " <10000:", sum(1 for c in counts if c<10000)
def _draw_af_hist_(path, records_filter = allow_all_filter):
    """Draws histogram of authorship.

    Counts, for every identified author (non '-' entry in the "af" field),
    the number of admitted records he/she occurs in, prints summary
    statistics and shows a log-scale histogram of those counts.

    path - path to a ZBL file
    records_filter(record) - should return True if record is admitted
    """
    af_count = {} # dict{author: number of records he/she occurs in}
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record): continue
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' ) # skip unidentified authors
        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1
    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values() if af > avg_af_values) , " authors above avg"
    print sum(1 for af in af_count.values() if af < avg_af_values) , " authors below avg"
    # one histogram bin per possible occurrence count, log y-axis
    n, bins, patches = plt.hist(af_count.values(), bins = max(af_count.values()), normed=False, log=True, alpha=0.75)
    plt.xlabel("Liczba wystapien w rekordach")
    plt.ylabel("Liczba autorow")
    plt.show()
def _report_counts_(counts, label=None): """Prints report on count statistics (counts = dictionary{name:count}/list of numbers ).""" try: counts = counts.values() except: pass if not label is None: print label print " Max:", max(counts) print " Min:", min(counts) try: print " Avg:", round(avg(counts), 2) except: print " Avg: -" try: print " Std:", round(std(counts), 2) except: print " Std: -" print " Total num of categories:", len(counts) #ile mamy kategorii ktora wystepuje tyle a tyle razy print " <5:", sum(1 for c in counts if c < 5) print " <10:", sum(1 for c in counts if c < 10) print " <25:", sum(1 for c in counts if c < 25) print " <50:", sum(1 for c in counts if c < 50) print " <100:", sum(1 for c in counts if c < 100) print " <1000:", sum(1 for c in counts if c < 1000) print " <10000:", sum(1 for c in counts if c < 10000)
def _draw_af_hist_(path, records_filter=allow_all_filter): """Draws histogram of authorship.""" af_count = {} #dict{author: count} for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') for af in afs_ok: af_count[af] = af_count.get(af, 0) + 1 print len(af_count), " authors found." print max(af_count.values()), " = max" print min(af_count.values()), " = min" avg_af_values = avg(af_count.values()) print round(avg_af_values, 2), " = avg" print round(std(af_count.values()), 2), " = std" print sum(1 for af in af_count.values() if af > avg_af_values), " authors above avg" print sum(1 for af in af_count.values() if af < avg_af_values), " authors below avg" n, bins, patches = plt.hist(af_count.values(), bins=max(af_count.values()), normed=False, log=True, alpha=0.75) plt.xlabel("Liczba wystapien w rekordach") plt.ylabel("Liczba autorow") plt.show()
def extract_fv_graph(zbl_generator, multival_field_name = "af", empty_value = "-", container = set):
    """Returns dictionary{id:container-of-ids} that describes connection graph.

    Between r1 and r2 is (two-way)link if both r1 contains at least single common value in field of field_name.

    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> rx = {'an': 'x'}
    >>> ry = {'an': 'y', 'af': '-'}
    >>> sorted(list(extract_fv_graph([r1,r2,r3,r4]).iteritems())) == [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    >>> sorted(list(extract_fv_graph([r1,r2,ry,r3,r4,rx]).iteritems())) == [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    """
    # field-value -> list of record ids that carry that value
    fv2ids = extract_fieldvalue2ids(zbl_generator, multival_field_name, empty_value, list)
    fv2count = dict( (fv,len(ids)) for fv,ids in fv2ids.iteritems()) #DBG
    logging.info("[extract_fv_graph] max="+ str( max( fv2count.values() ) ) ) #DBG
    logging.info("[extract_fv_graph] avg="+ str( stats.avg( fv2count.values() ) ) ) #DBG
    logging.info("[extract_fv_graph] std="+ str( stats.std( fv2count.values() ) ) ) #DBG
    id2ids = {}
    for rr, (fv, ids) in enumerate(fv2ids.iteritems()):
        if rr % 10000 == 0:
            logging.info("[extract_fv_graph]" + str(rr) + " records processed")
        for i in xrange(len(ids)):  # for every record having this value
            # all its neighbours (fix: slicing instead of scanning every index)
            peer_ids = ids[:i] + ids[i + 1:]
            if peer_ids:
                # fix: setdefault instead of get/extend/store round-trip
                id2ids.setdefault(ids[i], []).extend(peer_ids)
    # fix: renamed loop variable so the builtin `id` is not shadowed
    return dict((rec_id, container(peers)) for rec_id, peers in id2ids.iteritems())
def _report_af_quality_(path, records_filter = allow_all_filter): """Prints report about authors' identities quality.""" afs_len = [] afs_ok_len = [] for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list( af for af in afs if af!='-' ) afs_len.append(len(afs)) afs_ok_len.append(len(afs_ok)) afs_ok_frac = list( float(m)/float(l) for m,l in zip(afs_ok_len, afs_len) ) print max(afs_len), "\n", round(avg(afs_len),2), "\n", round(std(afs_len),2) print max(afs_ok_len), "\n", round(avg(afs_ok_len),2), "\n", round(std(afs_ok_len),2) print round(max(afs_ok_frac),2), "\n", round(avg(afs_ok_frac),2), "\n", round(std(afs_ok_frac),2)
def extract_fv_graph(zbl_generator, multival_field_name="af", empty_value="-", container=set):
    """Returns dictionary{id:container-of-ids} that describes connection graph.

    Between r1 and r2 is (two-way)link if both r1 contains at least single common value in field of field_name.

    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> rx = {'an': 'x'}
    >>> ry = {'an': 'y', 'af': '-'}
    >>> sorted(list(extract_fv_graph([r1,r2,r3,r4]).iteritems())) == [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    >>> sorted(list(extract_fv_graph([r1,r2,ry,r3,r4,rx]).iteritems())) == [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    """
    # field-value -> list of record ids that carry that value
    fv2ids = extract_fieldvalue2ids(zbl_generator, multival_field_name, empty_value, list)
    fv2count = dict((fv, len(ids)) for fv, ids in fv2ids.iteritems()) #DBG
    logging.info("[extract_fv_graph] max=" + str(max(fv2count.values()))) #DBG
    logging.info("[extract_fv_graph] avg=" + str(stats.avg(fv2count.values()))) #DBG
    logging.info("[extract_fv_graph] std=" + str(stats.std(fv2count.values()))) #DBG
    id2ids = {}
    for rr, (fv, ids) in enumerate(fv2ids.iteritems()):
        if rr % 10000 == 0:
            logging.info("[extract_fv_graph]" + str(rr) + " records processed")
        #logging.info("[extract_fv_graph] considering fv="+str(fv)+" ids="+str(ids))
        for i in xrange(len(ids)): # for every record carrying this value
            curr_id = ids[i] # pick one record
            peer_ids = [ids[j] for j in xrange(len(ids)) if i != j] # all its neighbours
            if len(peer_ids) > 0:
                all_peer_ids = id2ids.get(curr_id, [])
                all_peer_ids.extend(peer_ids)
                id2ids[curr_id] = all_peer_ids
    # wrap each accumulated neighbour list in the requested container type
    return dict((id, container(ids)) for id, ids in id2ids.iteritems())
def _draw_mc_hist(path, records_filter = allow_all_filter): """Draws histogram of MSC codes occurrence in records.""" mc_counts = [] for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("mc") or not records_filter(record): continue mc = zbl_io.unpack_multivalue_field(record["mc"]) mc_counts.append(len(mc)) print len(mc_counts), " record found." print max(mc_counts), " = max" print min(mc_counts), " = min" print round(avg(mc_counts), 2), " = avg" print round(std(mc_counts), 2), " = std" n, bins, patches = plt.hist(mc_counts, bins = max(mc_counts), normed=False, alpha=0.75) plt.xlabel("Liczba kodow MSC w rekordzie") plt.ylabel("Liczba rekordow") plt.show()
def _draw_mc_hist(path, records_filter=allow_all_filter):
    """Draws histogram of MSC codes occurrence in records.

    Counts the number of MSC codes ("mc" field) per admitted record,
    prints summary statistics and shows a histogram of the counts.

    path - path to a ZBL file
    records_filter(record) - should return True if record is admitted
    """
    mc_counts = [] # number of MSC codes per admitted record
    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0:
            print i, " records considered" #progress bar
        if not record.has_key("mc") or not records_filter(record):
            continue
        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))
    print len(mc_counts), " record found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    # one histogram bin per possible MSC-code count
    n, bins, patches = plt.hist(mc_counts, bins=max(mc_counts), normed=False, alpha=0.75)
    plt.xlabel("Liczba kodow MSC w rekordzie")
    plt.ylabel("Liczba rekordow")
    plt.show()
def _report_ci_(path, records_filter = allow_all_filter, \ ci_dst_records_filter = allow_all_filter, \ uq_id_field_name = zbl_io.ZBL_ID_FIELD): """Prints report about citations in ZBL file. records_filter(record) - should return True if record is admitted ci_dst_records_filter(record) - should return True if record that citation is pointing at is admitted uq_id_field_name - name of a field that uniquely identifies record """ #wczytywanie zbioru na ktory moga wskazywac cytowania: print "Loading ids of records that may be citation destination." dst_records_ids = set() for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if record.has_key(uq_id_field_name) and ci_dst_records_filter(record): dst_records_ids.add(record[uq_id_field_name]) print "Done.", len(dst_records_ids), " records loaded." #statystyki: cis_len = [] #liczba cytowan cis_matched = [] #liczba cytowan ktore trafiaja w zadany zbior for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("ci") or not records_filter(record): continue cis = zbl_io.unpack_list_of_dictionaries(record["ci"]) #identyfikatory cytowan: identified_ci_ids = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name)) #rekordy dopsowane do cytowan i w zadanym zbiorze: filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids) cis_len.append(len(cis)) cis_matched.append(len(filtered_matched_records)) cis_matched_div_len = list( float(m) / float(l) for m, l in zip(cis_matched, cis_len)) print "Citation statistics (only on records with citations) [total, min avg max std]: " print "-Number of citations :", "\t", round(sum(cis_len), 0), "\t", round( min(cis_len), 0), "\t", round(avg(cis_len), 2), "\t", round(max(cis_len), 0), "\t", round(std(cis_len), 2) print "-Matching citations:", "\t", round( sum(cis_matched), 0), "\t", 
round(min(cis_matched), 0), "\t", round( avg(cis_matched), 2), "\t", round(max(cis_matched), 0), "\t", round(std(cis_matched), 2) print "-Fraction of matching citations: - ", "\t", round( min(cis_matched_div_len), 3), "\t", round(avg(cis_matched_div_len), 3), "\t", round( max(cis_matched_div_len), 3), "\t", round(std(cis_matched_div_len), 3) print "-Total Number of citations/Matching citations:", "\t", round( float(sum(cis_matched)) / sum(cis_len), 3) print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \ round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \ round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) cis_matched_hist = {} for i in xrange(0, max(cis_matched) + 1): cis_matched_hist[i] = sum(1 for c in cis_matched if c == i) print "Histogram:", cis_matched_hist n, bins, patches = plt.hist(sorted(cis_matched), bins=max(cis_matched), normed=False, alpha=0.75) plt.xlabel("Liczba dopasowanych cytowan") plt.ylabel("Liczba rekordow") plt.show()