def _draw_af_hist_(path, records_filter = allow_all_filter): """Draws histogram of authorship.""" af_count = {} #dict{author: count} for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list( af for af in afs if af!='-' ) for af in afs_ok: af_count[af] = af_count.get(af, 0) + 1 print len(af_count), " authors found." print max(af_count.values()), " = max" print min(af_count.values()), " = min" avg_af_values = avg(af_count.values()) print round(avg_af_values, 2), " = avg" print round(std(af_count.values()), 2), " = std" print sum(1 for af in af_count.values() if af > avg_af_values) , " authors above avg" print sum(1 for af in af_count.values() if af < avg_af_values) , " authors below avg" n, bins, patches = plt.hist(af_count.values(), bins = max(af_count.values()), normed=False, log=True, alpha=0.75) plt.xlabel("Liczba wystapien w rekordach") plt.ylabel("Liczba autorow") plt.show()
def _draw_af_hist_(path, records_filter=allow_all_filter): """Draws histogram of authorship.""" af_count = {} #dict{author: count} for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') for af in afs_ok: af_count[af] = af_count.get(af, 0) + 1 print len(af_count), " authors found." print max(af_count.values()), " = max" print min(af_count.values()), " = min" avg_af_values = avg(af_count.values()) print round(avg_af_values, 2), " = avg" print round(std(af_count.values()), 2), " = std" print sum(1 for af in af_count.values() if af > avg_af_values), " authors above avg" print sum(1 for af in af_count.values() if af < avg_af_values), " authors below avg" n, bins, patches = plt.hist(af_count.values(), bins=max(af_count.values()), normed=False, log=True, alpha=0.75) plt.xlabel("Liczba wystapien w rekordach") plt.ylabel("Liczba autorow") plt.show()
def _report_af_quality_(path, records_filter=allow_all_filter): """Prints report about authors' identities quality.""" afs_len = [] afs_ok_len = [] for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') afs_len.append(len(afs)) afs_ok_len.append(len(afs_ok)) afs_ok_frac = list( float(m) / float(l) for m, l in zip(afs_ok_len, afs_len)) print max(afs_len), "\n", round(avg(afs_len), 2), "\n", round(std(afs_len), 2) print max(afs_ok_len), "\n", round(avg(afs_ok_len), 2), "\n", round(std(afs_ok_len), 2) print round(max(afs_ok_frac), 2), "\n", round(avg(afs_ok_frac), 2), "\n", round(std(afs_ok_frac), 2)
def extract_fieldvalue2ids(zbl_generator, multivalue_field_name = "af", empty_value = "-", container = list):
    """Returns dictionary{field-value: container-of-identified-ids-that-have-this-value}.

    zbl_generator -- iterable of ZBL record dicts.
    multivalue_field_name -- packed multivalue field to index by.
    empty_value -- placeholder marking an unidentified value; such values are ignored.
    container -- constructor applied to every id-list in the result.

    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2']), ('a3', ['3'])]
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3,r4]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2', '4']), ('a3', ['3', '4'])]
    """
    af2ids = {}
    skipped = 0  # records lacking the field
    novals = 0   # records whose field holds only empty_value
    i = -1       # fix: final logging raised NameError when the generator was empty
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            logging.info("[extract_fieldvalue2ids]" + str(i) + " records processed")
        if not multivalue_field_name in zbl:
            skipped = skipped + 1
            continue
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        afs = zbl_io.unpack_multivalue_field(zbl[multivalue_field_name])
        afs_ok = [af for af in afs if af != empty_value]
        if len(afs_ok) == 0:
            novals = novals + 1
        for af in afs_ok:
            # fix: append in place instead of rebuilding the list each time (was O(n^2))
            af2ids.setdefault(af, []).append(zbl_id)
    # fix: i is the last index, so i+1 is the actual number of records processed
    logging.info("[extract_fieldvalue2ids] " + str(i + 1) + " records processed")
    logging.info("[extract_fieldvalue2ids] " + str(skipped) + " records skipped")
    logging.info("[extract_fieldvalue2ids] " + str(novals) + " records with only empty values in field")
    # renamed loop variable to avoid shadowing the builtin id()
    fv2ids = dict((val, container(ids)) for val, ids in af2ids.iteritems())
    logging.info("[extract_fieldvalue2ids] " + str(len(fv2ids)) + " authors found.")
    return fv2ids
def update(self, zbl_generator, \
        msc_predicate = lambda msc: MSC_ORDINARY_LEAF_PATTERN_RE.match(msc)):
    """Feeds every record from zbl_generator into the model's msc->id-list
    maps (all codes, primary-only, secondary-only), then refreshes counts.

    msc_predicate -- codes failing it are ignored by _update_().
    """
    logging.info("[MscModel.update] building msc2lists")
    for record in zbl_generator:
        record_id = record[zbl_io.ZBL_ID_FIELD]
        codes = zbl_io.unpack_multivalue_field(record['mc'])
        # full list, primary code only, and secondary codes respectively
        _update_(self.msc2zblidlist, codes, record_id, msc_predicate)
        _update_(self.mscprim2zblidlist, codes[:1], record_id, msc_predicate)
        _update_(self.mscsec2zblidlist, codes[1:], record_id, msc_predicate)
    self._update_counts_()
def count_msc_occurences(file, records_filter=lambda x: True, field_name="mc"):
    """Counts number of occurrences of MSC codes in ZBL file.

    file -- open ZBL records file (NOTE: the name shadows the builtin `file`,
            kept for backward compatibility with existing callers).
    records_filter -- predicate; records failing it are ignored.
    field_name -- record field holding the packed MSC codes.

    Returns dictionary{code_name: count}.
    """
    counts = {}
    for record in zbl_io.read_zbl_records(file):
        # `in` replaces the deprecated dict.has_key()
        if not records_filter(record) or field_name not in record:
            continue
        for code in zbl_io.unpack_multivalue_field(record[field_name]):
            counts[code] = counts.get(code, 0) + 1
    return counts
def update(self, zbl_generator, \
        msc_predicate = lambda msc: MSC_ORDINARY_LEAF_PATTERN_RE.match(msc)):
    """Populates the model's msc->zbl-id-list dictionaries from a record stream.

    For every record: registers its zbl-id under all its MSC codes
    (msc2zblidlist), under the primary code only (mscprim2zblidlist) and
    under the secondary codes (mscsec2zblidlist), then refreshes counts.

    msc_predicate -- codes failing it are ignored (see _update_).
    """
    logging.info("[MscModel.update] building msc2lists")
    for zbl in zbl_generator:
        msc_codes = zbl_io.unpack_multivalue_field(zbl['mc'])
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        _update_(self.msc2zblidlist, msc_codes, zbl_id, msc_predicate)
        # [:1] keeps only the primary code, [1:] the secondary ones
        _update_(self.mscprim2zblidlist, msc_codes[:1], zbl_id, msc_predicate)
        _update_(self.mscsec2zblidlist, msc_codes[1:], zbl_id, msc_predicate)
    self._update_counts_()
def count_msc_occurences(file, records_filter = lambda x: True, field_name = "mc"):
    """Counts number of occurrences of MSC codes in ZBL file. Returns dictionary{code_name: count}"""
    occurrences = {}
    for rec in zbl_io.read_zbl_records(file):
        # keep only records that pass the filter and carry the field
        if records_filter(rec) and rec.has_key(field_name):
            for c in zbl_io.unpack_multivalue_field(rec[field_name]):
                occurrences[c] = occurrences.get(c, 0) + 1
    return occurrences
def group_zbl_by_msc(zbl_generator, \
        msc_primary_predicate = lambda mscprim: MSC_ORDINARY_LEAF_PATTERN_RE.match(mscprim), \
        zbl_extract = lambda zbl: zbl[zbl_io.ZBL_ID_FIELD]):
    """Returns dictionary{msc-primary-code: list-of-extracted-by-zbl_extract(zbl)-values}.
    Records whose primary MSC code fails msc_primary_predicate are skipped.
    """
    grouped = {}
    for record in zbl_generator:
        primary = zbl_io.unpack_multivalue_field(record['mc'])[0]
        if msc_primary_predicate(primary):
            # setdefault creates the bucket on first sight of the code
            grouped.setdefault(primary, []).append(zbl_extract(record))
    return grouped
def group_zbl_by_msc(zbl_generator, \
        msc_primary_predicate = lambda mscprim: MSC_ORDINARY_LEAF_PATTERN_RE.match(mscprim), \
        zbl_extract = lambda zbl: zbl[zbl_io.ZBL_ID_FIELD]):
    """Returns dictionary{msc-primary-code: list-of-extracted-by-zbl_extract(zbl)-values}.
    When msc_primary_predicate(msc_primary_code)==False record will be skipped.

    zbl_generator -- iterable of ZBL record dicts; each must carry an 'mc' field.
    zbl_extract -- maps a record to the value stored in the group lists
                   (defaults to the record's id).
    """
    msc2zbllist = {}
    for zbl in zbl_generator:
        # the first unpacked code is the primary MSC code
        msc_primary_code = zbl_io.unpack_multivalue_field(zbl['mc'])[0]
        if not msc_primary_predicate(msc_primary_code):
            continue
        zbllist = msc2zbllist.get(msc_primary_code, [])
        zbllist.append(zbl_extract(zbl))
        msc2zbllist[msc_primary_code] = zbllist
    return msc2zbllist
def _draw_mc_hist(path, records_filter = allow_all_filter): """Draws histogram of MSC codes occurrence in records.""" mc_counts = [] for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("mc") or not records_filter(record): continue mc = zbl_io.unpack_multivalue_field(record["mc"]) mc_counts.append(len(mc)) print len(mc_counts), " record found." print max(mc_counts), " = max" print min(mc_counts), " = min" print round(avg(mc_counts), 2), " = avg" print round(std(mc_counts), 2), " = std" n, bins, patches = plt.hist(mc_counts, bins = max(mc_counts), normed=False, alpha=0.75) plt.xlabel("Liczba kodow MSC w rekordzie") plt.ylabel("Liczba rekordow") plt.show()
def _report_af_quality_(path, records_filter = allow_all_filter):
    """Prints report about authors' identities quality.

    For each record with an "af" field passing records_filter, collects the
    number of listed authors and the number of identified authors
    (value != '-'), then prints max/avg/std of both and of the identified
    fraction per record.
    """
    afs_len = []     # total authors per record (including '-')
    afs_ok_len = []  # identified authors per record
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record): continue
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' )
        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))
    # Fraction of identified authors per record.
    # NOTE(review): raises ZeroDivisionError if an "af" field unpacks to an
    # empty list -- confirm unpack_multivalue_field never returns [].
    afs_ok_frac = list( float(m)/float(l) for m,l in zip(afs_ok_len, afs_len) )
    print max(afs_len), "\n", round(avg(afs_len),2), "\n", round(std(afs_len),2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len),2), "\n", round(std(afs_ok_len),2)
    print round(max(afs_ok_frac),2), "\n", round(avg(afs_ok_frac),2), "\n", round(std(afs_ok_frac),2)
def extract_fieldvalue2ids(zbl_generator, multivalue_field_name="af", empty_value="-", container=list):
    """Returns dictionary{field-value: container-of-identified-ids-that-have-this-value}.

    zbl_generator -- iterable of ZBL record dicts.
    multivalue_field_name -- packed multivalue field to index by.
    empty_value -- placeholder marking an unidentified value; such values are ignored.
    container -- constructor applied to every id-list in the result.

    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2']), ('a3', ['3'])]
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3,r4]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2', '4']), ('a3', ['3', '4'])]
    """
    af2ids = {}
    skipped = 0  # records lacking the field
    novals = 0   # records whose field holds only empty_value
    i = -1       # fix: final logging raised NameError when the generator was empty
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            logging.info("[extract_fieldvalue2ids]" + str(i) + " records processed")
        if not multivalue_field_name in zbl:
            skipped = skipped + 1
            continue
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        afs = zbl_io.unpack_multivalue_field(zbl[multivalue_field_name])
        afs_ok = [af for af in afs if af != empty_value]
        if len(afs_ok) == 0:
            novals = novals + 1
        for af in afs_ok:
            # fix: append in place instead of rebuilding the list each time (was O(n^2))
            af2ids.setdefault(af, []).append(zbl_id)
    # fix: i is the last index, so i+1 is the actual number of records processed
    logging.info("[extract_fieldvalue2ids] " + str(i + 1) + " records processed")
    logging.info("[extract_fieldvalue2ids] " + str(skipped) + " records skipped")
    logging.info("[extract_fieldvalue2ids] " + str(novals) + " records with only empty values in field")
    # renamed loop variable to avoid shadowing the builtin id()
    fv2ids = dict((val, container(ids)) for val, ids in af2ids.iteritems())
    logging.info("[extract_fieldvalue2ids] " + str(len(fv2ids)) + " authors found.")
    return fv2ids
def _draw_mc_hist(path, records_filter=allow_all_filter): """Draws histogram of MSC codes occurrence in records.""" mc_counts = [] for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("mc") or not records_filter(record): continue mc = zbl_io.unpack_multivalue_field(record["mc"]) mc_counts.append(len(mc)) print len(mc_counts), " record found." print max(mc_counts), " = max" print min(mc_counts), " = min" print round(avg(mc_counts), 2), " = avg" print round(std(mc_counts), 2), " = std" n, bins, patches = plt.hist(mc_counts, bins=max(mc_counts), normed=False, alpha=0.75) plt.xlabel("Liczba kodow MSC w rekordzie") plt.ylabel("Liczba rekordow") plt.show()
def _update_docmodel_(zbl):
    """Stores the record's primary and secondary MSC codes into the
    outer-scope docid2primcode / docid2seccodes maps (defined elsewhere
    in this file). Records without an 'mc' field are ignored."""
    if "mc" not in zbl:
        return
    codes = zbl_io.unpack_multivalue_field(zbl['mc'])
    key = zbl[zbl_io.ZBL_ID_FIELD]
    docid2primcode[key] = codes[0]   # first code is the primary one
    docid2seccodes[key] = codes[1:]  # remainder are secondary
# NOTE(review): this `except` closes a `try` that begins before this chunk;
# the bare except maps *any* startup failure to the usage message below.
except:
    print "First argument expected: zbl-file-path."
    sys.exit(-1)

#l - low level, m - medium level, h - highest level of MSC tree

# Echo the run configuration so the console output is self-describing.
print "MIN_COUNT_MSC=",MIN_COUNT_MSC
print "NUM_TRIES=",NUM_TRIES
print "VALID_LEAF_PATTERN_RE=",VALID_LEAF_PATTERN_RE
print "must_have_fields=",must_have_fields
print "zbl_path=",zbl_path

print "Building list of msc-primary-codes that should be considered..."
start = time.clock()
msc2count = {}  # dict{primary-msc-code: number of records carrying it}
for i,zbl in enumerate(_get_zbl_generator_(zbl_path, must_have_fields)):
    if i%10000 == 0: print "",i,"records processed in",(time.clock()-start),"s ->",sum(msc2count.values()),"kept"
    msc_codes = [ zbl_io.unpack_multivalue_field(zbl['mc'])[0] ] #only primary
    for msc in msc_codes:
        #print msc,"->",(not VALID_LEAF_PATTERN_RE.match(msc) is None)
        # count only codes matching the valid-leaf pattern
        if not VALID_LEAF_PATTERN_RE.match(msc) is None:
            msc2count[msc] = msc2count.get(msc, 0)+1

# Drop codes occurring fewer than MIN_COUNT_MSC times.
print "Filtering for with MIN_COUNT_MSC:",MIN_COUNT_MSC," out of", sum(msc2count.values())
msc2count = dict((msc,count) for msc,count in msc2count.iteritems() if count>=MIN_COUNT_MSC)

# Build a bidirectional mapping between codes and dense integer indices.
print "Building mapping msc2ix"
msc2ix = dict((msc,ix) for ix,msc in enumerate(msc2count))
ix2msc = dict((ix,msc) for msc,ix in msc2ix.iteritems())
leaves = list( msc2ix )  # the kept codes, in msc2ix iteration order
num_leaves = len(leaves)

print "Building MSC tree out of", num_leaves, "leaves"
msc_tree = trees.build_msctree(msc2ix.keys(), msc2ix)
#print str(trees.map_tree_leaves(msc_tree, ix2msc))[:400]