def _report_ci_(path, records_filter=allow_all_filter,
                ci_dst_records_filter=allow_all_filter,
                uq_id_field_name=zbl_io.ZBL_ID_FIELD):
    """Prints a report about citations in a ZBL file and shows a histogram.

    The file is read twice: the first pass collects the ids of records that
    may be a citation destination, the second pass counts, for every admitted
    record that has a non-empty "ci" field, how many citations it has and how
    many of them point at a collected id.

    path - path of the ZBL file to analyse
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) - should return True if record that
        citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record

    Returns None. The statistics are printed to standard output and the
    histogram of matched-citation counts is displayed with plt.show().
    If no admitted record has citations, a short notice is printed and
    nothing is plotted.
    """

    def stats_row(label, values):
        # Same layout as the statement form `print label, "\t", v1, "\t", v2`:
        # a space precedes each tab, nothing follows it.
        return label + " \t" + " \t".join(str(v) for v in values)

    # Pass 1: load the set of ids that citations are allowed to point at.
    print("Loading ids of records that may be citation destination.")
    dst_records_ids = set()
    with open(path) as zbl_file:
        for i, record in enumerate(zbl_io.read_zbl_records(zbl_file)):
            if i % 100000 == 0:
                print("%s  records considered" % i)  # progress indicator
            if uq_id_field_name in record and ci_dst_records_filter(record):
                dst_records_ids.add(record[uq_id_field_name])
    print("Done. %s  records loaded." % len(dst_records_ids))

    # Pass 2: per-record citation statistics.
    cis_len = []      # number of citations of each record
    cis_matched = []  # number of citations that hit the destination set
    with open(path) as zbl_file:
        for i, record in enumerate(zbl_io.read_zbl_records(zbl_file)):
            if i % 100000 == 0:
                print("%s  records considered" % i)  # progress indicator
            if "ci" not in record or not records_filter(record):
                continue
            cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
            if not cis:
                # An empty citation list would make the per-record fraction
                # below divide by zero; such a record has no citations.
                continue
            # Citations that carry an id which is in the destination set.
            matched = sum(1 for ci in cis
                          if uq_id_field_name in ci
                          and ci[uq_id_field_name] in dst_records_ids)
            cis_len.append(len(cis))
            cis_matched.append(matched)

    if not cis_len:
        # min()/max() below raise ValueError on empty sequences.
        print("No records with citations found - nothing to report.")
        return

    cis_matched_div_len = [float(m) / float(l)
                           for m, l in zip(cis_matched, cis_len)]

    print("Citation statistics (only on records with citations) "
          "[total, min avg max std]: ")
    print(stats_row("-Number of citations :", [
        round(sum(cis_len), 0), round(min(cis_len), 0),
        round(avg(cis_len), 2), round(max(cis_len), 0),
        round(std(cis_len), 2)]))
    # All five values are printed in one row; previously the statement ended
    # after the total, so min/avg/max/std were evaluated but never printed.
    print(stats_row("-Matching citations:", [
        round(sum(cis_matched), 0), round(min(cis_matched), 0),
        round(avg(cis_matched), 2), round(max(cis_matched), 0),
        round(std(cis_matched), 2)]))
    # The "-" inside the label stands in for the (meaningless) total column.
    print(stats_row("-Fraction of matching citations: - ", [
        round(min(cis_matched_div_len), 3), round(avg(cis_matched_div_len), 3),
        round(max(cis_matched_div_len), 3),
        round(std(cis_matched_div_len), 3)]))
    # NOTE: the printed value is matched / total, despite the label's order.
    print(stats_row("-Total Number of citations/Matching citations:", [
        round(float(sum(cis_matched)) / sum(cis_len), 3)]))

    # Compact single-line summary of the figures above.
    summary = [
        round(sum(cis_len), 0), max(cis_len),
        round(avg(cis_len), 2), round(std(cis_len), 2),
        round(sum(cis_matched), 0), max(cis_matched),
        round(avg(cis_matched), 2), round(std(cis_matched), 2),
        round(max(cis_matched_div_len), 3), round(avg(cis_matched_div_len), 3),
        round(std(cis_matched_div_len), 3)]
    print("-> " + " ".join(str(v) for v in summary))

    # Histogram: every count from 0 to the maximum gets an entry (possibly 0),
    # filled in a single pass over the data.
    max_matched = max(cis_matched)
    cis_matched_hist = dict((count, 0) for count in range(max_matched + 1))
    for count in cis_matched:
        cis_matched_hist[count] += 1
    print("Histogram: %s" % (cis_matched_hist,))

    # bins must be at least 1, so guard the case where nothing matched.
    # `normed=False` was only restating the default and is not accepted by
    # newer matplotlib releases, so it is omitted.
    plt.hist(cis_matched, bins=max(max_matched, 1), alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()
def yield_citations(zbl_generator):
    """Yields pairs (zbl_id, list-of-ids-of-known-citations).

    Records without a "ci" field are skipped, and so are records none of
    whose citations carries the id field (zbl_io.ZBL_ID_FIELD). Citations
    without the id field are left out of the yielded list.

    zbl_generator - iterable of ZBL records (mappings)

    Raises KeyError if a record has a "ci" field but no id field.

    >>> ci1 = zbl_io.pack_list_of_dictionaries([{'an':2, 'ti':'TITLE'}, {'an':3}])
    >>> ci2 = zbl_io.pack_list_of_dictionaries([{'an':3}, {'py':'1990', 'an':1}])
    >>> ci3 = zbl_io.pack_list_of_dictionaries([{'an':1}])
    >>> r1 = {'an':'1', 'ci': ci1}
    >>> rx = {'an':'x'}
    >>> r2 = {'an':'2', 'ci': ci2}
    >>> r3 = {'an':'3', 'ci': ci3}
    >>> list( yield_citations([r1, rx, r2, r3]) )
    [('1', ['2', '3']), ('2', ['3', '1']), ('3', ['1'])]
    """
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            # Let logging do the formatting so it is skipped when INFO is off.
            logging.info("[yield_citations]%s records processed", i)
        if "ci" not in zbl:
            continue
        id_field = zbl_io.ZBL_ID_FIELD
        zbl_id = zbl[id_field]
        cis = zbl_io.unpack_list_of_dictionaries(zbl["ci"])
        # Keep only citations that were identified, i.e. carry an id.
        identified_ci_ids = [ci[id_field] for ci in cis if id_field in ci]
        if identified_ci_ids:
            yield (zbl_id, identified_ci_ids)
def yield_citations(zbl_generator):
    """Yields pairs (zbl_id, list-of-ids-of-known-citations).

    Records without a "ci" field are skipped, and so are records none of
    whose citations carries the id field (zbl_io.ZBL_ID_FIELD). Citations
    without the id field are left out of the yielded list.

    zbl_generator - iterable of ZBL records (mappings)

    Raises KeyError if a record has a "ci" field but no id field.

    >>> ci1 = zbl_io.pack_list_of_dictionaries([{'an':2, 'ti':'TITLE'}, {'an':3}])
    >>> ci2 = zbl_io.pack_list_of_dictionaries([{'an':3}, {'py':'1990', 'an':1}])
    >>> ci3 = zbl_io.pack_list_of_dictionaries([{'an':1}])
    >>> r1 = {'an':'1', 'ci': ci1}
    >>> rx = {'an':'x'}
    >>> r2 = {'an':'2', 'ci': ci2}
    >>> r3 = {'an':'3', 'ci': ci3}
    >>> list( yield_citations([r1, rx, r2, r3]) )
    [('1', ['2', '3']), ('2', ['3', '1']), ('3', ['1'])]
    """
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            # Let logging do the formatting so it is skipped when INFO is off.
            logging.info("[yield_citations]%s records processed", i)
        if "ci" not in zbl:
            continue
        id_field = zbl_io.ZBL_ID_FIELD
        zbl_id = zbl[id_field]
        cis = zbl_io.unpack_list_of_dictionaries(zbl["ci"])
        # Keep only citations that were identified, i.e. carry an id.
        identified_ci_ids = [ci[id_field] for ci in cis if id_field in ci]
        if identified_ci_ids:
            yield (zbl_id, identified_ci_ids)
def _report_ci_(path, records_filter=allow_all_filter,
                ci_dst_records_filter=allow_all_filter,
                uq_id_field_name=zbl_io.ZBL_ID_FIELD):
    """Prints a report about citations in a ZBL file and shows a histogram.

    The file is read twice: the first pass collects the ids of records that
    may be a citation destination, the second pass counts, for every admitted
    record that has a non-empty "ci" field, how many citations it has and how
    many of them point at a collected id.

    path - path of the ZBL file to analyse
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) - should return True if record that
        citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record

    Returns None. The statistics are printed to standard output and the
    histogram of matched-citation counts is displayed with plt.show().
    If no admitted record has citations, a short notice is printed and
    nothing is plotted.
    """

    def stats_row(label, values):
        # Same layout as the statement form `print label, "\t", v1, "\t", v2`:
        # a space precedes each tab, nothing follows it.
        return label + " \t" + " \t".join(str(v) for v in values)

    # Pass 1: load the set of ids that citations are allowed to point at.
    print("Loading ids of records that may be citation destination.")
    dst_records_ids = set()
    with open(path) as zbl_file:
        for i, record in enumerate(zbl_io.read_zbl_records(zbl_file)):
            if i % 100000 == 0:
                print("%s  records considered" % i)  # progress indicator
            if uq_id_field_name in record and ci_dst_records_filter(record):
                dst_records_ids.add(record[uq_id_field_name])
    print("Done. %s  records loaded." % len(dst_records_ids))

    # Pass 2: per-record citation statistics.
    cis_len = []      # number of citations of each record
    cis_matched = []  # number of citations that hit the destination set
    with open(path) as zbl_file:
        for i, record in enumerate(zbl_io.read_zbl_records(zbl_file)):
            if i % 100000 == 0:
                print("%s  records considered" % i)  # progress indicator
            if "ci" not in record or not records_filter(record):
                continue
            cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
            if not cis:
                # An empty citation list would make the per-record fraction
                # below divide by zero; such a record has no citations.
                continue
            # Citations that carry an id which is in the destination set.
            matched = sum(1 for ci in cis
                          if uq_id_field_name in ci
                          and ci[uq_id_field_name] in dst_records_ids)
            cis_len.append(len(cis))
            cis_matched.append(matched)

    if not cis_len:
        # min()/max() below raise ValueError on empty sequences.
        print("No records with citations found - nothing to report.")
        return

    cis_matched_div_len = [float(m) / float(l)
                           for m, l in zip(cis_matched, cis_len)]

    print("Citation statistics (only on records with citations) "
          "[total, min avg max std]: ")
    print(stats_row("-Number of citations :", [
        round(sum(cis_len), 0), round(min(cis_len), 0),
        round(avg(cis_len), 2), round(max(cis_len), 0),
        round(std(cis_len), 2)]))
    # All five values are printed in one row; previously the statement ended
    # after the total, so min/avg/max/std were evaluated but never printed.
    print(stats_row("-Matching citations:", [
        round(sum(cis_matched), 0), round(min(cis_matched), 0),
        round(avg(cis_matched), 2), round(max(cis_matched), 0),
        round(std(cis_matched), 2)]))
    # The "-" inside the label stands in for the (meaningless) total column.
    print(stats_row("-Fraction of matching citations: - ", [
        round(min(cis_matched_div_len), 3), round(avg(cis_matched_div_len), 3),
        round(max(cis_matched_div_len), 3),
        round(std(cis_matched_div_len), 3)]))
    # NOTE: the printed value is matched / total, despite the label's order.
    print(stats_row("-Total Number of citations/Matching citations:", [
        round(float(sum(cis_matched)) / sum(cis_len), 3)]))

    # Compact single-line summary of the figures above.
    summary = [
        round(sum(cis_len), 0), max(cis_len),
        round(avg(cis_len), 2), round(std(cis_len), 2),
        round(sum(cis_matched), 0), max(cis_matched),
        round(avg(cis_matched), 2), round(std(cis_matched), 2),
        round(max(cis_matched_div_len), 3), round(avg(cis_matched_div_len), 3),
        round(std(cis_matched_div_len), 3)]
    print("-> " + " ".join(str(v) for v in summary))

    # Histogram: every count from 0 to the maximum gets an entry (possibly 0),
    # filled in a single pass over the data.
    max_matched = max(cis_matched)
    cis_matched_hist = dict((count, 0) for count in range(max_matched + 1))
    for count in cis_matched:
        cis_matched_hist[count] += 1
    print("Histogram: %s" % (cis_matched_hist,))

    # bins must be at least 1, so guard the case where nothing matched.
    # `normed=False` was only restating the default and is not accepted by
    # newer matplotlib releases, so it is omitted.
    plt.hist(cis_matched, bins=max(max_matched, 1), alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()