def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 
    """
    #wczytywanie zbioru na ktory moga wskazywac cytowania:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i,record in enumerate( zbl_io.read_zbl_records(open(path)) ):
        if i%100000 == 0: print i," records considered" #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])            
    print "Done.", len(dst_records_ids), " records loaded."
    
    #statystyki:
    cis_len = [] #liczba cytowan
    cis_matched = [] #liczba cytowan ktore trafiaja w zadany zbior 
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue
                
        cis                 = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #identyfikatory cytowan:
        identified_ci_ids   = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name))
        #rekordy dopsowane do cytowan i w zadanym zbiorze:
        filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids)
                                         
        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))
      
    cis_matched_div_len = list( float(m)/float(l) for m,l in zip(cis_matched, cis_len) )
        
    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len),0), "\t", round(min(cis_len),0), "\t", round(avg(cis_len),2), "\t", round(max(cis_len),0), "\t", round(std(cis_len),2)
    print "-Matching citations:",  "\t", round(sum(cis_matched),0), "\t", round(min(cis_matched),0), "\t", round(avg(cis_matched),2), "\t", round(max(cis_matched),0), "\t", round(std(cis_matched),2)
    print "-Fraction of matching citations: - ",  "\t", round(min(cis_matched_div_len),3), "\t", round(avg(cis_matched_div_len),3), "\t", round(max(cis_matched_div_len),3), "\t", round(std(cis_matched_div_len),3)
    print "-Total Number of citations/Matching citations:", "\t", round(float(sum(cis_matched))/sum(cis_len),3)    
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) 
    
    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched)+1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c==i)
    print "Histogram:", cis_matched_hist
    
    n, bins, patches = plt.hist(sorted(cis_matched), bins = max(cis_matched), normed=False, alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")    
    plt.show()     
def yield_citations(zbl_generator):
    """Yields pairs(zbl_id:list-of-ids-of-known-citations).

    Records without a "ci" field, and records whose citations carry no
    identifier at all, are skipped. Progress is logged every 10000 records.

    zbl_generator - iterable of ZBL records (dictionaries)

    >>> ci1 = zbl_io.pack_list_of_dictionaries([{'an':2, 'ti':'TITLE'}, {'an':3}])
    >>> ci2 = zbl_io.pack_list_of_dictionaries([{'an':3}, {'py':'1990', 'an':1}])
    >>> ci3 = zbl_io.pack_list_of_dictionaries([{'an':1}])
    >>> r1 = {'an':'1', 'ci': ci1}
    >>> rx = {'an':'x'}
    >>> r2 = {'an':'2', 'ci': ci2}
    >>> r3 = {'an':'3', 'ci': ci3}    
    >>> list( yield_citations([r1, rx, r2, r3]) )
    [('1', ['2', '3']), ('2', ['3', '1']), ('3', ['1'])]
    """
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            # lazy formatting: the message is built only if it is emitted
            logging.info("[yield_citations]%s records processed", i)
        if "ci" not in zbl:
            continue
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        cis = zbl_io.unpack_list_of_dictionaries(zbl["ci"])
        # keep only citations that were identified (carry the id field);
        # `in` replaces the deprecated dict.has_key()
        identified_ci_ids = [ci[zbl_io.ZBL_ID_FIELD] for ci in cis
                             if zbl_io.ZBL_ID_FIELD in ci]
        if not identified_ci_ids:
            continue
        yield (zbl_id, identified_ci_ids)
def yield_citations(zbl_generator):
    """Yields pairs(zbl_id:list-of-ids-of-known-citations).

    A record is skipped when it has no "ci" field or when none of its
    citations carries an identifier. A progress message is logged for every
    10000th record.

    zbl_generator - iterable of ZBL records (dictionaries)

    >>> ci1 = zbl_io.pack_list_of_dictionaries([{'an':2, 'ti':'TITLE'}, {'an':3}])
    >>> ci2 = zbl_io.pack_list_of_dictionaries([{'an':3}, {'py':'1990', 'an':1}])
    >>> ci3 = zbl_io.pack_list_of_dictionaries([{'an':1}])
    >>> r1 = {'an':'1', 'ci': ci1}
    >>> rx = {'an':'x'}
    >>> r2 = {'an':'2', 'ci': ci2}
    >>> r3 = {'an':'3', 'ci': ci3}    
    >>> list( yield_citations([r1, rx, r2, r3]) )
    [('1', ['2', '3']), ('2', ['3', '1']), ('3', ['1'])]
    """
    id_field = zbl_io.ZBL_ID_FIELD
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            # arguments are formatted by logging only when the record is emitted
            logging.info("[yield_citations]%s records processed", i)
        if "ci" not in zbl:
            continue
        zbl_id = zbl[id_field]
        cis = zbl_io.unpack_list_of_dictionaries(zbl["ci"])
        # citations without the id field are unidentified and dropped;
        # membership test replaces the deprecated dict.has_key()
        identified_ci_ids = [ci[id_field] for ci in cis if id_field in ci]
        if not identified_ci_ids:
            continue
        yield (zbl_id, identified_ci_ids)
# Example no. 4
# 0
def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 
    """
    #wczytywanie zbioru na ktory moga wskazywac cytowania:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])
    print "Done.", len(dst_records_ids), " records loaded."

    #statystyki:
    cis_len = []  #liczba cytowan
    cis_matched = []  #liczba cytowan ktore trafiaja w zadany zbior

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue

        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #identyfikatory cytowan:
        identified_ci_ids = list(ci[uq_id_field_name] for ci in cis
                                 if ci.has_key(uq_id_field_name))
        #rekordy dopsowane do cytowan i w zadanym zbiorze:
        filtered_matched_records = list(id for id in identified_ci_ids
                                        if id in dst_records_ids)

        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))

    cis_matched_div_len = list(
        float(m) / float(l) for m, l in zip(cis_matched, cis_len))

    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len), 0), "\t", round(
        min(cis_len),
        0), "\t", round(avg(cis_len),
                        2), "\t", round(max(cis_len),
                                        0), "\t", round(std(cis_len), 2)
    print "-Matching citations:", "\t", round(
        sum(cis_matched), 0), "\t", round(min(cis_matched), 0), "\t", round(
            avg(cis_matched),
            2), "\t", round(max(cis_matched),
                            0), "\t", round(std(cis_matched), 2)
    print "-Fraction of matching citations: - ", "\t", round(
        min(cis_matched_div_len),
        3), "\t", round(avg(cis_matched_div_len), 3), "\t", round(
            max(cis_matched_div_len), 3), "\t", round(std(cis_matched_div_len),
                                                      3)
    print "-Total Number of citations/Matching citations:", "\t", round(
        float(sum(cis_matched)) / sum(cis_len), 3)
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3)

    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched) + 1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c == i)
    print "Histogram:", cis_matched_hist

    n, bins, patches = plt.hist(sorted(cis_matched),
                                bins=max(cis_matched),
                                normed=False,
                                alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()