Example #1
0
def _report_af_quality_(path, records_filter=allow_all_filter):
    """Prints report about authors' identities quality."""
    afs_len = []
    afs_ok_len = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))

    afs_ok_frac = list(
        float(m) / float(l) for m, l in zip(afs_ok_len, afs_len))

    print max(afs_len), "\n", round(avg(afs_len),
                                    2), "\n", round(std(afs_len), 2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len),
                                       2), "\n", round(std(afs_ok_len), 2)
    print round(max(afs_ok_frac),
                2), "\n", round(avg(afs_ok_frac),
                                2), "\n", round(std(afs_ok_frac), 2)
def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 

    Makes two passes over the file: the first collects ids of admissible
    citation destinations, the second matches every admitted record's
    citations ("ci" field) against that set. Prints summary statistics and
    shows a matplotlib histogram (axis labels are in Polish).
    """
    #load the set of record ids that citations may point at:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i,record in enumerate( zbl_io.read_zbl_records(open(path)) ):
        if i%100000 == 0: print i," records considered" #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])            
    print "Done.", len(dst_records_ids), " records loaded."
    
    #statistics:
    cis_len = [] #number of citations per record
    cis_matched = [] #number of citations that hit the destination set
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue
                
        cis                 = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #ids of citations that carry the unique-id field:
        identified_ci_ids   = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name))
        #citations matched to records present in the destination set:
        filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids)
                                         
        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))
      
    #per-record fraction of matched citations (only records that had any "ci"):
    cis_matched_div_len = list( float(m)/float(l) for m,l in zip(cis_matched, cis_len) )
        
    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len),0), "\t", round(min(cis_len),0), "\t", round(avg(cis_len),2), "\t", round(max(cis_len),0), "\t", round(std(cis_len),2)
    print "-Matching citations:",  "\t", round(sum(cis_matched),0), "\t", round(min(cis_matched),0), "\t", round(avg(cis_matched),2), "\t", round(max(cis_matched),0), "\t", round(std(cis_matched),2)
    print "-Fraction of matching citations: - ",  "\t", round(min(cis_matched_div_len),3), "\t", round(avg(cis_matched_div_len),3), "\t", round(max(cis_matched_div_len),3), "\t", round(std(cis_matched_div_len),3)
    print "-Total Number of citations/Matching citations:", "\t", round(float(sum(cis_matched))/sum(cis_len),3)    
    #one-line machine-friendly summary of the same numbers:
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) 
    
    #text histogram: how many records have exactly i matched citations
    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched)+1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c==i)
    print "Histogram:", cis_matched_hist
    
    n, bins, patches = plt.hist(sorted(cis_matched), bins = max(cis_matched), normed=False, alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")    
    plt.show()     
def _report_counts_(counts, label = None):
    """Prints report on count statistics (counts = dictionary{name:count}/list of numbers )."""
    try:
        counts = counts.values()
    except:
        pass
    
    if not label is None:
        print label        
    print " Max:", max(counts)
    print " Min:", min(counts)
    try:
        print " Avg:", round(avg(counts),2)        
    except:
        print " Avg: -"        
    try:
        print " Std:", round(std(counts),2)
    except:
        print " Std: -"
    print " Total num of categories:", len(counts)
    #ile mamy kategorii ktora wystepuje tyle a tyle razy
    print " <5:", sum(1 for c in counts if c<5)
    print " <10:", sum(1 for c in counts if c<10)
    print " <25:", sum(1 for c in counts if c<25)
    print " <50:", sum(1 for c in counts if c<50)
    print " <100:", sum(1 for c in counts if c<100)
    print " <1000:", sum(1 for c in counts if c<1000)
    print " <10000:", sum(1 for c in counts if c<10000)
def _draw_af_hist_(path, records_filter = allow_all_filter):
    """Draws histogram of authorship."""    
    af_count = {} #dict{author: count}
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue        
        
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' )
        
        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1
             
    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values() if af > avg_af_values) , " authors above avg"
    print sum(1 for af in af_count.values() if af < avg_af_values) , " authors below avg"
    
    n, bins, patches = plt.hist(af_count.values(), bins = max(af_count.values()), normed=False, log=True, alpha=0.75)
    plt.xlabel("Liczba wystapien w rekordach")
    plt.ylabel("Liczba autorow")    
    plt.show()     
Example #5
0
def _report_counts_(counts, label=None):
    """Prints report on count statistics (counts = dictionary{name:count}/list of numbers )."""
    try:
        counts = counts.values()
    except:
        pass

    if not label is None:
        print label
    print " Max:", max(counts)
    print " Min:", min(counts)
    try:
        print " Avg:", round(avg(counts), 2)
    except:
        print " Avg: -"
    try:
        print " Std:", round(std(counts), 2)
    except:
        print " Std: -"
    print " Total num of categories:", len(counts)
    #ile mamy kategorii ktora wystepuje tyle a tyle razy
    print " <5:", sum(1 for c in counts if c < 5)
    print " <10:", sum(1 for c in counts if c < 10)
    print " <25:", sum(1 for c in counts if c < 25)
    print " <50:", sum(1 for c in counts if c < 50)
    print " <100:", sum(1 for c in counts if c < 100)
    print " <1000:", sum(1 for c in counts if c < 1000)
    print " <10000:", sum(1 for c in counts if c < 10000)
Example #6
0
def _draw_af_hist_(path, records_filter=allow_all_filter):
    """Draws histogram of authorship."""
    af_count = {}  #dict{author: count}

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1

    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values()
              if af > avg_af_values), " authors above avg"
    print sum(1 for af in af_count.values()
              if af < avg_af_values), " authors below avg"

    n, bins, patches = plt.hist(af_count.values(),
                                bins=max(af_count.values()),
                                normed=False,
                                log=True,
                                alpha=0.75)
    plt.xlabel("Liczba wystapien w rekordach")
    plt.ylabel("Liczba autorow")
    plt.show()
def extract_fv_graph(zbl_generator, multival_field_name = "af", empty_value = "-", container = set):
    """Returns dictionary{id:container-of-ids} that describes connection graph.
    
    Between r1 and r2 is (two-way)link if both r1 contains at least single common value in field of field_name.
        
    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> rx = {'an': 'x'}
    >>> ry = {'an': 'y', 'af': '-'}
    >>> sorted(list(extract_fv_graph([r1,r2,r3,r4]).iteritems())) ==     [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    >>> sorted(list(extract_fv_graph([r1,r2,ry,r3,r4,rx]).iteritems())) ==     [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    """
    fv2ids = extract_fieldvalue2ids(zbl_generator, multival_field_name, empty_value, list)
    fv2count = dict( (fv,len(ids)) for fv,ids in fv2ids.iteritems()) #DBG
    logging.info("[extract_fv_graph] max="+ str( max( fv2count.values() ) ) )  #DBG
    logging.info("[extract_fv_graph] avg="+ str( stats.avg( fv2count.values() ) ) )  #DBG
    logging.info("[extract_fv_graph] std="+ str( stats.std( fv2count.values() ) ) )  #DBG
    id2ids = {} #record id -> flat (possibly repeated) list of neighbour ids
    for rr, (fv,ids) in enumerate(fv2ids.iteritems()):
        if rr%10000 == 0: logging.info("[extract_fv_graph]"+str(rr)+" records processed")
        for i in xrange(len(ids)): #for every record that has this field value
            curr_id = ids[i]
            peer_ids = ids[:i] + ids[i+1:] #all its neighbours; fix: slicing instead of index-filter loop
            if len(peer_ids) > 0:
                id2ids.setdefault(curr_id, []).extend(peer_ids) #fix: setdefault instead of get/extend/store
    #fix: renamed loop variable so the builtin 'id' is no longer shadowed
    return dict( (rid, container(peers)) for rid,peers in id2ids.iteritems())
def _report_af_quality_(path, records_filter = allow_all_filter):
    """Prints report about authors' identities quality."""    
    afs_len = []
    afs_ok_len = []
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue        
        
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' )
        
        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))
        
    afs_ok_frac = list( float(m)/float(l) for m,l in zip(afs_ok_len, afs_len) )    
        
    print max(afs_len), "\n", round(avg(afs_len),2), "\n", round(std(afs_len),2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len),2), "\n", round(std(afs_ok_len),2)
    print round(max(afs_ok_frac),2), "\n", round(avg(afs_ok_frac),2), "\n", round(std(afs_ok_frac),2)
def extract_fv_graph(zbl_generator,
                     multival_field_name="af",
                     empty_value="-",
                     container=set):
    """Returns dictionary{id:container-of-ids} that describes connection graph.
    
    Between r1 and r2 is (two-way)link if both r1 contains at least single common value in field of field_name.
        
    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> rx = {'an': 'x'}
    >>> ry = {'an': 'y', 'af': '-'}
    >>> sorted(list(extract_fv_graph([r1,r2,r3,r4]).iteritems())) ==     [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    >>> sorted(list(extract_fv_graph([r1,r2,ry,r3,r4,rx]).iteritems())) ==     [('1', set(['2', '4'])), ('2', set(['1', '4'])), ('3', set(['4'])), ('4', set(['1', '2', '3']))]
    True
    """
    fv2ids = extract_fieldvalue2ids(zbl_generator, multival_field_name,
                                    empty_value, list)
    fv2count = dict((fv, len(ids)) for fv, ids in fv2ids.iteritems())  #DBG
    logging.info("[extract_fv_graph] max=" + str(max(fv2count.values())))  #DBG
    logging.info("[extract_fv_graph] avg=" +
                 str(stats.avg(fv2count.values())))  #DBG
    logging.info("[extract_fv_graph] std=" +
                 str(stats.std(fv2count.values())))  #DBG
    id2ids = {}  #record id -> flat (possibly repeated) list of neighbour ids
    for rr, (fv, ids) in enumerate(fv2ids.iteritems()):
        if rr % 10000 == 0:
            logging.info("[extract_fv_graph]" + str(rr) + " records processed")
        for i in xrange(len(ids)):  #for every record that has this field value
            curr_id = ids[i]
            #all its neighbours; fix: slicing instead of index-filter loop
            peer_ids = ids[:i] + ids[i + 1:]
            if len(peer_ids) > 0:
                #fix: setdefault instead of get/extend/store
                id2ids.setdefault(curr_id, []).extend(peer_ids)
    #fix: renamed loop variable so the builtin 'id' is no longer shadowed
    return dict((rid, container(peers)) for rid, peers in id2ids.iteritems())
def _draw_mc_hist(path, records_filter = allow_all_filter):
    """Draws histogram of MSC codes occurrence in records."""    
    mc_counts = []
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("mc") or not records_filter(record):
            continue          
        
        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))
        
    print len(mc_counts), " record found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    n, bins, patches = plt.hist(mc_counts, bins = max(mc_counts), normed=False, alpha=0.75)
    plt.xlabel("Liczba kodow MSC w rekordzie")
    plt.ylabel("Liczba rekordow")    
    plt.show()                  
Example #11
0
def _draw_mc_hist(path, records_filter=allow_all_filter):
    """Draws histogram of MSC codes occurrence in records."""
    mc_counts = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("mc") or not records_filter(record):
            continue

        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))

    print len(mc_counts), " record found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    n, bins, patches = plt.hist(mc_counts,
                                bins=max(mc_counts),
                                normed=False,
                                alpha=0.75)
    plt.xlabel("Liczba kodow MSC w rekordzie")
    plt.ylabel("Liczba rekordow")
    plt.show()
Example #12
0
def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 

    Makes two passes over the file: the first collects ids of admissible
    citation destinations, the second matches every admitted record's
    citations ("ci" field) against that set. Prints summary statistics and
    shows a matplotlib histogram (axis labels are in Polish).
    """
    #load the set of record ids that citations may point at:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])
    print "Done.", len(dst_records_ids), " records loaded."

    #statistics:
    cis_len = []  #number of citations per record
    cis_matched = []  #number of citations that hit the destination set

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue

        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #ids of citations that carry the unique-id field:
        identified_ci_ids = list(ci[uq_id_field_name] for ci in cis
                                 if ci.has_key(uq_id_field_name))
        #citations matched to records present in the destination set:
        filtered_matched_records = list(id for id in identified_ci_ids
                                        if id in dst_records_ids)

        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))

    #per-record fraction of matched citations (only records that had any "ci"):
    cis_matched_div_len = list(
        float(m) / float(l) for m, l in zip(cis_matched, cis_len))

    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len), 0), "\t", round(
        min(cis_len),
        0), "\t", round(avg(cis_len),
                        2), "\t", round(max(cis_len),
                                        0), "\t", round(std(cis_len), 2)
    print "-Matching citations:", "\t", round(
        sum(cis_matched), 0), "\t", round(min(cis_matched), 0), "\t", round(
            avg(cis_matched),
            2), "\t", round(max(cis_matched),
                            0), "\t", round(std(cis_matched), 2)
    print "-Fraction of matching citations: - ", "\t", round(
        min(cis_matched_div_len),
        3), "\t", round(avg(cis_matched_div_len), 3), "\t", round(
            max(cis_matched_div_len), 3), "\t", round(std(cis_matched_div_len),
                                                      3)
    print "-Total Number of citations/Matching citations:", "\t", round(
        float(sum(cis_matched)) / sum(cis_len), 3)
    #one-line machine-friendly summary of the same numbers:
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3)

    #text histogram: how many records have exactly i matched citations
    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched) + 1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c == i)
    print "Histogram:", cis_matched_hist

    n, bins, patches = plt.hist(sorted(cis_matched),
                                bins=max(cis_matched),
                                normed=False,
                                alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()