def _report_msc_counts_(path, selected_fields):
    """Print a report on the MSC distribution among selected records.

    Only records for which ``has_record_fields(record, selected_fields)``
    holds are passed on to the counting. Codes are upper-cased, a histogram
    of the per-code counts is shown (``log=True``), and the counts are then
    handed to ``_report_counts_`` once for all codes and once for every MSC
    pattern subset.

    Args:
        path: Path of the records file to open and read.
        selected_fields: Fields forwarded to ``has_record_fields`` for every
            record.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(path) as records_file:
        raw_counts = count_msc_occurences(
            records_file,
            lambda record: has_record_fields(record, selected_fields))

    # Normalise codes to upper case. Codes that differ only in case are
    # summed; rebuilding through dict() would keep just one of their counts.
    msc_counts = {}
    for code, count in raw_counts.iteritems():
        code = code.upper()
        msc_counts[code] = msc_counts.get(code, 0) + count

    # (title, pattern) pairs in the order in which the reports are emitted.
    pattern_reports = [
        ("--MSC_LEAF_PATTERN:--", msc_processing.MSC_LEAF_PATTERN),
        ("--MSC_ORDINARY_LEAF_PATTERN:--", msc_processing.MSC_ORDINARY_LEAF_PATTERN),
        ("--MSC_SPECIAL_LEAF_PATTERN:--", msc_processing.MSC_SPECIAL_LEAF_PATTERN),
        ("--MSC_SECOND_LEVEL:--", msc_processing.MSC_SECOND_LEVEL),
        ("--MSC_ORDINARY_SECOND_LEVEL:--", msc_processing.MSC_ORDINARY_SECOND_LEVEL),
        ("--MSC_SPECIAL_SECOND_LEVEL:--", msc_processing.MSC_SPECIAL_SECOND_LEVEL),
    ]
    # Filter before plotting, as the original did.
    filtered_reports = [(title, filter_counts(msc_counts, pattern))
                        for title, pattern in pattern_reports]

    #############################################
    #draw_occur_hist(msc_counts, zoom_out=100, title='Histogram liczby wystapien kategorii w rekordach', xlabel='Ranga kategorii', ylabel='Liczba wystapien w rekordach')
    # One bin per 100 distinct codes, but never fewer than one: a bin count
    # of 0 (fewer than 100 codes) makes the histogram call fail. Floor
    # division keeps the bin count an integer under true division as well.
    num_bins = max(1, len(msc_counts) // 100)
    # `normed=False` was the default and the keyword is rejected by current
    # Matplotlib releases, so it is omitted; the plot is unchanged.
    plt.hist(list(msc_counts.values()), num_bins, log=True, alpha=0.75)
    plt.xlabel("Ranga kategorii")
    plt.ylabel("Liczba wystapien w rekordach")
    plt.show()

    _report_counts_(msc_counts, "--All categories:--")
    for title, counts in filtered_reports:
        _report_counts_(counts, title)
def _report_msc_counts_(path, selected_fields):
    """Print a report on the MSC distribution among selected records.

    Only records for which ``has_record_fields(record, selected_fields)``
    holds are passed on to the counting. Codes are upper-cased, a histogram
    of the per-code counts is shown (``log=True``), and the counts are then
    handed to ``_report_counts_`` once for all codes and once for every MSC
    pattern subset.

    Args:
        path: Path of the records file to open and read.
        selected_fields: Fields forwarded to ``has_record_fields`` for every
            record.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(path) as records_file:
        raw_counts = count_msc_occurences(
            records_file,
            lambda record: has_record_fields(record, selected_fields))

    # Normalise codes to upper case. Codes that differ only in case are
    # summed; rebuilding through dict() would keep just one of their counts.
    msc_counts = {}
    for code, count in raw_counts.iteritems():
        code = code.upper()
        msc_counts[code] = msc_counts.get(code, 0) + count

    # (title, pattern) pairs in the order in which the reports are emitted.
    pattern_reports = [
        ("--MSC_LEAF_PATTERN:--", msc_processing.MSC_LEAF_PATTERN),
        ("--MSC_ORDINARY_LEAF_PATTERN:--", msc_processing.MSC_ORDINARY_LEAF_PATTERN),
        ("--MSC_SPECIAL_LEAF_PATTERN:--", msc_processing.MSC_SPECIAL_LEAF_PATTERN),
        ("--MSC_SECOND_LEVEL:--", msc_processing.MSC_SECOND_LEVEL),
        ("--MSC_ORDINARY_SECOND_LEVEL:--", msc_processing.MSC_ORDINARY_SECOND_LEVEL),
        ("--MSC_SPECIAL_SECOND_LEVEL:--", msc_processing.MSC_SPECIAL_SECOND_LEVEL),
    ]
    # Filter before plotting, as the original did.
    filtered_reports = [(title, filter_counts(msc_counts, pattern))
                        for title, pattern in pattern_reports]

    #############################################
    #draw_occur_hist(msc_counts, zoom_out=100, title='Histogram liczby wystapien kategorii w rekordach', xlabel='Ranga kategorii', ylabel='Liczba wystapien w rekordach')
    # One bin per 100 distinct codes, but never fewer than one: a bin count
    # of 0 (fewer than 100 codes) makes the histogram call fail. Floor
    # division keeps the bin count an integer under true division as well.
    num_bins = max(1, len(msc_counts) // 100)
    # `normed=False` was the default and the keyword is rejected by current
    # Matplotlib releases, so it is omitted; the plot is unchanged.
    plt.hist(list(msc_counts.values()), num_bins, log=True, alpha=0.75)
    plt.xlabel("Ranga kategorii")
    plt.ylabel("Liczba wystapien w rekordach")
    plt.show()

    _report_counts_(msc_counts, "--All categories:--")
    for title, counts in filtered_reports:
        _report_counts_(counts, title)
avg, std_dev = calc_avg_dev(val['degrees']) print "List of degrees: "+str(val['degrees']) print "Average degree: "+str(avg) print "Standard deviation of a degree: "+str(std_dev) avg, std_dev = calc_avg_dev(val['counts']) print "List of counts: "+str(val['counts']) print "Average count: "+str(avg) print "Standard deviation of a count: "+str(std_dev) #def print_counts(counts_lowest, counts_higher, curr_labels, start_printable): # ''' # Print the dictionaries in a nested way. # ''' # if counts_higher: # for curr_l in curr_labels: # print start_printable+"[print_counts] key: "+curr_l+": "+str(counts_higher[0][curr_l]['count']+", children: "+str(len(counts_higher[0][curr_l]['elements']))) # print_counts(counts_lowest, counts_higher[1:], counts_higher[0][curr_l]['elements'], start_printable+'\t') # else: # for curr_l in curr_labels: # print start_printable+"[print_counts] key: "+curr_l+": "+str(counts_lowest[curr_l]) if __name__ == '__main__': fname = sys.argv[1] print "fname:", fname counts_lowest, counts_higher = count_label_statistics(count_msc_occurences(open(fname, 'r')), label_mappings = [lambda x: x[:3], lambda x: x[:2]]) #print counts_lowest, counts_higher data4avg_stats = {} print_counts(counts_lowest, counts_higher, list(counts_higher[0].iterkeys()), '', data4avg_stats) print_avg_stats(data4avg_stats)