from pubstats import h_index
import gscholar

if __name__ == '__main__':
    # Plot an author's h-index for every year from a given start year through
    # the current year.
    #
    # Command line: <name> <url> <start_year>
    #
    # NOTE(review): sys, datetime, compute_individual_stats and the plotting
    # helpers figure/plot/show are not imported in the visible part of this
    # file; they are assumed to already be in scope -- confirm.
    if len(sys.argv) < 4:
        # Fail with a readable message instead of a bare IndexError traceback.
        sys.exit('usage: %s <name> <url> <start_year>' % sys.argv[0])

    name = sys.argv[1]
    url = sys.argv[2]
    year = int(sys.argv[3])

    pubs, all_pubs, hidx, eidx = compute_individual_stats(name, url)

    # get all citing publications
    print('Obtaining all citations')
    citations = {}
    for pub in pubs:
        citations[pub] = gscholar.get_citations(pub)

    # compute the h-index for author over productive years
    now = datetime.date.today().year
    # Build the year axis once and reuse it for both the loop and the plot.
    X = list(range(year, now + 1))
    Y = []
    for yr in X:
        print('YEAR=%d' % yr)
        for pub in pubs:
            # Overwrite pub.cites with the number of citing works dated yr or
            # earlier; citations without a year are ignored. Presumably this
            # is the attribute h_index reads -- confirm.
            # Counting with a generator avoids relying on filter() returning
            # a list.
            pub.cites = sum(
                1 for cite in citations[pub]
                if cite.year is not None and cite.year <= yr
            )
        Y.append(h_index(pubs))

    figure()
    plot(X, Y)
    show()
def _mean_and_sdev(values):
    """Return (mean, population standard deviation) of a list of numbers.

    Raises ZeroDivisionError when `values` is empty.
    """
    count = float(len(values))
    mean = sum(values) / count
    variance = sum([math.pow(x - mean, 2) for x in values]) / count
    return mean, math.sqrt(variance)


def validate(filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0,replace_url_fxn=None):
    """
    Run Topp on the test data using the filters and parameters specified
    (filters, min_vocab_match_size, vocab_use_pct) and summarise the results.

    Each row of the comma-separated file validation/batch.txt is one test
    case and is forwarded to compute_test_case_stats together with the three
    parameters above.

    replace_url_fxn(name,url,data) is a function that produces text content
    that should be used in place of the individual's URL.  The output should
    be a string.  When it is given, the content is written to a temporary
    file under the validation data directory and a file:// URL pointing at
    that file is passed on instead of the row's own URL.  The temporary file
    is not removed afterwards.

    Prints a separator line followed by an AVG line and an SDEV line for
    sensitivity, specificity and false-positive fraction.

    Returns a 5-tuple:
        (avg_sense, sdv_sense),
        (avg_spec, sdv_spec),
        (avg_fp_pct, sdv_fp_pct),
        (hindicies, true_hindicies),
        (eindicies, true_eindicies)
    where each sdv_* is the population standard deviation and the last two
    pairs hold per-record lists of estimated and true h-/e-indices.

    Raises ZeroDivisionError if the batch file contains no records.
    """
    tmp_path = rb().data_path('validation','__TMP__REPLACE_CONTENT__.txt')
    tmp_url = 'file://%s' % tmp_path

    reader = csv.reader(rb().open_data(['validation','batch.txt'],'r'),delimiter=',')

    params = [filters, min_vocab_match_size, vocab_use_pct]

    senses = []
    specs = []
    fp_pcts = []
    hindicies = []
    true_hindicies = []
    eindicies = []
    true_eindicies = []

    for data in reader:
        if replace_url_fxn is None:
            case_args = list(data)
        else:
            # get the new content
            content = replace_url_fxn(*data)

            # write the content to the temporary file; `with` guarantees the
            # handle is closed even if the write fails
            with open(tmp_path, 'w') as fh:
                fh.write(content)

            case_args = [data[0], tmp_url, data[2]]

        (all_pubs, est_pubs, true_pubs,
         num_pubs, num_est_pubs, num_true_pubs, num_true_1995pubs,
         tp, fp, fn) = compute_test_case_stats(*(case_args + params))

        hindicies.append(pubstats.h_index(est_pubs))
        eindicies.append(pubstats.e_index(est_pubs))
        true_hindicies.append(pubstats.h_index(true_pubs))
        true_eindicies.append(pubstats.e_index(true_pubs))

        # Everything that is neither a true/false positive nor a false
        # negative counts as a true negative.
        tn = num_pubs - tp - fp - fn

        senses.append(sensitivity(tp, fn))
        specs.append(specificity(tn, fp))

        # Fraction of estimated positives that are wrong; infinity when no
        # positives were estimated at all.
        fp_pct = float(fp) / float(tp + fp) if (tp + fp) != 0 else float('infinity')
        fp_pcts.append(fp_pct)

    # The original divided the summed squared deviations by the record count
    # but never took the square root, so the values labelled SDEV were
    # variances.  The helper returns a true standard deviation.
    avg_sense, sdv_sense = _mean_and_sdev(senses)
    avg_spec, sdv_spec = _mean_and_sdev(specs)
    avg_fp_pct, sdv_fp_pct = _mean_and_sdev(fp_pcts)

    print('='*80)
    print('AVG\t%f\t%f\t%f' % (avg_sense,avg_spec,avg_fp_pct))
    print('SDEV\t%f\t%f\t%f' % (sdv_sense,sdv_spec,sdv_fp_pct))

    return (avg_sense,sdv_sense),(avg_spec,sdv_spec),(avg_fp_pct,sdv_fp_pct),(hindicies,true_hindicies),(eindicies,true_eindicies)