Example #1
0
import sys
import datetime

from pubstats import h_index
import gscholar

# NOTE(review): figure/plot/show and compute_individual_stats are used below
# but not imported here -- presumably `from pylab import *` and a sibling
# module in the original project; confirm before running standalone.

if __name__ == '__main__':
	# Usage: <script> <author name> <profile url> <first productive year>
	name = sys.argv[1]
	url = sys.argv[2]
	year = int(sys.argv[3])
	
	pubs,all_pubs,hidx,eidx = compute_individual_stats(name,url)
	
	# Fetch the citing publications for every publication found.
	print('Obtaining all citations')
	citations = {}
	for pub in pubs:
		citations[pub] = gscholar.get_citations(pub)
	
	# Re-compute the author's h-index for each year of their career by
	# counting only the citations made on or before that year.
	now = datetime.date.today().year
	
	X = range(year,now+1)
	Y = []
	for yr in range(year,now+1):
		print('YEAR=%d' % yr)
		for pub in pubs:
			# Citations with a known year that had occurred by `yr`.
			pub.cites = sum(1 for x in citations[pub] if x.year is not None and x.year <= yr)
		
		Y.append(h_index(pubs))
		
	# Plot h-index versus year.
	figure()
	plot(X,Y)
	show()
Example #2
0
def validate(filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0,replace_url_fxn=None):
	"""
	Run Topp on the test data using the filters and parameters specified
	(filters, min_vocab_match_size, vocab_use_pct).
	
	replace_url_fxn(name,url,data) is a function that produces text content that
	should be used in place of the individual's URL.  The output should be a string.
	
	Returns ((avg_sense,sd_sense), (avg_spec,sd_spec), (avg_fp_pct,sd_fp_pct),
	(hindices,true_hindices), (eindices,true_eindices)), where the averages and
	standard deviations are taken over every record in the validation batch.
	"""
	def _mean_and_sd(values):
		# Population mean and standard deviation of a non-empty sequence.
		# NOTE: the original code reported the *variance* under the SDEV
		# label (no sqrt); math.sqrt fixes that.
		mean = sum(values) / float(len(values))
		sd = math.sqrt(sum(math.pow(v - mean,2) for v in values) / float(len(values)))
		return mean,sd
	
	# Temporary file used to serve replacement content via a file:// URL.
	tmp_path = rb().data_path('validation','__TMP__REPLACE_CONTENT__.txt')
	tmp_url = 'file://%s' % tmp_path
	
	reader = csv.reader(rb().open_data(['validation','batch.txt'],'r'),delimiter=',')
	
	senses = []
	specs = []
	fp_pcts = []
	hindices = []
	true_hindices = []
	eindices = []
	true_eindices = []
	
	num_records = 0
	for data in reader:
		num_records += 1
		if replace_url_fxn is None:
			all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*(list(data) + [filters,min_vocab_match_size,vocab_use_pct]))
		else:
			# Produce the replacement content for this individual and serve
			# it from the temp file instead of their real URL.
			content = replace_url_fxn(*data)
			
			# 'with' guarantees the handle is closed even if write() raises
			# (the original leaked the handle on error).
			with open(tmp_path,'w') as fh:
				fh.write(content)
			
			all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*([data[0],tmp_url,data[2]] + [filters,min_vocab_match_size,vocab_use_pct]))
			
		hindices.append(pubstats.h_index(est_pubs))
		eindices.append(pubstats.e_index(est_pubs))
		true_hindices.append(pubstats.h_index(true_pubs))
		true_eindices.append(pubstats.e_index(true_pubs))
		# True negatives: everything not already counted as TP/FP/FN.
		tn = num_pubs - tp - fp - fn
		sense = sensitivity(tp,fn)
		senses.append(sense)
		spec = specificity(tn,fp)
		specs.append(spec)
		# Fraction of estimated publications that are false positives;
		# infinite when nothing at all was estimated (tp + fp == 0).
		fp_pct = float(fp) / float(tp + fp) if (tp + fp) != 0 else float('infinity')
		fp_pcts.append(fp_pct)
	
	avg_sense,sdv_sense = _mean_and_sd(senses)
	avg_spec,sdv_spec = _mean_and_sd(specs)
	avg_fp_pct,sdv_fp_pct = _mean_and_sd(fp_pcts)
	
	print('='*80)
	print('AVG\t%f\t%f\t%f' % (avg_sense,avg_spec,avg_fp_pct))
	print('SDEV\t%f\t%f\t%f' % (sdv_sense,sdv_spec,sdv_fp_pct))
	
	return (avg_sense,sdv_sense),(avg_spec,sdv_spec),(avg_fp_pct,sdv_fp_pct),(hindices,true_hindices),(eindices,true_eindices)