Example #1
from sklearn.preprocessing import MultiLabelBinarizer

def generateY(filData):

	# initiate transformer : binary label encoder
	# stop_words left as None so that the 'None' answers are kept as negative examples
	# yTransformer = CountVectorizer(min_df = 0.0, binary=True, lowercase = False)#, stop_words=[u'None']);
	# NOTE: yTransformer was never defined in the original snippet and the CountVectorizer
	# above was commented out; because fit_transform is called on lists of labels and
	# classes_ is read below, a MultiLabelBinarizer is assumed here.
	yTransformer = MultiLabelBinarizer()

	# vectorize
	# Y = yTransformer.fit_transform(filData['answer']); 
	# print Y

	# split every pipe-separated answer into its list of class labels
	newY = [];
	for answer in filData['answer']:
		temp = qbPre.convClasses(answer,'|');
		newY.append(temp);

	# binary indicator matrix: one column per topic label
	Y = yTransformer.fit_transform(newY)

	# save topic labels to a reference dictionary
	qbGbl.classDict = yTransformer.classes_;

	return Y
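A note on the fix above: yTransformer is not defined anywhere in this snippet, and the commented-out CountVectorizer would not accept the lists of labels that qbPre.convClasses appears to return, so a MultiLabelBinarizer was assumed. A minimal standalone sketch of that assumed encoding, with the label splitting inlined and illustrative answer strings:

from sklearn.preprocessing import MultiLabelBinarizer

# illustrative pipe-separated answers, as they would appear in filData['answer']
answers = ['sports|politics', 'None', 'politics|economy']

# split each answer into its list of topic labels (what qbPre.convClasses is assumed to do)
labels = [a.split('|') for a in answers]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)   # binary indicator matrix, one column per topic
print(mlb.classes_)             # e.g. ['None' 'economy' 'politics' 'sports']
print(Y)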
Example #2
def analyse(filename):

	# filData = qbPre.readDataFrame(filename,None,0);
	# filData = filData[['WorkerId','Input.declaration','Answer.Q1']]

	# new = filData

	# filename = '{0}'.format(qbGbl.filFileName) 
	# # filData = pd.DataFrame(columns=['index','worker','declaration','answer'])
	# filData = qbPre.readDataFrame(filename,None,None);
	# filData.columns = ['index','worker','declaration','answer'];

	# del filData['index']

	# old = filData

	# oldDecs = []
	
	# for row in new['Input.declaration']:
	# 	if (old[old['declaration'] == row].empty):
	# 		continue;
	# 	else:
	# 		oldDecs.append(numpy.array(old[old['declaration'] == row])[1])

	# oldDecs = pd.DataFrame(oldDecs,columns=['worker','declaration','answer'])
	
	# oldDecs.to_csv('{0}/PerfectDataset.csv'.format(qbGbl.oriFileName),index = False);
	
	## ===============================================================================
	filData = qbPre.readDataFrame(filename,None,0);
	filData = filData[['WorkerId','Input.declaration','Answer.Q1']]

	new = filData

	filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)

	old = qbPre.readDataFrame(filename,None,0)

	# print new['Input.declaration'].nunique()
	# print len(old['declaration'].unique())
	# unique gold-standard declarations (note that the filData name is re-used for this series)
	filData = pd.Series(old['declaration']).drop_duplicates()
	print filData
	# tempSer = pd.Series(new['Input.declaration']).drop_duplicates()
	tempSer = new.drop_duplicates(cols=['Input.declaration'])	# cols= is the pre-0.17 pandas keyword (subset= in later versions)
			
	print len (new)
	print len(tempSer)

	# print len(filData)
	# print '================='
	accuracy = []
	count=0;
	# print len(new)
	# print len(old)

	for row in filData:
		# print row
		if not (new[new['Input.declaration'] == row].empty):
			if len(new[new['Input.declaration'] == row])>1:
				print new[new['Input.declaration'] == row]
			count += len(new[new['Input.declaration'] == row])
			# gold-standard topics and the worker's topics for this declaration
			tempOld = qbPre.convClasses(list(old[old['declaration'] == row]['answer'])[0],'|')
			# print tempOld
			tempNew = qbPre.convClasses(list(new[new['Input.declaration'] == row]['Answer.Q1'])[0],'|')
			# print tempNew
			# score = fraction of the worker's topics that also appear in the gold answer
			tempScore = 0.0;
			for topic in tempNew:
				if topic in tempOld:
					tempScore += 1.0;

			tempScore /= float(len(tempNew))
			accuracy.append(tempScore)
	
	# mean score over all matched declarations
	print scipy.stats.tmean(accuracy)

	print count

	P.figure();

	# cumulative, normalised histogram of the per-declaration scores
	# (normed= is the old matplotlib keyword; density= in matplotlib >= 2.1)
	n, bins, patches = P.hist(accuracy,len(set(accuracy)), histtype='step',cumulative=True,normed=1)

	P.title("Score distribution")
	P.xlabel("score")
	P.ylabel("Frequency")
	P.show()
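The scoring inside the loop above is a simple overlap ratio: the fraction of topics in a worker's answer that also appear in the reference answer for the same declaration. A standalone sketch of that computation, with illustrative answer strings:

def overlapScore(reference, answer, sep='|'):
	# fraction of the answer's topics that also appear in the reference answer
	refTopics = reference.split(sep)
	ansTopics = answer.split(sep)
	hits = sum(1.0 for topic in ansTopics if topic in refTopics)
	return hits / len(ansTopics)

print(overlapScore('politics|economy|sports', 'politics|health'))  # 0.5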
Example #3
def analyseWorkers():
	filData = qbPre.readDataFrame(qbGbl.filFileName,None,0);

	workers = filData.WorkerId.drop_duplicates();

	filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)

	old = qbPre.readDataFrame(filename,None,0)

	perfectDecs = old.declaration.drop_duplicates()

	dataSet = pd.DataFrame()

	for row in perfectDecs:
		if not filData[filData['Input.declaration']==row].empty:
			if dataSet.empty:
				dataSet = filData[filData['Input.declaration']==row]
			else:
				dataSet = dataSet.append(filData[filData['Input.declaration']==row])


	dataSet.SubmitTime = pd.to_datetime(dataSet.SubmitTime)

	# dataSet = pd.DataFrame(dataSet.values,
	# 	columns=['SubmitTime','WorkerId','Input.declaration','Answer.Q1'])

	# print dataSet

	# order the records chronologically (.sort() is pre-0.17 pandas; sort_values(by=...) in later versions)
	dataSet = dataSet.sort(columns=['SubmitTime'])
	# firstDate = list(dataSet.SubmitTime)[0]

	records = [];
	# print dataSet
	workers = dataSet.WorkerId.drop_duplicates();

	for worker in workers:
		tempStats = {'score':0.0,'freq':0}
		tempRecords = dataSet[dataSet.WorkerId==worker]
		for row in tempRecords.itertuples():
			newRow = list(row)
			# print newRow[-2]
		 	# print old[old['declaration'] == newRow[-2]]
			# gold-standard topics for this declaration and the worker's topics
			tempOld = qbPre.convClasses(list(old[old['declaration'] == newRow[-2]].answer)[0],'|')
			# print tempOld
			tempNew = qbPre.convClasses(newRow[-1],'|')
			# print tempNew
			# fraction of the worker's topics that also appear in the gold answer
			tempScore = 0.0;
			for topic in tempNew:
				if topic in tempOld:
					tempScore += 1.0;
			tempScore /= float(len(tempNew))

			tempStats['freq'] += 1; # frequency ++
			tempStats['score'] += tempScore;
			
			# running mean score for this worker so far
			aggrScore = tempStats['score']/tempStats['freq']
			# submission time as a fractional hour of the day
			tm = row[1].time()
			tm = float(tm.hour) + float(tm.minute) / 60
			# print tm

			# fraction of this worker's records processed so far
			compl = float(tempStats['freq'])/float(len(tempRecords))
			# del newRow[0]
			newRow.extend([tm,tempScore,aggrScore,compl])

			records.append(newRow)

	records = numpy.array(records);

	# rebuild the frame with the derived columns, keeping the original row index
	dataSet = pd.DataFrame(records[:,1:],
		columns = ['SubmitTime','WorkerId','Input.declaration','Answer.Q1','Time','TempScore','AggrScore','Completion'],
		index=records[:,0])

	return dataSet
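analyseWorkers returns one row per answered declaration, with the per-answer score (TempScore), the worker's running mean score (AggrScore) and completion fraction added as columns. A brief sketch of how the returned frame might be summarised per worker; the derived columns are coerced to float first, since the numpy round-trip above can leave them with object dtype:

workerStats = analyseWorkers()

# coerce the derived columns back to floats before aggregating
for col in ['Time', 'TempScore', 'AggrScore', 'Completion']:
	workerStats[col] = workerStats[col].astype(float)

# mean per-answer score for each worker
print(workerStats.groupby('WorkerId')['TempScore'].mean())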