def getReliableData(cols=['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
                          'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
                          'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
                          'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
                          'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
                          'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
                          'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
                          'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.pv_id',
                          'Input.global_user_id', 'Input.time', 'Input.declaration',
                          'Answer.Q1', 'Approve', 'Reject']):
    # Build the silver set: answers kept after dropping unreliable workers' entries.
    dataSet = qbRel.analyseWorkers()
    badEntries = qbRel.pickBadEntries(dataSet)
    # print dataSet
    dataSet = dataSet.drop(badEntries.index)
    dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    silverSet = dataSet['Input.declaration']
    # print silverSet

    # Load the full dataset.
    paths = qbPre.listFiles(qbGbl.oriFileName)
    filData = qbPre.readFiles(paths)
    filData = filData[cols]
    filData.index = range(len(filData))

    # Remove the observations that were verified earlier as the silver set.
    dups = pd.DataFrame()
    for dec in silverSet:
        # dups = dups.append(filData[filData['Input.declaration'] == dec])
        filData = filData.drop(filData[filData['Input.declaration'] == dec].index)

    # Drop unreliable observations from the remaining data, merge it back with
    # the silver set, and write the combined result out.
    badEntries = qbRel.pickBadObs(filData)
    filData = filData.drop(badEntries.index)
    dataSet = dataSet.append(filData)
    dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    dataSet.index = range(len(dataSet))
    # print dataSet
    dataSet.to_csv(qbGbl.finalReaderFile, header=False)
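# The loop in getReliableData removes, one declaration at a time, every observation
# whose 'Input.declaration' was already verified through the reliable workers (the
# silver set) before the remaining rows are appended back. A minimal standalone
# sketch of the same filter using pandas' isin; the column names come from
# getReliableData above, while the toy data and the helper name are hypothetical:
def _silver_filter_demo():
    import pandas as pd
    silverSet = pd.Series(['decl_a', 'decl_b'])   # toy verified declarations
    filData = pd.DataFrame({                      # toy full dataset
        'WorkerId': ['W1', 'W2', 'W3'],
        'Input.declaration': ['decl_a', 'decl_c', 'decl_b'],
        'Answer.Q1': ['yes', 'no', 'yes']})
    # Keep only rows whose declaration is NOT already in the silver set;
    # equivalent to the per-declaration drop loop, in a single vectorised step.
    return filData[~filData['Input.declaration'].isin(silverSet)]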
def analyseWorkers():
    dataSet = qbRel.analyseWorkers()
    workers = list(dataSet['WorkerId'].drop_duplicates())
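# analyseWorkers above stops after collecting the distinct WorkerIds. A minimal
# sketch of one way a per-worker breakdown could continue; the grouping key and
# the 'Answer.Q1' column come from the code above, while the aggregation itself
# and the helper name are hypothetical:
def _per_worker_answer_counts_demo(dataSet):
    # Count how many answers each worker submitted, as {WorkerId: count}.
    return dataSet.groupby('WorkerId')['Answer.Q1'].count().to_dict()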