def processSteps(self, trainDocuments, trainLabels, testDocuments, testLabels, split, active): trainDoc = trainDocuments trainLab = trainLabels results = [] exampleCtr = 0 stepCtr = 0 for step in split: stepCtr = stepCtr +1 if active: [trainDoc,trainLab] = sel.uncertainty_sampling(trainDoc,trainLab) # # train all document for the current step for idx in range(step): exampleCtr = exampleCtr +1 print 'Step ' + str(stepCtr) + ' Example ' +str(exampleCtr) clf.online_train(trainDoc[idx], trainLab[idx]) trainDoc[idx].save() # # drop already trained examples trainDoc = trainDoc[step:] trainLab = trainLab[step:] # make predictions preds = [clf.predict(testDocuments[idx]) for idx in range(len(testDocuments))] results = results + [preds] # with open('results', 'wt') as resultFile: pprint(results, stream=resultFile) # print self.evaluate(testLabels)
def run(self, trainDocs, trainLabels, testDocs, testLabels, options, active=False): trainD = trainDocs trainL = trainLabels instanceCtr = 0 stepCtr = 0 export = {} for step in options['split']: stepCtr += 1 if active: [trainD, trainL] = sel.uncertainty_sampling(trainD, trainL, saveScores=False) # for i in range(step): instanceCtr += 1 print 'Step ' + str(stepCtr) + ' Example ' +str(instanceCtr) clf.online_train(trainD[i], [trainL[i]]) # preds = map(lambda t: clf.predict(t, saveScores=False), testDocs) accuracy = self.evaluate(preds, testLabels) # export.update({stepCtr: {"step": step, "preds": preds, "accuracy": accuracy, "runOrder": map(lambda i: trainD[i].doc_id, range(step))}}) # remove preciding training instances trainD = trainD[step:] trainL = trainL[step:] # # clear database from all data produced by the run (WARNING: also clears data not produced by the run) call_command('wipeDB', 'label') return export
def training(request): context = {} # a dict with content used in the template labels = Label.objects.all() context['labels'] = labels if request.method == 'POST': form = TrainingForm(labels, request.POST) # check whether it's valid: if form.is_valid(): # depending on whether the classify button # (classifySubmit) or the train button (trainSubmit) was # pressed diffent actions are performed if 'classifySubmit' in form.data: document = Document(document=form.data.get('trainDocument'), doc_id='doc to be classified', preprocessed=' '.join(clf.preprocessing( form.data.get('trainDocument'))), trainInstance=True) # only if there is a document and it contains words it # will be classified scores = clf.predict(document, saveScores=False) if scores: context['scores'] = scores proposals = clf.predict_label(document, scores=scores) if proposals: context['proposals'] = map(lambda l: l.pk, [proposals]) else: raise ValidationError( 'There is no predictive model yet. Please train at least one document before classifing', code='invalid') context['classifiedDocument'] = document.document else: document = Document(document=form.data.get('trainDocument'), doc_id=str(datetime.datetime.now()), preprocessed=' '.join(clf.preprocessing( form.data.get('trainDocument'))), trainInstance=True) document.save() annotation = Annotation(document=document, user=request.user, duration=-1) annotation.save() [annotation.labels.add(label) for label in form.cleaned_data['labels']] clf.online_train(document, form.cleaned_data['labels']) return render(request, 'annotation/training.html', context) else: raise Http404("Ups something went wrong.") else: return render(request, 'annotation/training.html', context)
def handle(self, *args, **options):
    """Import a TSV training file and online-train the classifier.

    Each line of the file is expected to hold a doc id, the document
    text and a trailing numeric label, separated by tabs; label ``'1'``
    maps to the ``Pos`` Label, everything else to ``Neg``. Every parsed
    document is saved and then used for one online training step.

    Options:
        filename: path of the TSV file to import.
        maxTokenCount: 'all' to keep every token of a document text,
            otherwise an integer limit applied per document.
    """
    with open(options['filename']) as trainfile:
        train_content = trainfile.read()
    # labels are the trailing "\t<digits>\n" fields of every line
    raw_labels = ''.join(re.findall(
        r'\t[\d\-]+?\n', train_content)).replace('\t', '').splitlines()
    relevant = Label.objects.filter(label='Pos').first()
    irrelevant = Label.objects.filter(label='Neg').first()
    labels = [
        relevant if label == '1' else irrelevant for label in raw_labels
    ]
    doc_ids = ''.join(re.findall(r'\n.+?\t',
                                 train_content)).replace('\t', '').splitlines()
    document_texts = re.findall(r'\t.+?\t', train_content)
    if options['maxTokenCount'] == 'all':
        # BUG FIX: the original read ``len(fullTokens)`` here, but
        # ``fullTokens`` is undefined (NameError). Slicing with ``None``
        # keeps every token, which is what 'all' means.
        maxTokenCount = None
    else:
        maxTokenCount = int(options['maxTokenCount'])
    document_texts = map(
        # the decode/encode round-trip validates the text as UTF-8
        lambda d: ' '.join(
            d.decode('utf-8').encode('utf-8').split(' ')[:maxTokenCount]),
        document_texts)
    documents = [
        Document(document=document_texts[idx],
                 doc_id=doc_ids[idx],
                 preprocessed=' '.join(
                     clf.preprocessing(document_texts[idx])),
                 trainInstance=True) for idx in range(len(document_texts))
    ]
    for document in documents:
        document.save()
    for idx in range(len(documents)):
        clf.online_train(documents[idx], [labels[idx]])
def index(request):
    """Django view driving the annotation loop.

    On POST, stores the annotation submitted for the previously shown
    document, online-trains the classifier with it, deletes the served
    QueueElement, then selects and renders the next document. On GET,
    just selects and renders the next document to annotate.
    """
    context = {}  # a dict with content used in the template
    labels = Label.objects.all()
    context['labels'] = labels
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = AnnotationForm(labels, request.POST)
        # check whether it's valid:
        if form.is_valid():
            # create the new annotation for the document shown previously
            old_pk = int(form.data.get('old_pk'))
            old_doc = Document.objects.get(pk=old_pk)
            annotation = Annotation(document=old_doc,
                                    user=request.user,
                                    duration=form.data.get('duration'),
                                    proposalFlag=form.data.get('oldProposalFlag'))
            annotation.save()  # save the annotation to the DB
            annotation.labels.add(*form.cleaned_data['labels'])
            #logging.info('form.data.get(oldProposals) request: ' + str(form.data.get('oldProposals')))
            # record which label proposals were shown; -1 marks "none"
            oldProposals = form.data.get('oldProposals')
            if oldProposals:
                oldProposals = map(int, re.findall(r'\d+',
                                                   form.data.get('oldProposals')))
            else:
                oldProposals = [-1]
            annotation.proposals.add(*oldProposals)
            clf.online_train(old_doc, form.cleaned_data['labels'])
            # After we trained the newly annotated document, the
            # corresponding QueueElement can be deleted.
            oQE = form.data.get('oldQueueElement')
            oldQueueElement = QueueElement.objects.filter(pk=oQE).first()
            if oldQueueElement:
                oldQueueElement.delete()
            # select the next document to annotate and its proposals
            document, proposalFlag, queueElement = sel.selectDocument(request.user)
            proposals = selectProposal(document, proposalFlag,
                                       onlineProposal=True)
            context['proposals'] = proposals
            context['document'] = document
            context['sessionStart'] = form.data['sessionStart']
            if queueElement:
                context['oldQueueElement'] = queueElement
                context['oldProposalFlag'] = queueElement.proposalFlag
            form = AnnotationForm(labels)
            context['form'] = form
            #queries = connection.queries
            #print 'len: {}, connection: {}'.format(str(len(queries)), str(filter(lambda q: True if q['time'] != '0.000' else False, queries)))
            return render(request, 'annotation/index.html', context)
    else:
        # plain GET: select and render the next document to annotate
        document, proposalFlag, queueElement = sel.selectDocument(request.user)
        context['proposals'] = selectProposal(document, proposalFlag,
                                              onlineProposal=True)
        context['document'] = document
        if queueElement:
            context['oldQueueElement'] = queueElement
            context['oldProposalFlag'] = queueElement.proposalFlag
        form = AnnotationForm(labels)
        context['form'] = form
        return render(request, 'annotation/index.html', context)
# Incremental-training evaluation script: trains the classifier on growing
# fractions of the training set and records a confusion matrix and accuracy
# after every step.
(trainDocuments, trainLabels) = readTSV(trainFile)
(testDocuments, testLabels) = readTSV(testFile)
# fractions of the training set consumed per step (sums to 1.0)
split = [0.1, 0.1, 0.2, 0.3, 0.3]
N = len(trainDocuments)
# cumulative step boundaries, e.g. [0, N*0.1, N*0.2, ...]; the final
# boundary is forced to N so rounding never drops the last documents
stepCount = reduce(lambda seq, step: seq + [int(round(N * step)) + seq[-1]],
                   split, [0])[:-1] + [N]
rlt = []   # per-step confusion matrices
rlta = []  # per-step accuracy scores
# create the steps
for (fromStep, toStep) in zip(stepCount[:-1], stepCount[1:]):
    # step over and train
    for idx in range(fromStep, toStep):
        print str(idx) + ' ' + str((fromStep, toStep))
        clf.online_train(trainDocuments[idx], [trainLabels[idx]])
        trainDocuments[idx].save()
    # make predictions and convert to binary vector
    # preds = [1 if clf.predict_label(testDocuments[idx]) == Label(label='relevant') else 0
    # for idx in range(len(testDocuments))]
    # NOTE(review): clf.predict output is used directly below; the
    # commented-out variant suggests a binary conversion was intended —
    # confirm preds is binary before trusting confusion_matrix/accuracy.
    preds = [clf.predict(testDocuments[idx])
             for idx in range(len(testDocuments))]
    pprint(preds)
    # convert true labels to binary vector and add results
    # to list.
    true = map(lambda p: 1 if p == Label(label='relevant') else 0, testLabels)
    rlt = rlt + [confusion_matrix(true, preds)]
    rlta = rlta + [accuracy_score(true, preds)]
print rlt
print rlta