def training(request):
    """Render the training page; on POST either classify or train on the
    submitted document, depending on which submit button was pressed.
    """
    context = {}  # a dict with content used in the template
    labels = Label.objects.all()
    context['labels'] = labels
    if request.method == 'POST':
        form = TrainingForm(labels, request.POST)
        # check whether it's valid:
        if form.is_valid():
            # depending on whether the classify button (classifySubmit) or
            # the train button (trainSubmit) was pressed different actions
            # are performed
            if 'classifySubmit' in form.data:
                # transient document -- never saved; doc_id is a placeholder
                document = Document(
                    document=form.data.get('trainDocument'),
                    doc_id='doc to be classified',
                    preprocessed=' '.join(
                        clf.preprocessing(form.data.get('trainDocument'))),
                    trainInstance=True)
                # only if there is a document and it contains words it
                # will be classified
                scores = clf.predict(document, saveScores=False)
                if scores:
                    context['scores'] = scores
                    proposals = clf.predict_label(document, scores=scores)
                    if proposals:
                        # primary keys of the proposed labels, for the template
                        context['proposals'] = [l.pk for l in [proposals]]
                else:
                    raise ValidationError(
                        'There is no predictive model yet. Please train at '
                        'least one document before classifing',
                        code='invalid')
                context['classifiedDocument'] = document.document
            else:
                # train branch: persist the document plus an annotation,
                # then update the classifier online with the chosen labels
                document = Document(
                    document=form.data.get('trainDocument'),
                    doc_id=str(datetime.datetime.now()),
                    preprocessed=' '.join(
                        clf.preprocessing(form.data.get('trainDocument'))),
                    trainInstance=True)
                document.save()
                annotation = Annotation(document=document,
                                        user=request.user,
                                        duration=-1)
                annotation.save()
                # plain loop instead of a side-effect-only list comprehension
                for label in form.cleaned_data['labels']:
                    annotation.labels.add(label)
                clf.online_train(document, form.cleaned_data['labels'])
            return render(request, 'annotation/training.html', context)
        else:
            raise Http404("Ups something went wrong.")
    else:
        return render(request, 'annotation/training.html', context)
def processFile(self, filename, options):
    """Read a delimiter-separated file and save one Document per row.

    Row layout: row[0] = document id, row[1] = whitespace-separated text.
    options keys used: 'delimiter', 'maxTokenCount' ('all' or an int as a
    string) and 'train' (truthy -> documents are marked as train instances).
    """
    print('Add documents from ' + filename)
    with open(filename) as csvfile:
        rows = csv.reader(csvfile, delimiter=options['delimiter'])
        count = 1  # counts only non-empty rows, matching the printed index
        for row in rows:
            if not row:
                continue  # skip blank lines
            print(count)
            count += 1
            # decode/encode round trip is a no-op for valid data but raises
            # on bytes that are not UTF-8 (Python 2 validation)
            iD = row[0].decode('utf-8').encode('utf-8')
            fullTokens = row[1].decode('utf-8').encode('utf-8').split(' ')
            # 'all' keeps every token, otherwise truncate to the limit
            if options['maxTokenCount'] == 'all':
                maxTokenCount = len(fullTokens)
            else:
                maxTokenCount = int(options['maxTokenCount'])
            text = ' '.join(fullTokens[:maxTokenCount])
            trainInstance = bool(options['train'])
            document = Document(document=text,
                                doc_id=iD,
                                preprocessed=' '.join(clf.preprocessing(text)),
                                trainInstance=trainInstance)
            document.save()
def readTSV(self, filename):
    """Parse a TSV training file into saved Documents and their Labels.

    Expected line layout: <doc_id>\t<text>\t<label>, where the label token
    is '1' (Relevant) or '-1' (Irrelevant).  Returns a tuple
    (documents, labels) with each label wrapped in a one-element list.
    """
    with open(filename) as trainfile:
        train_content = trainfile.read().decode('utf-8')
    # Extract the raw label representation ('1' / '-1' tokens preceding
    # each newline) from the total text.
    raw_labels = ''.join(re.findall(r'(\t[\-1]{1,2})*?\n',
                                    train_content)).split('\t')[1:]
    # Get the Labels from the db.
    # BUG FIX: removed a leftover pdb.set_trace() debug breakpoint that
    # halted every call to this method.
    relevant = Label.objects.filter(label='Relevant').first()
    irrelevant = Label.objects.filter(label='Irrelevant').first()
    # Transform the raw label representation into the true Labels.
    labels = [relevant if label == '1' else irrelevant
              for label in raw_labels]
    # First column: document ids (text between a newline and the first tab).
    doc_ids = ''.join(re.findall(r'\n.+?\t',
                                 train_content)).replace('\t', '').splitlines()
    # Middle column: the document texts (text between two tabs).
    document_texts = re.findall(r'\t.+?\t', train_content)
    documents = [Document(document=document_texts[idx],
                          doc_id=doc_ids[idx],
                          preprocessed=' '.join(
                              clf.preprocessing(document_texts[idx])),
                          trainInstance=True)
                 for idx in range(len(document_texts))]
    for document in documents:
        document.save()
    rlt = (documents, [[l] for l in labels])
    pprint([(r[0].document[:20], r[1][0].label)
            for r in zip(rlt[0], rlt[1])])
    return rlt
def objectTransformation(self, texts, classes, labelMap):
    """Build Document objects and map class names onto Label objects.

    Returns (documents, labels) when every entry of `classes` has a key in
    `labelMap`; otherwise prints a diagnostic of both label sets and
    returns (None, None).
    """
    documents = [Document(document=texts[i],
                          doc_id=i,
                          preprocessed=' '.join(clf.preprocessing(texts[i])),
                          trainInstance=True)
                 for i in range(len(texts))]
    # proceed only if every class has a mapping
    if not set(classes) - set(labelMap.keys()):
        labels = [Label(label=labelMap[t], option='radio') for t in classes]
        return documents, labels
    else:
        # BUG FIX: the original referenced the undefined name `filename`
        # here, raising NameError instead of printing the diagnostic;
        # also fixed the 'fount' typo.
        print('Labels found in input: ' + str(set(classes)))
        print('Labels found in label_mapping: ' + str(set(labelMap.keys())))
        return None, None
def readTSV(filename):
    """Parse a TSV training file and return (documents, labels).

    Line layout: <doc_id>\t<text>\t<label>; a label token of '1' maps to
    the 'relevant' Label, anything else to 'irrelevant'.  Documents are
    built but NOT saved here.
    """
    with open(filename) as trainfile:
        content = trainfile.read().decode('utf-8')
    # Label column: '1' / '-1' tokens sitting just before each newline.
    raw_labels = ''.join(
        re.findall(r'(\t[\-1]{1,2})*?\n', content)).split('\t')[1:]
    # Resolve the two Label rows once, up front.
    relevant = Label.objects.filter(label='relevant').first()
    irrelevant = Label.objects.filter(label='irrelevant').first()
    labels = []
    for raw in raw_labels:
        labels.append(relevant if raw == '1' else irrelevant)
    # id column (newline..tab) and text column (tab..tab).
    doc_ids = ''.join(
        re.findall(r'\n.+?\t', content)).replace('\t', '').splitlines()
    texts = re.findall(r'\t.+?\t', content)
    documents = []
    for idx in range(len(texts)):
        text = texts[idx]
        documents.append(Document(
            document=text,
            doc_id=doc_ids[idx],
            preprocessed=' '.join(clf.preprocessing(text)),
            trainInstance=True))
    return (documents, labels)
def predict_labels(self, options):
    """Predict labels for the test instances in options['testfile'].

    Returns (doc_ids, texts, predicted_labels, trueLabels); trueLabels is
    empty when the file has only two columns (id, text).
    """
    # a literal tab is awkward to pass on a command line, so 't' stands in
    if options['delimiter'] == 't':
        options['delimiter'] = '\t'
    # read the test instances from the file
    with open(options['testfile']) as csvfile:
        rows = list(csv.reader(csvfile, delimiter=options['delimiter']))
    if len(rows[0]) == 2:
        doc_ids, texts = zip(*rows)
        trueLabels = []
    else:
        doc_ids, texts, trueLabels = zip(*rows)
    # convert the file content to Document objects
    # FIX: replaced the Python-2-only tuple-parameter lambda
    # (`lambda (doc_id, text): ...`, removed by PEP 3113) with a list
    # comprehension that is valid on Python 2 and 3 alike.
    documents = [Document(document=text,
                          doc_id=doc_id,
                          preprocessed=' '.join(clf.preprocessing(text)),
                          trainInstance=True)
                 for doc_id, text in zip(doc_ids, texts)]
    # predict labels
    return doc_ids, texts, map(clf.predict_label, documents), trueLabels
def handle(self, *args, **options):
    """Bulk-train the classifier from a TSV file (<id>\t<text>\t<label>).

    Label column: '1' -> the 'Pos' Label, anything else -> 'Neg'.
    options keys used: 'filename', 'maxTokenCount' ('all' or an int as a
    string).
    """
    with open(options['filename']) as trainfile:
        train_content = trainfile.read()
    # label tokens sitting just before each newline
    raw_labels = ''.join(re.findall(
        r'\t[\d\-]+?\n', train_content)).replace('\t', '').splitlines()
    relevant = Label.objects.filter(label='Pos').first()
    irrelevant = Label.objects.filter(label='Neg').first()
    labels = [
        relevant if label == '1' else irrelevant for label in raw_labels
    ]
    doc_ids = ''.join(re.findall(r'\n.+?\t',
                                 train_content)).replace('\t',
                                                         '').splitlines()
    document_texts = re.findall(r'\t.+?\t', train_content)
    # BUG FIX: the original computed len(fullTokens) here, but fullTokens
    # is undefined in this scope, so 'all' always raised NameError.
    # 'all' now means "no limit": a slice bound of None keeps every token.
    if options['maxTokenCount'] == 'all':
        maxTokenCount = None
    else:
        maxTokenCount = int(options['maxTokenCount'])
    document_texts = map(
        lambda d: ' '.join(
            d.decode('utf-8').encode('utf-8').split(' ')[:maxTokenCount]),
        document_texts)
    documents = [
        Document(document=document_texts[idx],
                 doc_id=doc_ids[idx],
                 preprocessed=' '.join(
                     clf.preprocessing(document_texts[idx])),
                 trainInstance=True)
        for idx in range(len(document_texts))
    ]
    for document in documents:
        document.save()
    # train incrementally, one (document, label) pair at a time
    for idx in range(len(documents)):
        clf.online_train(documents[idx], [labels[idx]])