import pickle

from django.db import transaction


def main(*args, **kwargs):
    classifier_models = ClassifierModel.objects.all()
    # map classifier id -> unpickled classifier object
    classifiers_map = {c.id: pickle.loads(c.data) for c in classifier_models}
    # first run for ClassifiedDocuments, processed in fixed-size chunks
    chunksize = 20
    chunkcounter = 0
    while True:
        frm = chunkcounter * chunksize
        to = (chunkcounter + 1) * chunksize
        texts = ClassifiedDocument.objects.all().\
            values('id', 'text', 'classifier')[frm:to]
        if not texts:
            print("No more data")
            break
        print('RUNNING CHUNK', chunkcounter)
        for x in texts:
            clf = classifiers_map.get(x['classifier'])
            clfn = clf.classify_as_label_probs(clf.preprocess(x['text']))
            x['classification_probabilities'] = clfn
            x['confidence'] = classification_confidence(clfn)
        # write back the document classifications in one atomic transaction
        with transaction.atomic():
            for x in texts:
                probs = x['classification_probabilities']
                ClassifiedDocument.objects.filter(id=x['id']).update(
                    classification_label=probs[0][0],
                    confidence=x['confidence'],
                    classification_probabilities=probs)
        # now the excerpts belonging to each document in the chunk
        for x in texts:
            excerpts = ClassifiedExcerpt.objects.filter(
                classified_document__id=x['id']).values(
                    'id', 'start_pos', 'end_pos')
            for y in excerpts:
                print("EXC ID", y['id'])
                clf = classifiers_map.get(x['classifier'])
                clfn = clf.classify_as_label_probs(
                    clf.preprocess(x['text'][y['start_pos']:y['end_pos']]))
                y['classification_probabilities'] = clfn
                y['confidence'] = classification_confidence(clfn)
            # update the excerpts in one atomic transaction
            with transaction.atomic():
                for y in excerpts:
                    probs = y['classification_probabilities']
                    ClassifiedExcerpt.objects.filter(id=y['id']).update(
                        classification_label=probs[0][0],
                        classification_probabilities=probs,
                        confidence=y['confidence'])
        chunkcounter += 1
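# A minimal alternative sketch for the chunking above, assuming the same
# ClassifiedDocument model: Django's built-in Paginator can replace the manual
# frm/to slice arithmetic. `iter_document_chunks` is a hypothetical helper,
# not part of this module's original API.
from django.core.paginator import Paginator


def iter_document_chunks(chunksize=20):
    """Yield successive lists of ClassifiedDocument value dicts."""
    queryset = ClassifiedDocument.objects.all().values(
        'id', 'text', 'classifier')
    paginator = Paginator(queryset, chunksize)
    for page_number in paginator.page_range:
        # each page holds at most `chunksize` value dicts
        yield list(paginator.page(page_number).object_list)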
def get_confidences(classifier, test_data):
    """
    Get the confidence values for all the datasets
    @classifier: classifier object
    @test_data: test data specific to the classifier
    Returns {'correct_confidences': [float], 'incorrect_confidences': [float]}
    """
    # confidences for correct and incorrect predictions
    correct_confidences = []
    incorrect_confidences = []
    for text, label in test_data:
        classification = classifier.classify_as_label_probs(text)
        confidence = classification_confidence(classification)
        classified_label = classification[0][0]  # label with the max probability
        if classified_label == label:
            # correct prediction: record its confidence
            correct_confidences.append(confidence)
        else:
            incorrect_confidences.append(confidence)
    print("correct:", len(correct_confidences),
          "incorrect:", len(incorrect_confidences))
    return {
        'correct_confidences': correct_confidences,
        'incorrect_confidences': incorrect_confidences
    }
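# Hedged usage sketch for get_confidences: the two lists it returns can be
# turned into a review threshold. `suggest_confidence_threshold` and its
# `percentile` parameter are hypothetical, introduced here only for
# illustration.
def suggest_confidence_threshold(classifier, test_data, percentile=90):
    """Pick a confidence cut-off below which predictions go to manual review.

    Uses the given percentile of the confidences of *incorrect* predictions,
    so most wrong answers fall under the threshold.
    """
    confidences = get_confidences(classifier, test_data)
    incorrect = sorted(confidences['incorrect_confidences'])
    if not incorrect:
        return 0.0  # no incorrect predictions observed; nothing to reject
    # index of the requested percentile within the sorted list
    idx = min(len(incorrect) - 1, int(len(incorrect) * percentile / 100))
    return incorrect[idx]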
def update_classified_documents_with_classifier(classifier_model):
    """
    Update the existing classified documents and excerpts with the classifier
    """
    classifier = pickle.loads(classifier_model.data)
    print("Updating classified docs ...")
    for doc in ClassifiedDocument.objects.all():
        classified = classify_text(classifier, doc.text)
        doc.classifier = classifier_model
        doc.confidence = classification_confidence(classified)
        doc.classification_label = classified[0][0]
        doc.classification_probabilities = classified
        doc.save()
    print("Updated classified docs.")
    print("Updating classified excerpts ...")
    for excerpt in ClassifiedExcerpt.objects.all():
        classified = classify_text(classifier, excerpt.text)
        excerpt.classification_label = classified[0][0]
        excerpt.confidence = classification_confidence(classified)
        excerpt.classification_probabilities = classified
        excerpt.save()
    print("Updated classified excerpts.")
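# Hedged usage sketch: this updater would typically run once after training a
# new classifier. Assuming the newest ClassifierModel row is the one wanted
# (ordering by id is an assumption), the call would look like:
#
#     latest_model = ClassifierModel.objects.latest('id')
#     update_classified_documents_with_classifier(latest_model)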
def classification_confidence(self):
    return classification_confidence(self.classification_probabilities)
def post(self, request, version):
    data = dict(request.data.items())
    validation_details = self._validate_classification_params(data)
    if not validation_details['status']:
        return Response(validation_details['error_data'],
                        status=status.HTTP_400_BAD_REQUEST)
    # check if deeper and doc_id are present
    deeper = bool(data.get('deeper'))
    if deeper and data.get('doc_id'):
        # return the already classified data
        try:
            classified_doc = ClassifiedDocument.objects.get(
                id=data['doc_id'])
            return_data = ClassifiedDocumentSerializer(classified_doc).data
            return_data['excerpts_classification'] = \
                ClassifiedExcerptSerializer(
                    classified_doc.excerpts, many=True
                ).data
            return Response(return_data)
        except ClassifiedDocument.DoesNotExist:
            return Response({'error': 'Classified Document not found'},
                            status=status.HTTP_404_NOT_FOUND)
        except Exception:
            return Response({
                    'status': False,
                    'message': 'Invalid doc_id'
                },
                status=status.HTTP_400_BAD_REQUEST)
    classifier = self.classifiers.get(version)
    if not classifier:
        return Response(
            {
                'status': False,
                'message': 'Classifier not found'
            },
            status=status.HTTP_404_NOT_FOUND)
    text = data['text']
    # detect language and translate to English if necessary
    language = langdetect.detect(text)
    original = None
    try:
        if language != 'en':
            original = text
            logger.info("not english language")
            translation = self.translator.translate(text)
            translated = translation.text
            text = translated
            logger.info("Translated text: {}".format(translated))
    except Exception as e:
        logger.warning("Exception while translating text. {}".format(e))
    classified = classify_text(classifier['classifier'], text)
    if not data.get('deeper'):
        return Response({
            'classification': classified,
            'classification_confidence': classification_confidence(classified)
        })
    # create classified document
    grp_id = data.get('group_id')
    extra_info = {"language": language}
    if original:
        extra_info['original'] = original
    doc = ClassifiedDocument.objects.create(
        text=text,
        classifier=classifier['classifier_model'],
        confidence=classified[0][1],
        classification_label=classified[0][0],
        classification_probabilities=classified,
        group_id=grp_id,
        extra_info=extra_info)
    # add the doc to a cluster, but only if it is newly created
    if not data.get('doc_id'):  # doc_id is sent for an already present doc
        assign_cluster_to_doc.delay(doc.id)
    classified_excerpts = classify_lead_excerpts(
        classifier['classifier'],
        text,
    )
    # create excerpts
    excerpts = []
    for x in classified_excerpts:
        excerpts.append(
            ClassifiedExcerpt.objects.create(
                classified_document=doc,
                start_pos=x['start_pos'],
                end_pos=x['end_pos'],
                classification_label=x['classification'][0][0],
                confidence=x['classification'][0][1],
                classification_probabilities=x['classification']))
    ret = ClassifiedDocumentSerializer(doc).data
    ret['excerpts_classification'] = ClassifiedExcerptSerializer(
        excerpts, many=True).data
    return Response(ret)
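# Illustrative request/response shape for the handler above (the URL path is
# hypothetical; the payload keys come from the code):
#
#     POST /api/<version>/classify/
#     {"text": "...", "deeper": true, "group_id": "g1"}
#
# Without "deeper" the response carries only `classification` and
# `classification_confidence`; with it, the serialized ClassifiedDocument plus
# `excerpts_classification` is returned.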
def post(self, request, version):
    data = dict(request.data.items())
    validation_details = self._validate_classification_params(data)
    if not validation_details['status']:
        return Response(validation_details['error_data'],
                        status=status.HTTP_400_BAD_REQUEST)
    # check if deeper and doc_id are present
    deeper = bool(data.get('deeper'))
    if deeper and data.get('doc_id'):
        # return the already classified data
        try:
            classified_doc = ClassifiedDocument.objects.get(
                id=data['doc_id'])
            return_data = ClassifiedDocumentSerializer(classified_doc).data
            return_data['excerpts_classification'] = \
                ClassifiedExcerptSerializer(
                    classified_doc.excerpts, many=True
                ).data
            return Response(return_data)
        except ClassifiedDocument.DoesNotExist:
            return Response({'error': 'Classified Document not found'},
                            status=status.HTTP_404_NOT_FOUND)
        except Exception:
            return Response({
                    'status': False,
                    'message': 'Invalid doc_id'
                },
                status=status.HTTP_400_BAD_REQUEST)
    classifier = self.classifiers.get(version)
    if not classifier:
        return Response(
            {
                'status': False,
                'message': 'Classifier not found'
            },
            status=status.HTTP_404_NOT_FOUND)
    text = data['text']
    classified = classify_text(classifier['classifier'], text)
    if not data.get('deeper'):
        return Response({
            'classification': classified,
            'classification_confidence': classification_confidence(classified)
        })
    # create classified document
    grp_id = data.get('group_id')
    # detect language for extra_info
    language = langdetect.detect(text)
    doc = ClassifiedDocument.objects.create(
        text=text,
        classifier=classifier['classifier_model'],
        confidence=classified[0][1],
        classification_label=classified[0][0],
        classification_probabilities=classified,
        group_id=grp_id,
        extra_info={"language": language})
    classified_excerpts = classify_lead_excerpts(
        classifier['classifier'],
        text,
    )
    # create excerpts
    excerpts = []
    for x in classified_excerpts:
        excerpts.append(
            ClassifiedExcerpt.objects.create(
                classified_document=doc,
                start_pos=x['start_pos'],
                end_pos=x['end_pos'],
                classification_label=x['classification'][0][0],
                confidence=x['classification'][0][1],
                classification_probabilities=x['classification']))
    ret = ClassifiedDocumentSerializer(doc).data
    ret['excerpts_classification'] = ClassifiedExcerptSerializer(
        excerpts, many=True).data
    return Response(ret)