Пример #1
0
 def _to_hit_group_class(results):
     """Lazily turn classifier results into HitGroupClass instances.

     Each item of ``results`` is indexed with the ``"document"`` and
     ``"probabilities"`` keys, and the document is indexed with
     ``"group_id"``.  One HitGroupClass is yielded per item; nothing is
     saved here.
     """
     for entry in results:
         # Both lookups happen up front, in the same order as before, so a
         # malformed entry fails before any object is constructed.
         document, probabilities = entry["document"], entry["probabilities"]
         yield HitGroupClass(
             group_id=document["group_id"],
             classes=NaiveBayesClassifier.most_likely(entry),
             probabilities=json.dumps(probabilities),
         )
Пример #2
0
 def handle(self, *args, **options):
     """Classify hit groups that have no class yet, or drop all classes.

     Keys read from ``options``:
         remove: when truthy, delete every HitGroupClass row and return.
         input_path: path of the JSON file holding the classifier
             probabilities.
         single, group_id: when both are truthy, classify only the
             HitGroupContent with that group id, print the result and
             return (debug aid).

     Otherwise hit groups without a HitGroupClass row are fetched in
     batches of ``self.BATCH_SIZE``, classified and bulk-inserted.  The
     loop stops when a batch raises EmptyBatchException.
     """
     def _to_hit_group_class(results):
         # Lazily wrap each classification result in an unsaved model
         # instance; probabilities are stored as JSON text.
         for result in results:
             doc = result["document"]
             prob = result["probabilities"]
             yield HitGroupClass(group_id=doc["group_id"],
                                 classes=NaiveBayesClassifier.most_likely(result),
                                 probabilities=json.dumps(prob))

     if options["remove"]:
         logger.info("Removing existing classification")
         HitGroupClass.objects.all().delete()
         logger.info("Classification removed")
         return

     # Only the JSON load needs the file: close it before the (potentially
     # long-running) batch loop instead of holding it open throughout.
     with open(options["input_path"], "r") as classifier_file:
         probabilities = json.load(classifier_file)
     classifier = NaiveBayesClassifier(probabilities=probabilities)

     # TODO remove this -- for debug purposes only
     if options["single"] and options["group_id"]:
         model = HitGroupContent.objects.get(group_id=options['group_id'])
         # Parenthesised form prints the same value under Python 2 and
         # also parses under Python 3.
         print(classifier.classify(model))
         return

     logger.info("Classification of hit groups started. Processing in "
                 "batches size of {}".format(self.BATCH_SIZE))
     while True:
         # Each pass selects hit groups that still have no class row, so
         # rows inserted by the previous pass drop out of the result.
         models = query_to_dicts(
             """ SELECT group_id, title, description, keywords
                 FROM main_hitgroupcontent as content
                 WHERE NOT EXISTS(
                     SELECT * FROM main_hitgroupclass as class
                     WHERE content.group_id = class.group_id
                 ) LIMIT {};
             """.format(self.BATCH_SIZE))
         logger.info("Batch classification stated")
         try:
             # Both calls stay inside the try: the results are produced
             # lazily, so the exception may surface during bulk_create.
             results = _to_hit_group_class(classifier.classify_batch(models))
             HitGroupClass.objects.bulk_create(results)
         except EmptyBatchException:
             logger.info("Batch is empty no hit groups to classify")
             break
         logger.info("Batch classified successfully")
Пример #3
0
def classification(request):
    """Render the distribution of hit groups over classes.

    Reads one row per distinct ``classes`` value from
    ``main_hitgroupclass`` together with its count, then adds to each row:

    * ``name`` -- the label from ``NaiveBayesClassifier.label``;
    * ``part`` -- the row's share of the summed counts, in percent.

    The rows are passed to ``main/classification.html`` as ``data``.
    """
    rows = list(query_to_dicts(
        """ SELECT classes, COUNT(classes) number
            FROM main_hitgroupclass
            GROUP BY classes;
        """))
    # Use the builtin instead of shadowing it with a hand-rolled accumulator.
    total = sum(row["number"] for row in rows)

    for row in rows:
        row["name"] = NaiveBayesClassifier.label(row["classes"])
        # COUNT(classes) is 0 for a NULL-class group, so the total can be 0
        # even when rows exist; report a 0% share instead of dividing by zero.
        row["part"] = 100 * float(row["number"]) / total if total else 0.0

    params = {"data": rows}
    return direct_to_template(request, 'main/classification.html',
                              params)
Пример #4
0
def hit_group_details(request, hit_group_id):
    """Render the details page of a single hit group.

    If no HitGroupContent has ``hit_group_id``, or its requester has a
    RequesterProfile with ``is_public=False``, an info message is queued and
    the request is redirected to ``haystack_search``.

    The class label comes from the stored HitGroupClass.  When none is
    stored, the hit group is classified on the fly with the classifier file
    at ``settings.CLASSIFIER_PATH`` and the result is saved.  If that step
    raises IOError, the page is still rendered, using the label that
    ``NaiveBayesClassifier.label()`` returns without arguments.

    The template receives the ``hits_available`` time series of the group,
    ordered by ``start_time``.
    """
    try:
        hit_group = HitGroupContent.objects.get(group_id=hit_group_id)
        # Hit groups of requesters with a non-public profile are reported
        # as missing.  exists() avoids loading the matching rows just to
        # test whether there are any.
        if RequesterProfile.objects.filter(
                requester_id=hit_group.requester_id,
                is_public=False).exists():
            raise HitGroupContent.DoesNotExist()
    except HitGroupContent.DoesNotExist:
        messages.info(request, 'Hitgroup with id "{0}" was not found!'.format(
            hit_group_id))
        return redirect('haystack_search')

    try:
        hit_group_class = HitGroupClass.objects.get(group_id=hit_group_id)
    except ObjectDoesNotExist:
        # TODO classification should be done on all models.
        hit_group_class = None
        try:
            # The file is only needed for the JSON load; close it right away.
            with open(settings.CLASSIFIER_PATH, "r") as classifier_file:
                probabilities = json.load(classifier_file)
            classifier = NaiveBayesClassifier(probabilities=probabilities)
            classified = classifier.classify(hit_group)
            most_likely = classifier.most_likely(classified)
            document = classified["document"]
            hit_group_class = HitGroupClass(
                    group_id=document.group_id,
                    classes=most_likely,
                    # Store JSON text, matching how the batch classification
                    # command fills this field, so both paths write the
                    # same format.
                    probabilities=json.dumps(classified["probabilities"]))
            hit_group_class.save()
        except IOError:
            # We do not want make hit group details page unavailable when
            # classifier file does not exist.
            pass

    if hit_group_class is not None:
        hit_group_class_label = NaiveBayesClassifier.label(hit_group_class.classes)
    else:
        hit_group_class_label = NaiveBayesClassifier.label()

    params = {
        'multichart': False,
        'columns': HIT_DETAILS_COLUMNS,
        'title': '#Hits',
        'class': hit_group_class_label,
    }

    def hit_group_details_data_formater(rows):
        # Lazily reshape each DB row into the {'date', 'row'} form put into
        # params['data'].
        for cc in rows:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits_available']),),
            }

    # NOTE(review): the id is interpolated into the SQL text.  It matched an
    # existing HitGroupContent.group_id above, but a parameterized query
    # would be safer -- confirm whether query_to_dicts accepts query
    # arguments and switch to them if it does.
    dicts = query_to_dicts(
                """ select start_time, hits_available from hits_mv
                    where group_id = '{}' order by start_time asc """
                .format(hit_group_id))
    data = hit_group_details_data_formater(dicts)
    params['date_from'] = hit_group.occurrence_date
    params['date_to'] = datetime.datetime.utcnow()
    params['data'] = data
    params['hit_group'] = hit_group
    return direct_to_template(request, 'main/hit_group_details.html', params)