def add_metric(dataset, analysis, force_import=False, *args, **kwargs): analysis = Analysis.objects.get(dataset__name=dataset, name=analysis) for attr in analysis.dataset.attribute_set.all(): for val in attr.value_set.all(): metric_name = 'Document Entropy for %s: %s' % (attr.name, val.value) print metric_name try: metric = TopicMetric.objects.get(name=metric_name, analysis=analysis) if not force_import: raise RuntimeError( '%s is already in the database for this ' 'analysis!' % metric_name) except TopicMetric.DoesNotExist: metric = TopicMetric(name=metric_name, analysis=analysis) metric.save() topics = analysis.topics.all() docs = [ d.id for d in analysis.dataset.documents.filter( attributevaluedocument__attribute=attr, attributevaluedocument__value=val) ] for topic in topics: ent = 0 for dt in topic.documenttopic_set.filter( document__id__in=docs): prob = dt.count / topic.total_count ent -= prob * (log(prob) / log(2)) tmv = TopicMetricValue(topic=topic, metric=metric, value=ent) tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs): analysis = Analysis.objects.get(dataset__name=dataset, name=analysis) try: metric = TopicMetric.objects.get(name='Percent Tokens Positive ' 'Sentiment', analysis=analysis) if not force_import: raise RuntimeError('Sentiment is already in the database ' 'for this analysis!') except TopicMetric.DoesNotExist: metric = TopicMetric(name='Percent Tokens Positive Sentiment', analysis=analysis) metric.save() # call stuff to classify documents and get sentiment information, as in # parse_dependencies.py data_root = analysis.dataset.dataset_dir topics = analysis.topics.all() for topic in topics: positive = 0; negative = 0; for docTopic in topic.documenttopic_set.all(): filename = data_root + '/' + docTopic.document.filename print topic, filename sentiment = float(sentiment_document(filename)) print 'sentiment returned:', sentiment if sentiment == 1 : positive += docTopic.count print '%d/%d' % (positive, topic.total_count) # compute aggregate information for topic topicSentiment = float(positive)/float(topic.total_count) tmv = TopicMetricValue(topic=topic, metric=metric, value=topicSentiment) tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a PMI-based 'Coherence' topic metric.

    Reads total word / co-occurrence counts from the sqlite3 database named
    by kwargs['counts'], then scores each topic by the average pairwise
    mutual information (via compute_pmi) over all ordered pairs of its top
    ten non-ngram words.

    dataset, analysis -- names (strings) used to look up the Analysis.
    Raises RuntimeError if the metric already exists for this analysis and
    force_import is not set; KeyError if kwargs lacks 'counts'.
    """
    # BUGFIX: metric_name was referenced but never assigned, so the original
    # raised a NameError on its first use.  'Coherence' names the PMI-based
    # coherence score this function computes -- confirm against any existing
    # rows in the TopicMetric table before re-importing.
    metric_name = 'Coherence'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                    'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    conn = sqlite3.connect(kwargs['counts'])
    c = conn.cursor()
    # total_counts holds single-row totals used to normalize the PMI terms.
    c.execute("select words from total_counts")
    for row in c:
        total_words = float(row[0])
    c.execute("select cooccurrences from total_counts")
    for row in c:
        total_cooccurrences = float(row[0])
    topics = analysis.topics.all()
    for topic in topics:
        topicwords = topic.topicword_set.filter(
                word__ngram=False).order_by('-count')
        # We just grab the first ten words - there's probably a better way to
        # do this
        words = [tw.word.type for tw in topicwords[:10]]
        total_pmi = 0
        for w1 in words:
            for w2 in words:
                if w1 == w2:
                    continue
                total_pmi += compute_pmi(w1, w2, c, total_words,
                        total_cooccurrences)
        # NOTE(review): the denominator includes the skipped diagonal
        # (w1 == w2) pairs; len(words)*(len(words)-1) may have been intended.
        # Preserved as-is so existing scores stay comparable.
        average_pmi = total_pmi / (len(words)**2)
        tmv = TopicMetricValue(topic=topic, metric=metric, value=average_pmi)
        tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a 'Value Entropy' topic metric for every dataset attribute.

    For each attribute, computes per topic the Shannon entropy (in bits) of
    the topic's token counts across that attribute's values, and stores it
    as a TopicMetricValue.

    dataset, analysis -- names (strings) used to look up the Analysis.
    Raises RuntimeError if the metric already exists for this analysis and
    force_import is not set.
    """
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    for attr in analysis.dataset.attribute_set.all():
        name = 'Value Entropy for Attribute %s' % attr.name
        try:
            metric = TopicMetric.objects.get(name=name, analysis=analysis)
            if not force_import:
                raise RuntimeError('%s is already in the database for this '
                        'analysis!' % name)
        except TopicMetric.DoesNotExist:
            metric = TopicMetric(name=name, analysis=analysis)
            metric.save()
        topics = analysis.topics.all()
        for topic in topics:
            entropy = 0
            for avt in topic.attributevaluetopic_set.filter(attribute=attr):
                # BUGFIX: cast to float.  avt.count and total_count are
                # integer counts, so under Python 2 the plain '/' truncated
                # prob to 0, breaking the log() below.  Matches the float()
                # convention used by the sentiment metric in this file.
                prob = float(avt.count) / topic.total_count
                # log(prob)/log(2): entropy measured in bits.
                entropy -= prob * (log(prob) / log(2))
            tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
            tmv.save()