Example #1
def _update_total_counts(self):
    print 'Update total counts:'
    # reset all aggregate counts before recomputing
    self.StatsModel.all().update(count=0)
    total_counts = defaultdict(lambda: 0)
    print 'Collecting new counts...'
    # sum per-article counts into a single total per ngram
    for ngram, count in print_progress(self.Model.values_list('ngram', 'count'), 10):
        total_counts[ngram] += count
    print 'Updating total counts...'
    for ngram, count in print_progress(total_counts.items(), 5):
        self.StatsModel.filter(ngram=ngram).update(count=count)
        self.Model.filter(ngram=ngram).update(total_count=count)
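All the examples on this page iterate through `axel.libs.utils.print_progress`, whose source is not shown here. A minimal sketch of a compatible helper, assuming it wraps an iterable and reports progress roughly every `step` percent (the real implementation may differ):

def print_progress(iterable, step=10):
    """Yield items from ``iterable``, printing progress every ``step`` percent."""
    items = list(iterable)
    total = len(items) or 1
    next_report = 0
    for i, item in enumerate(items, 1):
        percent = i * 100 / total
        if percent >= next_report:
            print '{0}% ({1}/{2})'.format(percent, i, total)
            next_report += step
        yield item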
Example #2
        def global_collocations_rejoin():
            print 'Global re-population...'
            # store known collocations that are not in the DB yet
            for article in cls.objects.filter(cluster_id=cluster_id):
                index = json.loads(article.index)
                for colloc in all_collocs.intersection(index.keys()):
                    # get_or_create because stale entries are not filtered out first
                    TestCollocations.objects.get_or_create(ngram=colloc,
                                                           article=article,
                                                           defaults={'count': index[colloc]})
                # counts may now be inconsistent; they are recomputed below
            print 'Starting updates...'
            from axel.libs.utils import print_progress
            from axel.libs import nlp
            from axel.libs.nlp import _update_ngram_counts

            for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
                ngrams = sorted(article.testcollocations_set.values_list('ngram', 'count'),
                                key=lambda x: (x[1], x[0]))
                if not ngrams:
                    continue
                index = json.loads(article.index)
                new_ngrams = nlp._generate_possible_ngrams([tuple(c.split()) for c in zip(*ngrams)[0]],
                                                           index)
                new_ngrams = _update_ngram_counts(new_ngrams, index)
                new_ngrams = sorted(new_ngrams.items(), key=lambda x: (x[1], x[0]))
                new_ngrams = [k for k in new_ngrams if k[1] > 0]
                if new_ngrams != ngrams:
                    obsolete_ngrams = set(ngrams).difference(new_ngrams)
                    article.testcollocations_set.filter(ngram__in=zip(*obsolete_ngrams)[0]) \
                        .delete()
                    for ngram, score in set(new_ngrams).difference(ngrams):
                        TestCollocations.objects.create(ngram=ngram, count=score, article=article)
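The diffing at the end works because `ngrams` and `new_ngrams` are sorted by the same `(count, ngram)` key, so a plain inequality check detects any change, and the two set differences yield exactly the rows to delete and to create. In isolation:

old = [('a b', 1), ('c d', 2)]
new = [('a b', 1), ('e f', 3)]
if new != old:
    print set(old).difference(new)  # stale rows to delete: set([('c d', 2)])
    print set(new).difference(old)  # fresh rows to create: set([('e f', 3)])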
Example #3
from collections import defaultdict

from axel.libs.utils import print_progress


def populate_article_dict_ML(model, cutoff=1):
    """
    Collect judged ngrams per article, with participation counts.

    :type model: Model
    """
    article_dict = defaultdict(dict)
    article_rel_dict = defaultdict(dict)
    for key, is_rel in model.judged_data.iteritems():
        ngram, article_id = key.split(',')
        is_rel = int(is_rel)
        article_rel_dict[article_id][ngram] = is_rel

    for article in print_progress(Article.objects.filter(cluster_id=model.CLUSTER_ID)):
        text = article.stemmed_text
        # all extracted ngrams of this article, used for participation counts
        all_ngrams = list(model.objects.filter(article=article).values_list('ngram', flat=True))
        for ngram in sorted(model.objects.filter(article=article),
                            key=lambda x: len(x.ngram.split())):
            # count longer ngrams that contain this one as a substring
            part_count = 0
            for p_ngram in all_ngrams:
                if p_ngram != ngram.ngram and ngram.ngram in p_ngram:
                    part_count += 1
            try:
                is_rel = article_rel_dict[unicode(article)][ngram.ngram]
            except KeyError:
                continue
            ngram_abs_count = text.count(ngram.ngram)
            if ngram_abs_count <= cutoff:
                continue
            collection_ngram = model.COLLECTION_MODEL.objects.get(ngram=ngram.ngram)
            article_dict[article][ngram.ngram] = {'is_rel': is_rel, 'ngram': ngram,
                                                  'collection_ngram': collection_ngram,
                                                  'participation_count': part_count}

    return article_dict
Example #4
    def get_scores(cls, queryset):
        """
        :param queryset: QuerySet of ngrams with relevance judgements
        :returns: cumulative precision-at-rank points for each ranking method
        :rtype: defaultdict
        """

        relevant_names = set(queryset.filter(tags__is_relevant=True).values_list('ngram',
                                                                                 flat=True))
        irrelevant_names = set(queryset.filter(tags__is_relevant=False).values_list('ngram',
                                                                                    flat=True))

        unjudged = defaultdict(lambda: 0)
        orderings = defaultdict(lambda: {'relevant': defaultdict(lambda: 0),
                                         'irrelevant': defaultdict(lambda: 0)})

        print 'Starting article processing...'
        df_dict = dict(queryset.values_list('ngram', '_df_score'))
        total_docs = Article.objects.filter(cluster_id=queryset.model.CLUSTER_ID).count()
        for article in print_progress(Article.objects.filter(cluster_id=queryset.model.CLUSTER_ID)):
            index = json.loads(article.index)
            # add TF-IDF score
            ngrams = article.articlecollocation_set.values_list('ngram', 'count')
            tfidf_ordering = [(ngram, score * math.log(total_docs / float(df_dict[ngram])))
                              for ngram, score in ngrams if ngram in df_dict]
            tfidf_ordering.sort(key=lambda x: x[1], reverse=True)

            cur_orderings = list(cls.collocations(article.stemmed_text, index, MEASURES))
            cur_orderings.append(('tf-idf', zip(*tfidf_ordering)[0]))

            for order_name, ordering in cur_orderings:
                for i, ngram in enumerate(ordering):
                    if ngram in relevant_names:
                        orderings[order_name]['relevant'][i] += 1
                    elif ngram in irrelevant_names:
                        orderings[order_name]['irrelevant'][i] += 1
                    else:
                        # ngram was never judged; counted but not used in the output
                        unjudged[i] += 1

        print 'End article processing...'
        print 'Starting result formatting...'

        graph_results = defaultdict(list)
        for order_name, results in orderings.iteritems():
            total_relevant = 0
            total_irrelevant = 0
            # walk the judged ranks in order, accumulating cumulative precision
            for rank in sorted(set(results['relevant']).union(results['irrelevant'])):
                total_relevant += results['relevant'][rank]
                total_irrelevant += results['irrelevant'][rank]
                graph_results[order_name].append((rank,
                    round(float(total_relevant) / (total_irrelevant + total_relevant), 3)))

        return graph_results
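For intuition, the cumulative precision computation in isolation (a toy example, independent of the models above):

relevant_at = {0: 2, 2: 1}    # rank -> number of relevant judgements at that rank
irrelevant_at = {1: 1}

total_rel = total_irrel = 0
for rank in sorted(set(relevant_at).union(irrelevant_at)):
    total_rel += relevant_at.get(rank, 0)
    total_irrel += irrelevant_at.get(rank, 0)
    print rank, round(float(total_rel) / (total_rel + total_irrel), 3)
# prints: 0 1.0, then 1 0.667, then 2 0.75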
Example #5
    def _punct_calculation(self):
        print "Calculating contingency tables for punctuation before/after ngrams"
        # cells: [valid & punct, invalid & punct, valid & no punct, invalid & no punct]
        pos_tag_prev = [0, 0, 0, 0]
        pos_tag_after = [0, 0, 0, 0]
        punct = {".", ",", ":", ";"}

        for article in print_progress(Article.objects.filter(cluster_id=self.cluster_id)):
            for ngram in self.Model.objects.filter(article=article):
                if ngram.ngram in self.article_rel_dict[unicode(article)][1]:
                    # relevant (valid) ngram
                    if punct.intersection(zip(*ngram.pos_tag_prev)[0]):
                        pos_tag_prev[0] += 1
                    else:
                        pos_tag_prev[2] += 1

                    if punct.intersection(zip(*ngram.pos_tag_after)[0]):
                        pos_tag_after[0] += 1
                    else:
                        pos_tag_after[2] += 1
                elif ngram.ngram in self.article_rel_dict[unicode(article)][0]:
                    # irrelevant (invalid) ngram
                    if punct.intersection(zip(*ngram.pos_tag_prev)[0]):
                        pos_tag_prev[1] += 1
                    else:
                        pos_tag_prev[3] += 1

                    if punct.intersection(zip(*ngram.pos_tag_after)[0]):
                        pos_tag_after[1] += 1
                    else:
                        pos_tag_after[3] += 1

        print "Contigency table BEFORE:"
        print "       | Valid | Invalid | Total |"
        print "+punct | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_prev[0], pos_tag_prev[1], pos_tag_prev[0] + pos_tag_prev[1]
        )
        print "-punct | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_prev[2], pos_tag_prev[3], pos_tag_prev[2] + pos_tag_prev[3]
        )
        print "Totals | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_prev[0] + pos_tag_prev[2], pos_tag_prev[1] + pos_tag_prev[3], sum(pos_tag_prev)
        )
        print
        print "Contigency table AFTER:"
        print "       | Valid | Invalid | Total |"
        print "+punct | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_after[0], pos_tag_after[1], pos_tag_after[0] + pos_tag_after[1]
        )
        print "-punct | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_after[2], pos_tag_after[3], pos_tag_after[2] + pos_tag_after[3]
        )
        print "Totals | {0:>5} | {1:>7} | {2:>5} |".format(
            pos_tag_after[0] + pos_tag_after[2], pos_tag_after[1] + pos_tag_after[3], sum(pos_tag_after)
        )
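A natural follow-up to such a 2x2 contingency table is an independence test. This is not part of the original code; assuming SciPy is available, it could look like:

from scipy.stats import chi2_contingency

table = [[pos_tag_prev[0], pos_tag_prev[1]],  # +punct: valid, invalid
         [pos_tag_prev[2], pos_tag_prev[3]]]  # -punct: valid, invalid
chi2, p_value, dof, expected = chi2_contingency(table)
print 'chi2={0:.2f}, p={1:.4f}'.format(chi2, p_value)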
Example #6
from collections import defaultdict

import nltk

from axel.libs.utils import print_progress


def populate_article_dict(model, score_func, cutoff=1):
    """
    Collect judged ngrams per article, scored with ``score_func``.

    :type model: Model
    """
    article_dict = defaultdict(dict)
    article_rel_dict = defaultdict(dict)
    for key, is_rel in model.judged_data.iteritems():
        ngram, article_id = key.split(',')
        is_rel = int(is_rel)
        article_rel_dict[article_id][ngram] = is_rel

    for article in print_progress(Article.objects.filter(cluster_id=model.CLUSTER_ID)):
        text = article.stemmed_text
        # correspondence dicts: for every bigram "w1 w2", map each word to its neighbours
        corr_dict1 = defaultdict(set)
        corr_dict2 = defaultdict(set)
        all_ngrams = list(model.objects.filter(article=article).values_list('ngram', flat=True))
        for ngram in all_ngrams:
            if len(ngram.split()) == 2:
                w1, w2 = ngram.split()
                corr_dict1[w2].add(w1)
                corr_dict2[w1].add(w2)
        for ngram in sorted(model.objects.filter(article=article),
                            key=lambda x: len(x.ngram.split())):
            # count longer ngrams that contain this one as a substring
            part_count = 0
            for p_ngram in all_ngrams:
                if p_ngram != ngram.ngram and ngram.ngram in p_ngram:
                    part_count += 1
            try:
                is_rel = article_rel_dict[unicode(article)][ngram.ngram]
            except KeyError:
                continue
            ngram_abs_count = text.count(ngram.ngram)
            if ngram_abs_count <= cutoff:
                continue
            collection_ngram = model.COLLECTION_MODEL.objects.get(ngram=ngram.ngram)
            score, ddict1, ddict2 = score_func(collection_ngram, ngram, text, article_dict[article],
                                               ngram_abs_count, corr_dict1, corr_dict2)
            # how many of this ngram's constituent bigrams were themselves extracted
            nl_ngrams = [' '.join(n) for n in nltk.ngrams(ngram.ngram.split(), 2)]
            support_len = len(set(all_ngrams).intersection(nl_ngrams))
            article_dict[article][ngram.ngram] = {'abs_count': ngram_abs_count, 'score': score,
                                                  'is_rel': is_rel, 'count': ngram.count,
                                                  'ddict1': ddict1, 'ddict2': ddict2,
                                                  'collection_ngram': collection_ngram,
                                                  'ngram': ngram,
                                                  'len': support_len,
                                                  'participation_count': part_count}

    return article_dict
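From the call site, `score_func` receives the collection ngram, the article ngram, the text, the scores collected so far, the absolute count, and the two correspondence dicts, and must return a `(score, ddict1, ddict2)` triple. A trivial scorer matching that interface (hypothetical, for illustration only):

def unit_score(collection_ngram, ngram, text, article_scores, abs_count,
               corr_dict1, corr_dict2):
    """Dummy scorer: the absolute count is the score, no debug dicts."""
    return abs_count, {}, {}

# article_dict = populate_article_dict(SomeNgramModel, unit_score, cutoff=1)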
Example #7
def populate_wiki_index(cls, cluster_id):
    import networkx as nx
    from axel.stats.models import STATS_CLUSTERS_DICT
    stats_model = STATS_CLUSTERS_DICT[cluster_id]  # loop-invariant lookup
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        text = ''
        dbpedia_graph = article.dbpedia_graph(redirects=True)
        # networkx 1.x returns components as lists, ordered descending by size
        max_comp = nx.connected_components(dbpedia_graph)[0]
        nodes = [node for node in max_comp if 'Category' not in node]
        ngrams = stats_model.objects.filter(ngram__in=nodes)
        for ngram in ngrams:
            text += ngram.wikipedia_text + '\n'
        article.wiki_text_index = nlp.build_ngram_index(nlp.Stemmer.stem_wordnet(text))
        article.save()
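Note that indexing `nx.connected_components(...)[0]` relies on networkx 1.x, where the function returned a list of components sorted descending by size. Under networkx 2.x it returns a generator, so the equivalent would be:

max_comp = max(nx.connected_components(dbpedia_graph), key=len)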
Example #8
def _update_max_pos_tags(self):
    print 'Update max POS tags'
    # clear the cached values, then touch the property so each one is recomputed
    self.StatsModel.all().update(_max_pos_tag=None)
    for c in print_progress(self.StatsModel.all(), 5):
        _ = c.max_pos_tag
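Accessing `c.max_pos_tag` is evidently done for its side effect: the property recomputes the value that was just reset in `_max_pos_tag` and persists it. A minimal sketch of that pattern on a Django model (hypothetical; the real `StatsModel` and its computation helper are not shown):

from django.db import models

class StatsNgram(models.Model):  # hypothetical stand-in for StatsModel
    _max_pos_tag = models.CharField(max_length=255, null=True, blank=True)

    @property
    def max_pos_tag(self):
        # recompute and persist on first access after a reset
        if self._max_pos_tag is None:
            self._max_pos_tag = self._compute_max_pos_tag()  # hypothetical helper
            self.save()
        return self._max_pos_tag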