def linked_score(collection_ngram, ngram, text, article_dict, ngram_abs_count,
                 corr_dict1=None, corr_dict2=None, score_func='weight_both_ngram4'):
    """
    :type collection_ngram: Collocation
    :type ngram: ArticleCollocation
    :type text: unicode
    """
    # NOTE: collection_ngram is not referenced in the body; it appears to be
    # kept for signature compatibility with other scoring functions.
    ngram = ngram.ngram
    nb = NgramBindings(ngram, text, corr_dict1=corr_dict1, corr_dict2=corr_dict2)
    if len(ngram.split()) == 2:
        score = getattr(nb, score_func)()
    else:
        smaller_ngrams = set(build_ngram_index(ngram).keys()).intersection(article_dict.keys())
        if smaller_ngrams:
            # select the maximum split combination
            if len(smaller_ngrams) == 1:
                smaller_ngram = smaller_ngrams.pop()
                values = article_dict[smaller_ngram]
                score = values['score'] * ngram_abs_count / values['abs_count']
                # reduce the score of the consumed ngram according to its score
                article_dict[smaller_ngram]['score'] = values['score'] - score
                score = (score + getattr(nb, score_func)(split_ngram=smaller_ngram)) / 2
            else:
                # multiple sub-ngrams: accumulate their transferred scores,
                # longest sub-ngrams first; no full average in this branch
                score = 0
                smaller_ngrams = sorted(smaller_ngrams, key=lambda x: len(x.split()), reverse=True)
                for smaller_ngram in smaller_ngrams:
                    values = article_dict[smaller_ngram]
                    local_score = values['score'] * ngram_abs_count / values['abs_count']
                    article_dict[smaller_ngram]['score'] = values['score'] - local_score
                    score += local_score
        else:
            score = getattr(nb, score_func)()
    return score, nb.ddict1, nb.ddict2
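# A minimal usage sketch for linked_score, assuming NgramBindings and
# build_ngram_index live in this module. The article_dict layout
# ({ngram: {'score': ..., 'abs_count': ...}}) is inferred from the lookups
# above; the ngram strings and numbers below are hypothetical.
#
# >>> colloc = article.CollocationModel.objects.get(ngram='support vector machine')
# >>> article_dict = {'support vector': {'score': 0.7, 'abs_count': 3}}
# >>> score, ddict1, ddict2 = linked_score(
# ...     None,  # collection_ngram is unused in the body
# ...     colloc, article.stemmed_text, article_dict, ngram_abs_count=2)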
def get_context_data(self, **kwargs):
    """Add nodes and links to the context"""
    context = super(NgramParticipationView, self).get_context_data(**kwargs)
    # nodes are simply ngrams
    links = []
    irrel_ngrams = set(self.queryset.filter(tags__is_relevant=False)
                                    .values_list("ngram", flat=True))
    rel_ngrams = set(self.queryset.filter(tags__is_relevant=True)
                                  .values_list("ngram", flat=True))
    all_ngrams = list(self.queryset)
    # Sort from longest to shortest; we rely on this order when computing connections
    all_ngrams.sort(key=lambda x: len(x.ngram) + len(x.ngram.split()), reverse=True)
    ngrams_set = set(self.queryset.values_list("ngram", flat=True))
    participation_dict = defaultdict(list)
    for ngram_obj in all_ngrams:
        ngram = ngram_obj.ngram
        if ngram in participation_dict:
            for ngram_1 in participation_dict[ngram]:
                links.append((ngram, ngram_1))
            # replace with the current ngram
            for ngram_i in ngrams_set.intersection(build_ngram_index(ngram).keys()):
                participation_dict[ngram_i] = [ngram]
        else:
            # append the current ngram
            for ngram_i in ngrams_set.intersection(build_ngram_index(ngram).keys()):
                participation_dict[ngram_i].append(ngram)
    # keep only connected nodes (ngrams that participate in at least one link);
    # a set comprehension also avoids indexing into zip() on an empty link list
    connected_nodes = list({node for link in links for node in link})
    node_dict = dict((node, i) for i, node in enumerate(connected_nodes))
    links = [{"source": node_dict[source], "target": node_dict[target]}
             for source, target in links]

    def _get_rel_info(ngram):
        if ngram in rel_ngrams:
            return 1
        elif ngram in irrel_ngrams:
            return -1
        return 0

    nodes = [{"name": ngram, "rel": _get_rel_info(ngram)} for ngram in connected_nodes]
    context["data"] = json.dumps({"nodes": nodes, "links": links})
    return context
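# For reference, context["data"] serializes to the node/link shape commonly
# fed to a d3.js force-directed graph (the exact consumer is an assumption):
#
#   {"nodes": [{"name": "support vector machine", "rel": 1}, ...],
#    "links": [{"source": 0, "target": 3}, ...]}
#
# where "rel" is 1 for relevant, -1 for irrelevant, 0 for untagged ngrams,
# and "source"/"target" index into the "nodes" list.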
@classmethod
def populate_wiki_index(cls, cluster_id):
    import networkx as nx
    from axel.stats.models import STATS_CLUSTERS_DICT
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        dbpedia_graph = article.dbpedia_graph(redirects=True)
        # take the largest connected component; max(..., key=len) works whether
        # networkx returns a size-sorted list (old API) or a generator (new API)
        max_comp = max(nx.connected_components(dbpedia_graph), key=len)
        nodes = [node for node in max_comp if 'Category' not in node]
        stats_model = STATS_CLUSTERS_DICT[cluster_id]
        ngrams = stats_model.objects.filter(ngram__in=nodes)
        # concatenate the Wikipedia texts of all ngrams from the component
        text = '\n'.join(ngram.wikipedia_text for ngram in ngrams)
        article.wiki_text_index = nlp.build_ngram_index(nlp.Stemmer.stem_wordnet(text))
        article.save()
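# A quick self-contained check of the largest-component selection used above;
# the toy graph is hypothetical:
#
# >>> import networkx as nx
# >>> g = nx.Graph([('a', 'b'), ('b', 'c'), ('x', 'y')])
# >>> sorted(max(nx.connected_components(g), key=len))
# ['a', 'b', 'c']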
def generate_temp_article(text):
    # TODO: make this an Article class method
    from axel.articles.models import Article, Venue, TestCollocations
    import json
    venue = Venue.objects.get(acronym='SIGIR')
    stemmed_text = nlp.Stemmer.stem_wordnet(text)
    index = json.dumps(nlp.build_ngram_index(stemmed_text))
    article = Article(text=text, cluster_id='CS_COLLOCS', venue=venue, year=2013,
                      stemmed_text=stemmed_text, index=index)
    # TODO: extract title and abstract
    article.save_base(raw=True)
    article._create_collocations(True)
    # promote the extracted test collocations to real collocation objects,
    # then remove the temporary rows
    for test_colloc in TestCollocations.objects.filter(article=article):
        obj = article.CollocationModel(ngram=test_colloc.ngram, count=test_colloc.count,
                                       article=article, total_count=0, extra_fields={})
        obj.save()
    TestCollocations.objects.filter(article=article).delete()
    return article
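# Hypothetical usage, assuming a Venue with acronym 'SIGIR' and the
# 'CS_COLLOCS' cluster already exist (both are hard-coded above):
#
# >>> article = generate_temp_article(u'We study learning to rank for ...')
# >>> article.CollocationModel.objects.filter(article=article).count()  # > 0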