def test_tfidf(self):
    """Check the ``tfidf`` score of the term 'implementation'.

    The first entry of ``self.records`` is passed to ``create_db_entries``,
    then the first ``Article`` returned by the ORM and the ``Term`` whose
    text is 'implementation' are scored with ``tfidf``.  The expected score
    is ``log(1/2)``.
    """
    create_db_entries(self.records[0])
    article = Article.objects.all()[0]
    implementation = Term.objects.get(term='implementation')
    expected = math.log(1.0 / 2.0)
    # Floating-point results should not be compared for exact equality:
    # a different but mathematically equivalent evaluation order inside
    # tfidf() would otherwise fail the test spuriously.
    self.assertAlmostEqual(expected, tfidf(implementation, article))
def search(request):
    """Render the search page, running a TF-IDF ranked search on POST.

    Behaviour by request type:

    * Non-POST: renders 'pubmed_search/search.html' with no context.
    * POST with an invalid ``SearchForm``: renders the template with the
      raw POST data under ``query_terms``.
    * POST with a valid form: splits the ``q`` field on whitespace,
      lower-cases the terms, and renders the template with

      - ``articles``: the de-duplicated results, highest score first,
      - ``query_terms``: the lower-cased query terms,
      - ``total_documents``: the count of all ``Article`` rows,
      - ``author_averages``: a list of ``(author, average score)`` tuples.
    """
    if request.method != 'POST':
        return render(request, 'pubmed_search/search.html')

    form = SearchForm(request.POST)
    if not form.is_valid():
        return render(request, 'pubmed_search/search.html',
                      {'query_terms': request.POST})

    query_terms = [term.lower() for term in form.cleaned_data['q'].split()]
    intermediate_results = _find_articles(query_terms)

    # Calculate the TF-IDF of each known query term for every candidate
    # document, then order the (TF-IDF, article) pairs by score.
    terms = Term.objects.filter(term__in=query_terms)
    ordered_results = [
        (tfidf(term, doc), doc)
        for term in terms
        for doc in intermediate_results
    ]
    # Sort on the score only.  Sorting the bare tuples would fall back to
    # comparing the article objects whenever two scores tie, which raises
    # TypeError on Python 3 for objects that define no ordering.  The sort
    # is stable, so tied pairs keep their original relative order.
    ordered_results.sort(key=lambda result: result[0], reverse=True)

    # Strip out duplicate articles without changing the order.
    results = _deduplicate_articles(ordered_results)

    # Total number of articles, for "X of Y documents".
    total_docs = Article.objects.count()

    # Calculate the average TF-IDF for each author in the search results.
    # The average includes scores of zero for documents that match term A
    # but not term B: such a document has a positive TF-IDF for A and a
    # TF-IDF of 0 for B.  Start from ordered_results and build a dictionary
    # mapping each author's primary key to the list of scores of every
    # (term, article) pair that author is attached to.
    author_scores = {}
    for score, doc in ordered_results:
        for author in doc.authors.all():
            author_scores.setdefault(author.pk, []).append(score)

    # Fetch every needed author in one query instead of one query each.
    authors_by_pk = Author.objects.in_bulk(list(author_scores))

    # The divisor is the number of (term, article) pairs overall, not the
    # number of scores collected for the individual author.  It can only be
    # zero when ordered_results is empty, in which case author_scores is
    # empty too and no division takes place.
    total_results = len(ordered_results)
    author_averages = [
        (authors_by_pk[author_pk], fsum(scores) / total_results)
        for author_pk, scores in author_scores.items()
    ]

    return render(request, 'pubmed_search/search.html',
                  {'articles': results,
                   'query_terms': query_terms,
                   'total_documents': total_docs,
                   'author_averages': author_averages})