예제 #1
0
    def test_tfidf(self):
        """Check the ``tfidf`` score of the term 'implementation'.

        Database entries are created from the first entry of
        ``self.records``; the score of the term for the first article
        returned by ``Article.objects.all()`` must match ``log(1/2)``.
        """
        create_db_entries(self.records[0])
        article = Article.objects.all()[0]
        implementation = Term.objects.get(term='implementation')

        expected = math.log(1.0 / 2.0)

        # Compare with a tolerance: exact equality on floats would fail on
        # any algebraically equivalent reformulation inside tfidf().
        self.assertAlmostEqual(expected, tfidf(implementation, article))
예제 #2
0
def search(request):
    """Render the article search page and, on POST, the ranked results.

    Behaviour by request:

    * Non-POST request: renders the search template with no context.
    * POST with an invalid form: renders the template with the raw POST
      data passed as ``query_terms``.
    * POST with a valid form: splits the ``q`` field on whitespace,
      lower-cases each term, finds matching articles, ranks them by TF-IDF
      and renders them along with the total article count and a list of
      ``(author, average score)`` tuples.

    Returns the response produced by ``render``.
    """
    template = 'pubmed_search/search.html'

    if request.method != 'POST':
        return render(request, template)

    form = SearchForm(request.POST)
    if not form.is_valid():
        # NOTE(review): the raw POST data is handed over as 'query_terms';
        # confirm this is what the template expects.
        return render(request, template, {'query_terms': request.POST})

    query_terms = [word.lower() for word in form.cleaned_data['q'].split()]
    intermediate_results = _find_articles(query_terms)

    # One (TF-IDF, article) entry per known term and candidate article.
    terms = Term.objects.filter(term__in=query_terms)
    ordered_results = [
        (tfidf(term, doc), doc)
        for term in terms
        for doc in intermediate_results
    ]
    # Sort on the score only. Sorting the bare tuples would fall back to
    # comparing the article objects whenever two scores are equal, which
    # raises TypeError on Python 3 for objects that define no ordering.
    ordered_results.sort(key=lambda scored: scored[0], reverse=True)

    # Strip out duplicate articles without changing the order.
    results = _deduplicate_articles(ordered_results)

    # Total number of articles, for the "X of Y documents" display.
    total_docs = Article.objects.count()

    # Collect every score per author. ordered_results holds one entry per
    # (term, article) pair, so an article contributes a score for each
    # query term, including terms it may not match.
    author_scores = {}
    authors_by_pk = {}
    for score, doc in ordered_results:
        for author in doc.authors.all():
            # Keep the already-loaded instance so no extra query per
            # author is needed when building the averages below.
            authors_by_pk.setdefault(author.pk, author)
            author_scores.setdefault(author.pk, []).append(score)

    # Each author's summed score is divided by the total number of
    # (score, article) entries, not by the author's own entry count.
    # With no entries there are no authors, so no division happens.
    total_results = len(ordered_results)
    author_averages = [
        (authors_by_pk[pk], fsum(scores) / total_results)
        for pk, scores in author_scores.items()
    ]

    return render(request, template, {
        'articles': results,
        'query_terms': query_terms,
        'total_documents': total_docs,
        'author_averages': author_averages,
    })