Exemplo n.º 1
0
 def _results(self, items, **kwargs):
     """Build a Results object for *items* and attach this collector's state.

     ``items`` is a list of ``(score, docnum)`` tuples. Any extra keyword
     arguments are forwarded unchanged to the ``Results`` constructor. The
     returned object additionally carries this collector's ``runtime`` and
     a back-reference to the collector itself.
     """
     results = Results(self.top_searcher, self.q, items, **kwargs)
     # Invariant information that does not depend on the given items.
     results.runtime = self.runtime
     results.collector = self
     return results
    def text_term_frequency(docs: Results) -> list:
        """
        Determine the most frequent terms in a set of search results.

        Hits are read in iteration order until one scores below the score of
        the hit at the middle position of ``docs``. The ``paragraph_content``
        of each accepted hit is split on single spaces; tokens that are stop
        words, the literal string "None", or five characters or shorter are
        ignored.

        The remaining terms are ranked by descending count, and the slice of
        that ranking between the top 0.25 % and the top 20 % is returned, so
        the very most frequent terms are dropped.

        :param docs: Documents to analyse, as a Results object
        :return: List of ``(term, count)`` tuples, most frequent first
        """
        terms = {}
        score_threshold = None
        for doc in docs:
            if score_threshold is None:
                # The threshold does not depend on the current hit, so it is
                # computed once; doing it lazily keeps empty result sets from
                # ever calling docs.score().
                # NOTE(review): assumes docs.score(i) is valid for
                # i == len(docs) // 2 - confirm this holds when the result
                # set holds fewer scored hits than len(docs) reports.
                score_threshold = docs.score(len(docs) // 2)
            # Presumably hits arrive in descending score order, so the first
            # hit below the threshold ends the scan - verify against caller.
            if doc.score < score_threshold:
                break
            for word in doc["paragraph_content"].split(" "):
                # Skip rejected tokens entirely instead of counting them
                # under a shared placeholder key.
                if word == "None" or word in STOP_WORDS or len(word) <= 5:
                    continue
                terms[word] = terms.get(word, 0) + 1
        ranked = sorted(terms.items(), key=lambda kv: kv[1], reverse=True)
        lower = int(len(ranked) * 0.0025)
        upper = int(len(ranked) * 0.20)
        return ranked[lower:upper]
Exemplo n.º 3
0
 def _results(self, q, docnums, docset, runtime):
     """Wrap plain document numbers in a Results object.

     Each document number is paired with ``None`` in the score position
     before being handed to ``Results`` together with this object's
     searcher, the query ``q``, ``docset`` and ``runtime``.
     """
     unscored_hits = [(None, number) for number in docnums]
     return Results(
         self.searcher,
         q,
         unscored_hits,
         docset,
         runtime=runtime,
     )