Example #1
def keyphrases_table(keyphrases,
                     texts,
                     similarity_measure=None,
                     synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing the matching scores of keyphrases in a set of texts.

    The resulting table is stored as a dictionary of dictionaries,
    where the entry table["keyphrase"]["text"] corresponds
    to the matching score (0 <= score <= 1) of keyphrase "keyphrase"
    in the text named "text".
    
    :param keyphrases: list of strings
    :param texts: dictionary of form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: Language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level and texts
              on the second level.
    """

    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    text_titles = texts.keys()
    text_collection = texts.values()
    similarity_measure.set_text_collection(text_collection, language)

    i = 0
    keyphrases_prepared = {
        keyphrase: utils.prepare_text(keyphrase)
        for keyphrase in keyphrases
    }
    # NOTE: empty keyphrases are skipped below, so count only the non-empty
    # ones to keep the progress total accurate.
    total_keyphrases = len([keyphrase for keyphrase in keyphrases if keyphrase])
    total_scores = len(text_collection) * total_keyphrases
    res = {}
    for keyphrase in keyphrases:
        if not keyphrase:
            continue
        res[keyphrase] = {}
        for j in range(len(text_collection)):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            res[keyphrase][text_titles[j]] = similarity_measure.relevance(
                keyphrases_prepared[keyphrase],
                text=j,
                synonimizer=synonimizer)

    logging.clear()

    return res
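A minimal usage sketch for keyphrases_table, assuming the module-level imports
(consts, relevance, utils, logging) are in scope; the text names, texts and
keyphrases below are made up for illustration:

keyphrases = ["machine learning", "pattern matching"]
texts = {
    "doc1": "Machine learning explores pattern matching in data.",
    "doc2": "This text is about something else entirely.",
}
table = keyphrases_table(keyphrases, texts)
# Each entry is a matching score in [0, 1]:
score = table["machine learning"]["doc1"]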
Example #2
    def set_text_collection(self, texts, language=consts.Language.ENGLISH):
        self.texts = texts
        self.language = language

        self.asts = []
        total_texts = len(texts)

        for i in xrange(total_texts):
            # NOTE(mikhaildubov): utils.text_to_strings_collection()
            #                     does utils.prepare_text() as well.
            self.asts.append(
                base.AST.get_ast(utils.text_to_strings_collection(texts[i]),
                                 self.ast_algorithm))
            logging.progress("Indexing texts with ASTs", i + 1, total_texts)

        logging.clear()
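A hedged sketch of how this indexing step is driven, inferred from the
keyphrases_table function above (the concrete texts are illustrative only):

measure = relevance.ASTRelevanceMeasure()
measure.set_text_collection(["first text", "second text"],
                            consts.Language.ENGLISH)
# Each text is now indexed with an annotated suffix tree (AST) and can be
# queried by its position in the collection:
score = measure.relevance(utils.prepare_text("first"), text=0)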
Example #3
def keyphrases_table(keyphrases, texts, similarity_measure=None, synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing the matching scores of keyphrases in a set of texts.

    The resulting table is stored as a dictionary of dictionaries,
    where the entry table["keyphrase"]["text"] corresponds
    to the matching score (0 <= score <= 1) of keyphrase "keyphrase"
    in the text named "text".
    
    :param keyphrases: list of strings
    :param texts: dictionary of form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: Language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level and texts
              on the second level.
    """

    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    text_titles = texts.keys()
    text_collection = texts.values()
    similarity_measure.set_text_collection(text_collection, language)

    i = 0
    keyphrases_prepared = {keyphrase: utils.prepare_text(keyphrase)
                           for keyphrase in keyphrases}
    # NOTE: empty keyphrases are skipped below, so count only the non-empty
    # ones to keep the progress total accurate.
    total_keyphrases = len([keyphrase for keyphrase in keyphrases if keyphrase])
    total_scores = len(text_collection) * total_keyphrases
    res = {}
    for keyphrase in keyphrases:
        if not keyphrase:
            continue
        res[keyphrase] = {}
        for j in xrange(len(text_collection)):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            res[keyphrase][text_titles[j]] = similarity_measure.relevance(
                                                        keyphrases_prepared[keyphrase],
                                                        text=j, synonimizer=synonimizer)

    logging.clear()

    return res
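Because the returned table is a plain dict of dicts, rendering it is
straightforward; a small sketch (the formatting below is not part of the
original code):

table = keyphrases_table(keyphrases, texts)
for keyphrase in sorted(table):
    for text_name in sorted(table[keyphrase]):
        print("%-30s %-15s %.3f" % (keyphrase, text_name,
                                    table[keyphrase][text_name]))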
Example #4
    def _preprocess_tokens(self, tokens_in_texts):
        if self.vector_space == consts.VectorSpace.WORDS:
            return tokens_in_texts
        elif self.vector_space == consts.VectorSpace.STEMS:
            # TODO(mikhaildubov): If the user does not specify the language, can we do some
            #                     auto language detection here?
            stemmed_tokens = []
            total_texts = len(tokens_in_texts)
            for i in xrange(total_texts):
                stemmed_tokens.append(
                    [self.stemmer.stem(token) for token in tokens_in_texts[i]])
                logging.progress("Stemming tokens in texts", i + 1,
                                 total_texts)
            # Clear the progress indicator before returning.
            logging.clear()
            return stemmed_tokens
        elif self.vector_space == consts.VectorSpace.LEMMATA:
            # TODO(mikhaildubov): Implement this (what lemmatizer to use here?)
            raise NotImplementedError()
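To illustrate what the STEMS branch does to tokens, a standalone sketch using
NLTK's Snowball stemmer (the sample tokens are made up; exact outputs may vary
slightly across NLTK versions):

from nltk.stem import snowball

stemmer = snowball.SnowballStemmer("english")
tokens = ["stemming", "reduces", "inflected", "words"]
print([stemmer.stem(token) for token in tokens])
# ['stem', 'reduc', 'inflect', 'word']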
Example #5
    def set_text_collection(self, texts, language=consts.Language.ENGLISH):
        self.language = language
        if self.vector_space == consts.VectorSpace.STEMS:
            self.stemmer = snowball.SnowballStemmer(self.language)
        raw_tokens = []
        total_texts = len(texts)
        for i in xrange(total_texts):
            raw_tokens.append(
                utils.tokenize_and_filter(utils.prepare_text(texts[i])))
            logging.progress("Preparing texts", i + 1, total_texts)

        logging.clear()

        # Convert to stems or lemmata, depending on the vector space type
        preprocessed_tokens = self._preprocess_tokens(raw_tokens)

        # Terms define the vector space (they can be words, stems or lemmata). They should be
        # defined once here because they will be reused when we compute tf-idf for queries.
        self.terms = list(set(utils.flatten(preprocessed_tokens)))
        self.tf, self.idf = self._tf_idf(preprocessed_tokens)
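The last two lines build the vector space: the terms are the deduplicated union
of all preprocessed tokens. A toy sketch of that step (utils.flatten is assumed
to behave like the generator expression below):

preprocessed_tokens = [["cat", "dog", "dog"], ["dog", "bird"]]
flattened = (token for text in preprocessed_tokens for token in text)
terms = list(set(flattened))
# e.g. ['cat', 'dog', 'bird'] -- the order is arbitrary, which is fine because
# _tf_idf builds an explicit term -> index mapping (see Example #6).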
Example #6
    def _tf_idf(self, tokens_in_texts):
        # Calculate the inverted term index to facilitate further calculations.
        # This is a mapping from a token to its position in the vector.
        term_index = {term: i for i, term in enumerate(self.terms)}

        total_texts = len(tokens_in_texts)
        terms_count = len(self.terms)

        # Calculate TF and IDF
        tf = [np.zeros(terms_count) for _ in xrange(total_texts)]
        idf_per_term = defaultdict(int)
        for i in xrange(total_texts):
            logging.progress("Processing texts for TF-IDF", i + 1, total_texts)
            # NOTE(mikhaildubov): For TF, we want to count each term as many times as it appears
            for term in tokens_in_texts[i]:
                if term in term_index:
                    tf[i][term_index[term]] += 1
            # NOTE(mikhaildubov): For IDF, we want to count each document once for each term
            if self.term_weighting == consts.TermWeighting.TF_IDF:
                for term in set(tokens_in_texts[i]):
                    if term in term_index:
                        idf_per_term[term] += 1
            # TF Normalization
            tf[i] = [
                freq * 1.0 / max(len(tokens_in_texts[i]), 1) for freq in tf[i]
            ]
        # Actual IDF metric calculation
        if self.term_weighting == consts.TermWeighting.TF_IDF:
            idf = np.zeros(len(self.terms))
            for term in idf_per_term:
                idf[term_index[term]] = 1 + math.log(
                    total_texts * 1.0 / idf_per_term[term])
        else:
            idf = None

        logging.clear()

        return tf, idf
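A worked instance of the weighting used above: TF is the raw in-document count
normalized by document length, and IDF is 1 + log(N / df). For example, with
N = 4 texts and a term occurring in 2 of them:

import math

total_texts = 4     # N: number of texts in the collection
docs_with_term = 2  # df: number of texts containing the term
idf = 1 + math.log(total_texts * 1.0 / docs_with_term)
print(idf)  # 1 + ln(2) ~= 1.693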