forked from yask123/Summarize-it
-
Notifications
You must be signed in to change notification settings - Fork 11
/
lsa.py
172 lines (147 loc) · 6.98 KB
/
lsa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import logging
import math
import re
from warnings import warn

try:
    import numpy
except ImportError:
    numpy = None
try:
    from numpy.linalg import svd as singular_value_decomposition
except ImportError:
    singular_value_decomposition = None

import spacy.en
from spacy.en import STOPWORDS
from spacy.parts_of_speech import VERB, NOUN, PROPN, PRON, PUNCT

from base_summarizer import BaseSummarizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
class LsaSummarizer(BaseSummarizer):
    """Summarizer based on Latent Semantic Analysis (LSA).

    Builds a term/sentence occurrence matrix from a spacy-parsed document,
    applies smoothed term-frequency weighting, decomposes the matrix with
    SVD and ranks sentences by their weight in the latent topic space.
    Sentences that look like well-formed questions are preferred in the
    final selection.
    """

    # Keep at least this many singular values when computing ranks.
    MIN_DIMENSIONS = 3
    # Fraction of singular values retained (1/1 == keep all of them).
    REDUCTION_RATIO = 1 / 1
    _stop_words = frozenset()

    def __init__(self):
        BaseSummarizer.__init__(self)
        # Entity recognition and the rule matcher are not needed here;
        # disabling them makes the spacy pipeline cheaper to load.
        self.nlp = spacy.en.English(entity=False, matcher=False)
        self.nlp_doc = None

    @property
    def stop_words(self):
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count, user_dict):
        """Summarize *document* into at most ``sentences_count`` sentences.

        :param document: raw text to summarize.
        :param sentences_count: desired number of summary sentences.
        :param user_dict: mapping of sentence text -> user name, used to
            boost sentences attributed to known users.
        :returns: tuple () for an empty document, otherwise a list of
            sentence strings.
        :raises ValueError: if NumPy is not installed.
        """
        self._ensure_dependecies_installed()
        self.nlp_doc = self.nlp(document)
        self.user_dict = user_dict
        logger.info("Created doc")
        dictionary = self._create_dictionary()
        # empty document
        if not dictionary:
            return ()
        matrix = self._create_matrix(dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
        # _get_best_sentences invokes the rate function once per sentence,
        # in document order, so an iterator hands out the matching rank.
        ranks = iter(self._compute_ranks(sigma, v))
        sents = [s.text for s in self.nlp_doc.sents]
        logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents))
        # Over-select (2x) so the question filter below has candidates left.
        new_sents = self._get_best_sentences(sents, sentences_count * 2,
                                             lambda s: next(ranks))
        filt_sents = [sent for sent in new_sents if self.better_question(sent)]
        additional_sents = set(new_sents) - set(filt_sents)
        to_add = sentences_count - len(filt_sents)
        final_sents = filt_sents
        if to_add > 0:
            # BUG FIX: sort the leftovers by length first, THEN take the top
            # `to_add`; the original sliced an arbitrary subset of the set
            # before sorting it.
            final_sents += sorted(additional_sents, key=len, reverse=True)[:to_add]
        logger.info("Filtered sentences %s", filt_sents)
        logger.info("Final recommendations are %s", final_sents[:sentences_count])
        # BUG FIX: return the capped list that the log line above declares
        # as the final recommendations (the original returned everything).
        return final_sents[:sentences_count]

    def better_question(self, txt):
        """Heuristically decide whether *txt* reads as a well-formed question.

        True only when a sentence longer than five tokens contains a
        modal/question word that is later followed by both the syntactic
        ROOT and a question mark. Implicitly returns None (falsy) when no
        decision is reached — callers use this as a filter predicate.
        """
        if len(txt.split()) > 5:
            parse = self.nlp(txt)
            for sent in parse.sents:
                if len(sent) > 5:
                    p2 = self.nlp(sent.text)
                    for i, wd in enumerate(p2):
                        if wd.lemma_ in (u'can', u'should', u'will', u'could',
                                         u'why', u'what', u'how', u'is'):
                            rest = p2[i + 1:]
                            return (u'ROOT' in [x.dep_ for x in rest]
                                    and u'?' in [x.orth_ for x in rest])

    def _ensure_dependecies_installed(self):
        # NOTE(review): the name keeps the original (misspelled) spelling so
        # any external callers continue to work.
        if numpy is None:
            raise ValueError("LSA summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def _create_dictionary(self):
        """Create mapping key = word, value = row index.

        Rows cover the lemmas of non-stopword, non-pronoun nouns/verbs plus
        every user name from ``self.user_dict`` (so user mentions can be
        boosted in ``_create_matrix``).
        """
        unique_words = frozenset(
            w.lemma_ for w in self.nlp_doc
            # BUG FIX: the original tested `w not in STOPWORDS`, comparing a
            # Token object against a set of strings — always true, so
            # stopwords were never excluded. Test the token text instead.
            if w.lower_ not in STOPWORDS and w.tag_ != "PRP"
            and (w.pos == VERB or w.pos == NOUN))
        unique_users = frozenset(self.user_dict.values())
        logger.info("Have %s unique words", len(unique_words))
        logger.info("Have %s unique users", len(unique_users))
        return dict((w, i) for i, w in enumerate(unique_words | unique_users))

    def collect_bow(self, txt):
        """Return the non-empty bag-of-words strings for each sentence of *txt*."""
        # BUG FIX: the original referenced an undefined global `nlp`;
        # use the instance's pipeline.
        sents = self.nlp(txt).sents
        return [bag for bag in (retrieve_main_bow(sent) for sent in sents) if bag]

    def _create_matrix(self, dictionary):
        """
        Creates matrix of shape |unique words|×|sentences| where cells
        contains number of occurences of words (rows) in senteces (cols).
        """
        sentences = list(self.nlp_doc.sents)
        words_count = len(dictionary)
        sentences_count = len(sentences)
        logger.info("Have %s sentences " % sentences_count)
        if words_count < sentences_count:
            # LSA needs more terms than sentences for a meaningful SVD.
            logger.warning(
                "Number of words (%d) is lower than number of sentences (%d). "
                "LSA algorithm may not work properly.",
                words_count, sentences_count)
        # create matrix |unique words|×|sentences| filled with zeroes
        matrix = numpy.zeros((words_count, sentences_count))
        for col, sentence in enumerate(sentences):
            for word in (wd.lemma_ for wd in sentence if wd.lemma_ in dictionary):
                matrix[dictionary[word], col] += 1
            # Boost sentences attributed to a known user by counting the
            # user's name as one extra occurrence in that sentence.
            user = self.user_dict.get(sentence.text)
            if user is not None and len(user) > 1:
                logger.info("Matching sentence %s with user %s", sentence.text, user)
                matrix[dictionary[user], col] += 1
        return matrix

    def _compute_term_frequency(self, matrix, smooth=0.4):
        """
        Computes TF metrics for each sentence (column) in the given matrix.
        You can read more about smoothing parameter at URL below:
        http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
        """
        assert 0.0 <= smooth < 1.0
        max_word_frequencies = numpy.max(matrix, axis=0)
        # Vectorized form of the original per-cell loop: columns whose max
        # frequency is zero are left untouched, all others are rescaled to
        # smooth + (1 - smooth) * tf.
        nonzero = max_word_frequencies != 0
        matrix[:, nonzero] = (smooth + (1.0 - smooth)
                              * (matrix[:, nonzero] / max_word_frequencies[nonzero]))
        return matrix

    def _compute_ranks(self, sigma, v_matrix):
        """Return one rank per sentence from the SVD factors.

        Each sentence's rank is sqrt(sum(sigma_i^2 * v_i^2)) over the
        retained dimensions (with REDUCTION_RATIO == 1 all of them).
        """
        assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable"
        dimensions = max(LsaSummarizer.MIN_DIMENSIONS,
                         int(len(sigma) * LsaSummarizer.REDUCTION_RATIO))
        powered_sigma = tuple(s ** 2 if i < dimensions else 0.0
                              for i, s in enumerate(sigma))
        ranks = []
        # iterate over columns of matrix (rows of transposed matrix)
        for column_vector in v_matrix.T:
            rank = sum(s * v ** 2 for s, v in zip(powered_sigma, column_vector))
            ranks.append(math.sqrt(rank))
        return ranks
def retrieve_main_bow(tokens):
    """Build a short bag-of-words string from a sequence of spacy tokens.

    Collects lowercased non-stopword nouns/verbs plus the heads and
    (filtered) children of adverbial-clause / open-complement tokens, joins
    them into a single '.'-terminated string with newlines/tabs stripped.

    :param tokens: iterable of spacy tokens (e.g. a Span or Doc).
    :returns: the joined string, or None when fewer than three words remain.
    """
    bow = set()
    for tok in tokens:
        if tok.pos == PUNCT:
            continue
        # BUG FIX: the original compared the *integer* attribute `tok.dep`
        # with the string 'xcomp' (always False); the string label lives
        # in `tok.dep_`.
        if tok.dep_ in (u'advcl', u'xcomp'):
            # NOTE(review): the child filter checks `tok.tag_`, not
            # `ti.tag_` — possibly intended to filter pronoun children;
            # behavior kept as-is, confirm with the author.
            bow.add(' '.join(ti.lower_ for ti in tok.children
                             if tok.tag_ != "PRP" and ti.lower_ not in STOPWORDS))
            bow.add(tok.lower_)
        if tok.pos == NOUN or tok.pos == VERB:
            if tok.tag_ != "PRP" and tok.lower_ not in STOPWORDS:
                bow.add(tok.lower_)
    # Strip embedded newlines/tabs (the original class '[\n\t\n]' duplicated
    # \n; also `re` was never imported at module level).
    mt = re.sub(r'[\n\t]', u'', u' '.join(bow) + u'.')
    return mt if len(mt.strip().split()) > 2 else None