def vectorize(self, strings):
    """
    Vectorize string inputs using skip-thought attention.

    Parameters
    ----------
    strings : str / list of str

    Returns
    -------
    array: vectorized strings
    """
    if isinstance(strings, list):
        if not isinstance(strings[0], str):
            raise ValueError('input must be a list of strings or a string')
    else:
        if not isinstance(strings, str):
            raise ValueError('input must be a list of strings or a string')
    if isinstance(strings, str):
        strings = [strings]

    # clean each string; `summary_textcleaning` returns (original, cleaned),
    # and only the cleaned form is needed for vectorization
    splitted_fullstop = [summary_textcleaning(i) for i in strings]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    sequences = skip_thought.batch_sequence(
        cleaned_strings, self.dictionary, maxlen=self._maxlen
    )
    return self._sess.run(
        self._logits, feed_dict={self._X: np.array(sequences)}
    )
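
# A hedged usage sketch for `vectorize` (the accessor that builds `model` is
# hypothetical; only the `vectorize` signature comes from this module):
#
#     model = ...  # a loaded skip-thought summarizer from this library
#     vectors = model.vectorize(['Kerajaan fokus kepada ekonomi digital.'])
#     vectors.shape  # -> (1, embedding_size)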
def summarize(self, corpus, top_k: int = 3, important_words: int = 3, **kwargs):
    """
    Summarize a list of strings / corpus.

    Parameters
    ----------
    corpus: str, list
    top_k: int, (default=3)
        number of summarized strings.
    important_words: int, (default=3)
        number of important words.

    Returns
    -------
    dictionary: {'summary', 'top-words', 'cluster-top-words'}
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list of strings or a string')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be a list of strings')

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    # keep both original and cleaned versions of every sentence;
    # ranking uses vectors from the cleaned text, output uses the originals
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]

    if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
        # skip-thought path: run the session directly to get both sentence
        # vectors and summed attention over the vocabulary
        sequences = skip_thought.batch_sequence(
            cleaned_strings,
            self._vectorizer.dictionary,
            maxlen=self._vectorizer._maxlen,
        )
        vectors, attention = self._vectorizer._sess.run(
            [self._vectorizer._logits, self._vectorizer._attention],
            feed_dict={self._vectorizer._X: np.array(sequences)},
        )
        attention = attention.sum(axis=0)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._vectorizer._rev_dictionary[i]
            for i in indices
            if self._vectorizer._rev_dictionary[i] not in STOPWORDS
        ][:important_words]
    else:
        # transformer path: the vectorizer exposes `vectorize` and per-token
        # `attention`; accumulate attention weight per cleaned token
        vectors = self._vectorizer.vectorize(corpus)
        attentions = self._vectorizer.attention(corpus, **kwargs)
        flatten = list(itertools.chain(*attentions))
        r = {}
        for f in flatten:
            c = simple_textcleaning(f[0])
            if c in STOPWORDS:
                continue
            r[c] = r.get(c, 0) + f[1]
        top_words = sorted(r, key=r.get, reverse=True)[:important_words]

    # rank sentences with PageRank over the cosine-similarity graph;
    # self-similarity (~1.0) is zeroed so a sentence cannot vote for itself
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [sentence for _, sentence in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
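
# A self-contained sketch of the ranking step used in `summarize` above:
# a cosine-similarity graph over sentence vectors, self-loops zeroed, then
# PageRank scores pick the top sentences. The random vectors and the
# `_demo_pagerank` power iteration are illustrative stand-ins; the real
# module supplies its own `pagerank` and learned sentence vectors.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def _demo_pagerank(matrix, damping=0.85, iterations=100):
    # power iteration over the row-normalised similarity graph
    n = matrix.shape[0]
    row_sums = matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0  # guard against isolated sentences
    transition = matrix / row_sums
    scores = np.full(n, 1.0 / n)
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * transition.T.dot(scores)
    return scores


if __name__ == '__main__':
    sentences = ['sentence one.', 'sentence two.', 'sentence three.']
    vectors = np.random.rand(len(sentences), 8)  # stand-in embeddings
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0  # drop self-similarity
    scores = _demo_pagerank(similar)
    ranked = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )
    print([s for _, s in ranked[:2]])  # two highest-scoring sentences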