def _vectorize_sentence(
    self,
    corpus,
    isi_penting,
    important_words=10,
    batch_size=10,
    retry=5,
    **kwargs,
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = self._batching(cleaned_strings, batch_size=batch_size)
    if isi_penting:
        vectors_isi_penting = self._batching([isi_penting], batch_size=batch_size)

    if 'DeepSkipThought' in str(self.vectorizer):
        # skip-thought vectorizer exposes no attention, so no important words
        top_words = []
    else:
        if hasattr(self.vectorizer, 'attention'):
            # accumulate attention weight per cleaned token, skipping stopwords
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []

    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        # bias sentence-sentence similarity by similarity to the key points
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        # zero out near-identical pairs (including the diagonal) so a
        # sentence does not vote for itself
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
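# A minimal standalone sketch of the ranking step above: run PageRank (power
# iteration) over a cosine-similarity matrix of sentence vectors. The names
# `pagerank_scores`, `damping`, and `iters` are illustrative only; the real
# code delegates to the library's own pagerank() helper.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as _cosine_similarity


def pagerank_scores(sentence_vectors, damping=0.85, iters=100):
    sim = _cosine_similarity(sentence_vectors, sentence_vectors)
    np.fill_diagonal(sim, 0.0)  # a sentence should not vote for itself
    sim += 1e-6  # keep every column strictly positive
    transition = sim / sim.sum(axis=0, keepdims=True)  # column-stochastic
    n = sim.shape[0]
    scores = np.full(n, 1.0 / n)
    for _ in range(iters):
        scores = (1 - damping) / n + damping * (transition @ scores)
    return scores  # higher score = more central sentence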
def _vectorize_word(
    self,
    corpus,
    isi_penting,
    window_size=10,
    important_words=10,
    batch_size=10,
    **kwargs,
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    # rank the original (uncleaned) tokens, one per ngram window
    splitted = ' '.join(original_strings).split()
    if isi_penting:
        isi_penting = [isi_penting]
    else:
        # with no key points given, score words against the corpus itself
        isi_penting = original_strings
    vectors = self._batching(ngram_list, batch_size=batch_size)
    vectors_isi_penting = self._batching(isi_penting, batch_size=batch_size)

    if 'DeepSkipThought' in str(self.vectorizer):
        top_words = []
    else:
        if hasattr(self.vectorizer, 'attention') and important_words > 0:
            # accumulate attention weight per cleaned token, skipping stopwords
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []

    # score every ngram window against the mean key-points vector
    vectors_isi_penting = np.mean(vectors_isi_penting, axis=0)
    vectors_isi_penting = np.expand_dims(vectors_isi_penting, axis=0)
    similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
    scores = similar_isi_penting[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True
    )
    return (splitted, ranked_sentences, top_words, cluster_words(top_words))
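# Sketch of the word-scoring idea in _vectorize_word: every ngram window is
# scored by cosine similarity against the mean of the "isi penting"
# (key points) vectors. `score_against_keypoints` is an illustrative name,
# not part of the library.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as _cosine_similarity


def score_against_keypoints(ngram_vectors, keypoint_vectors):
    query = np.mean(keypoint_vectors, axis=0, keepdims=True)  # (1, dim)
    return _cosine_similarity(ngram_vectors, query)[:, 0]  # one score per ngram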
def socialmedia_form(word: str):
    """
    augmenting a word into socialmedia form.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]
    """
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    results = []

    if len(word) > 1:
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')
        if word[1:3] == 'ng':
            # drop the 'n' from an early 'ng' cluster
            results.append(word[:1] + word[2:])

    return list(set(results))
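# Hypothetical usage, assuming the module-level `vowels`/`consonants` are the
# usual sets ('aeiou' and the remaining letters); 'saya' fires two rules:
# trailing consonant+'a' -> 'e', and consonant+vowel ending -> append 'k'.
print(socialmedia_form('saya'))  # -> ['saye', 'sayak'] (set order may vary)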
def vowel_alternate(word: str, threshold: float = 0.5):
    """
    augmenting a word into vowel alternate.

    vowel_alternate('singapore') -> sngpore
    vowel_alternate('kampung') -> kmpng
    vowel_alternate('ayam') -> aym

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)

    Returns
    -------
    result: str
    """
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    word = list(word[:])
    i = 0
    while i < len(word) - 2:
        subword = word[i: i + 3]
        if (
            subword[0] in consonants
            and subword[1] in vowels
            and subword[2] in consonants
            and random.random() >= threshold
        ):
            word.pop(i + 1)
        i += 1
    return ''.join(word)
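# Hypothetical usage: with threshold=0.0, random.random() >= 0.0 is always
# true, so every consonant-vowel-consonant window fires and the result is
# deterministic.
print(vowel_alternate('kampung', threshold=0.0))  # -> 'kmpng'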
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    augmenting a formal word into socialmedia form. Purposely typo,
    purposely delete some vowels, purposely replace some subwords with
    slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, will augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, will augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.

    Returns
    -------
    result: list
    """
    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be bigger than 0 and less than 1')
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    check_file(
        PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
    )
    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # keyboard-adjacent replacement pairs used to simulate typos; kept as a
    # list of pairs because some consonants map to more than one replacement
    # (a dict literal would silently drop the duplicate keys)
    replace_consonants = [
        ('n', 'm'),
        ('t', 'y'),
        ('r', 't'),
        ('g', 'h'),
        ('j', 'k'),
        ('k', 'l'),
        ('d', 's'),
        ('d', 'f'),
        ('g', 'f'),
        ('b', 'n'),
    ]
    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    if len(word) > 1:
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')
        if word[1:3] == 'ng':
            # drop the 'n' from an early 'ng' cluster
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants:
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    # randomly delete vowels according to prob_delete_vowel
    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            if random.random() > prob_delete_vowel and c in vowels:
                continue
            deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    # drop candidates the sentencepiece tokenizer considers implausible
    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
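# Hypothetical usage; the first call downloads a SentencePiece model through
# check_file, and the output varies run to run because vowels are deleted at
# random and candidates are then filtered by the tokenizer:
#
#   shortform('pergi')
#   -> e.g. ['pergi', 'prgi', ...]  (contents vary between runs)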
def summarize(self, corpus, top_k: int = 3, important_words: int = 3, **kwargs):
    """
    Summarize a string or a list of strings / corpus.

    Parameters
    ----------
    corpus: str, list
    top_k: int, (default=3)
        number of summarized strings.
    important_words: int, (default=3)
        number of important words.

    Returns
    -------
    result: dict
        dictionary of {'summary', 'top-words', 'cluster-top-words'}
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]

    if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
        # skip-thought path: encode padded sequences and pull attention
        # weights straight from the graph
        sequences = skip_thought.batch_sequence(
            cleaned_strings,
            self._vectorizer.dictionary,
            maxlen=self._vectorizer._maxlen,
        )
        vectors, attention = self._vectorizer._sess.run(
            [self._vectorizer._logits, self._vectorizer._attention],
            feed_dict={self._vectorizer._X: np.array(sequences)},
        )
        attention = attention.sum(axis=0)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._vectorizer._rev_dictionary[i]
            for i in indices
            if self._vectorizer._rev_dictionary[i] not in STOPWORDS
        ][:important_words]
    else:
        vectors = self._vectorizer.vectorize(corpus)
        # accumulate attention weight per cleaned token, skipping stopwords
        attentions = self._vectorizer.attention(corpus, **kwargs)
        flatten = list(itertools.chain(*attentions))
        r = {}
        for f in flatten:
            c = simple_textcleaning(f[0])
            if c in STOPWORDS:
                continue
            if c not in r:
                r[c] = f[1]
            else:
                r[c] += f[1]
        top_words = sorted(r, key=r.get, reverse=True)[:important_words]

    similar = cosine_similarity(vectors, vectors)
    # zero out near-identical pairs (including the diagonal) so a sentence
    # does not vote for itself
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
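# Hypothetical usage on an instantiated model exposing .summarize (`model`
# and `text` are placeholder names, not from the source):
#
#   result = model.summarize(text, top_k=3, important_words=5)
#   result['summary']            # the top_k highest-ranked sentences, joined
#   result['top-words']          # highest-attention non-stopword tokens
#   result['cluster-top-words']  # top words grouped by cluster_words()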