def xlnet_tokenization_siamese(tokenizer, left, right):
    input_ids, input_mask, all_seg_ids, s_tokens = [], [], [], []
    for i in range(len(left)):
        tokens = tokenize_fn(transformer_textcleaning(left[i]), tokenizer)
        tokens_right = tokenize_fn(transformer_textcleaning(right[i]), tokenizer)
        segment_ids = [SEG_ID_A] * len(tokens)
        tokens.append(SEP_ID)
        # keep the readable pieces of the left sentence (up to the first SEP)
        s_tokens.append([tokenizer.IdToPiece(t) for t in tokens])
        segment_ids.append(SEG_ID_A)

        tokens.extend(tokens_right)
        segment_ids.extend([SEG_ID_B] * len(tokens_right))
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_B)

        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        cur_input_ids = tokens
        # XLNet convention: 0 marks a real token, 1 marks padding
        cur_input_mask = [0] * len(cur_input_ids)
        assert len(tokens) == len(cur_input_mask)
        assert len(tokens) == len(segment_ids)
        input_ids.append(tokens)
        input_mask.append(cur_input_mask)
        all_seg_ids.append(segment_ids)

    maxlen = max(len(i) for i in input_ids)
    input_ids = padding_sequence(input_ids, maxlen)
    input_mask = padding_sequence(input_mask, maxlen, pad_int=1)
    all_seg_ids = padding_sequence(all_seg_ids, maxlen, pad_int=SEG_ID_PAD)

    return input_ids, input_mask, all_seg_ids, s_tokens
def xlnet_tokenization(tokenizer, texts):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    for text in texts:
        text = transformer_textcleaning(text)
        tokens_a = tokenize_fn(text, tokenizer)[:MAXLEN]
        tokens = []
        segment_id = []
        for token in tokens_a:
            tokens.append(token)
            segment_id.append(SEG_ID_A)
        tokens.append(SEP_ID)
        segment_id.append(SEG_ID_A)
        # XLNet places the CLS token at the end of the sequence, unlike BERT
        tokens.append(CLS_ID)
        segment_id.append(SEG_ID_CLS)

        input_id = tokens
        input_mask = [0] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        s_tokens.append([tokenizer.IdToPiece(t) for t in tokens])

    maxlen = max(len(i) for i in input_ids)
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen, pad_int=1)
    segment_ids = padding_sequence(segment_ids, maxlen, pad_int=SEG_ID_PAD)

    return input_ids, input_masks, segment_ids, s_tokens
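# A minimal, self-contained sketch of the XLNet packing convention the two
# functions above rely on. The constant values used here (segment ids 0/1/2,
# pad segment 4, sep id 4, cls id 3, mask 0=real/1=pad) follow the original
# XLNet reference implementation; treat them as assumptions rather than this
# module's own definitions, and the function name as purely illustrative.
def _pack_xlnet_pair_sketch(ids_a, ids_b, sep_id=4, cls_id=3):
    seg_a, seg_b, seg_cls = 0, 1, 2  # SEG_ID_A / SEG_ID_B / SEG_ID_CLS
    # layout: A tokens, SEP, B tokens, SEP, CLS (CLS last, unlike BERT)
    tokens = list(ids_a) + [sep_id] + list(ids_b) + [sep_id] + [cls_id]
    segments = (
        [seg_a] * (len(ids_a) + 1)
        + [seg_b] * (len(ids_b) + 1)
        + [seg_cls]
    )
    mask = [0] * len(tokens)  # 0 = real token, 1 = padding
    return tokens, segments, mask

# e.g. _pack_xlnet_pair_sketch([10, 11], [20]) ->
# ([10, 11, 4, 20, 4, 3], [0, 0, 0, 1, 1, 2], [0, 0, 0, 0, 0, 0])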
def bert_tokenization_siamese(tokenizer, left, right):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    for i in range(len(left)):
        tokens_a = tokenizer.tokenize(transformer_textcleaning(left[i]))
        logging.debug(tokens_a)
        tokens_b = tokenizer.tokenize(transformer_textcleaning(right[i]))
        logging.debug(tokens_b)

        tokens = ['[CLS]']
        segment_id = [0]
        for token in tokens_a:
            tokens.append(token)
            segment_id.append(0)
        tokens.append('[SEP]')
        s_tokens.append(tokens[:])
        segment_id.append(0)
        for token in tokens_b:
            tokens.append(token)
            segment_id.append(1)
        tokens.append('[SEP]')
        segment_id.append(1)

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)

    maxlen = max(len(i) for i in input_ids)
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen)
    segment_ids = padding_sequence(segment_ids, maxlen)

    return input_ids, input_masks, segment_ids, s_tokens
def paraphrase(
    self, string: str, beam_search: bool = True, split_fullstop: bool = True
):
    """
    Paraphrase a string.

    Parameters
    ----------
    string : str
    beam_search : bool, optional (default=True)
        If True, use beam search decoder, else use greedy decoder.
    split_fullstop: bool, optional (default=True)
        If True, will generate a paraphrase for each sentence split by fullstop.

    Returns
    -------
    result: str
    """
    if split_fullstop:
        splitted_fullstop = split_into_sentences(
            transformer_textcleaning(string)
        )

        results, batch, mapping = [], [], {}
        for no, splitted in enumerate(splitted_fullstop):
            if len(splitted.split()) < 4:
                # too short to paraphrase, keep the sentence verbatim
                results.append(splitted)
            else:
                mapping[len(batch)] = no
                results.append('REPLACE-ME')
                batch.append(splitted)

        if len(batch):
            output = self._paraphrase(batch, beam_search=beam_search)
            for no in range(len(output)):
                results[mapping[no]] = output[no]

        return ' '.join(results)
    else:
        return self._paraphrase([string], beam_search=beam_search)[0]
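# The REPLACE-ME / `mapping` trick above batches only the sentences worth
# paraphrasing, then splices model outputs back into their original positions.
# A self-contained sketch of that pattern, with a hypothetical `transform`
# callable standing in for the model:
def _batch_and_splice_sketch(sentences, transform, min_words=4):
    results, batch, mapping = [], [], {}
    for no, s in enumerate(sentences):
        if len(s.split()) < min_words:
            results.append(s)            # too short: keep verbatim
        else:
            mapping[len(batch)] = no     # remember where the output goes
            results.append(None)
            batch.append(s)
    for i, out in enumerate(transform(batch) if batch else []):
        results[mapping[i]] = out
    return ' '.join(results)

# e.g. _batch_and_splice_sketch(
#     ['Ok.', 'ayat yang agak panjang untuk diuji'],
#     transform=lambda xs: [x.upper() for x in xs],
# ) -> 'Ok. AYAT YANG AGAK PANJANG UNTUK DIUJI'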
def bert_tokenization(tokenizer, texts):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    for text in texts:
        text = transformer_textcleaning(text)
        tokens_a = tokenizer.tokenize(text)[:MAXLEN]
        tokens = ['[CLS]'] + tokens_a + ['[SEP]']
        segment_id = [0] * len(tokens)
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        # BERT convention: 1 marks a real token, 0 marks padding
        input_mask = [1] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        s_tokens.append(tokens)

    maxlen = max(len(i) for i in input_ids)
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen)
    segment_ids = padding_sequence(segment_ids, maxlen)

    return input_ids, input_masks, segment_ids, s_tokens
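# Every tokenization helper above delegates batching to `padding_sequence`,
# which is not defined in this section. The sketch below is a hedged
# reconstruction of the behaviour they rely on (right-pad every sequence to
# `maxlen` with `pad_int`), not necessarily the library's exact implementation.
def _padding_sequence_sketch(seqs, maxlen, pad_int=0):
    # right-pad each sequence with `pad_int` until it reaches `maxlen`
    return [list(s) + [pad_int] * (maxlen - len(s)) for s in seqs]

# e.g. _padding_sequence_sketch([[1, 2], [3]], maxlen=3)
# -> [[1, 2, 0], [3, 0, 0]]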
def parse_bert_tagging(left, tokenizer):
    left = transformer_textcleaning(left)
    bert_tokens = ['[CLS]'] + tokenizer.tokenize(left) + ['[SEP]']
    input_mask = [1] * len(bert_tokens)
    return tokenizer.convert_tokens_to_ids(bert_tokens), input_mask, bert_tokens
def rake(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str],
        for the automatic ngram generator.

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v
    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores
    )
    sortedKeywords = sorted(
        keywordcandidates.items(), key=operator.itemgetter(1), reverse=True
    )
    total = sum(i[1] for i in sortedKeywords)
    ranked_sentences = [
        (i[1] / total, i[0]) for i in sortedKeywords if vocab[i[0]] >= atleast
    ]
    return ranked_sentences[:top_k]
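# RAKE scores each word as degree(word) / frequency(word), where degree counts
# co-occurrences inside candidate phrases. `rake_function.calculate_word_scores`
# is external to this section; the sketch below is a hedged, minimal version of
# that idea (without the optional attention weighting), for illustration only.
from collections import defaultdict


def _rake_word_scores_sketch(phrase_list):
    freq, degree = defaultdict(int), defaultdict(int)
    for phrase in phrase_list:
        words = phrase.split()
        for word in words:
            freq[word] += 1
            degree[word] += len(words) - 1  # co-occurring words in the phrase
    # classic RAKE: degree includes the word itself, hence degree + freq
    return {w: (degree[w] + freq[w]) / freq[w] for w in freq}

# e.g. _rake_word_scores_sketch(['machine learning', 'deep learning'])
# -> {'machine': 2.0, 'learning': 2.0, 'deep': 2.0}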
def similarity_transformer(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    use_maxsum: bool = False,
    use_mmr: bool = False,
    diversity: float = 0.5,
    nr_candidates: int = 20,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using sentence embedding VS keyword embedding similarity.
    https://github.com/MaartenGr/KeyBERT/blob/master/keybert/model.py

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has a `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    use_maxsum: bool, optional (default=False)
        Whether to use Max Sum Similarity.
    use_mmr: bool, optional (default=False)
        Whether to use MMR.
    diversity: float, optional (default=0.5)
        The diversity of results between 0 and 1 if `use_mmr` is True.
    nr_candidates: int, optional (default=20)
        The number of candidates to consider if `use_maxsum` is True.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')
    if nr_candidates < top_k:
        raise ValueError('nr_candidates must be bigger than top_k')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    if use_mmr:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/mmr.py
        word_doc_similarity = cosine_similarity(vectors_keywords, vectors_string)
        word_similarity = cosine_similarity(vectors_keywords)
        keywords_idx = [np.argmax(word_doc_similarity)]
        candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
        for _ in range(top_k - 1):
            candidate_similarities = word_doc_similarity[candidates_idx, :]
            target_similarities = np.max(
                word_similarity[candidates_idx][:, keywords_idx], axis=1
            )
            # trade off relevance to the document against redundancy with
            # already-selected keywords
            mmr = (
                1 - diversity
            ) * candidate_similarities - diversity * target_similarities.reshape(
                -1, 1
            )
            mmr_idx = candidates_idx[np.argmax(mmr)]
            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)

        ranked_sentences = [
            (word_doc_similarity.reshape(1, -1)[0][idx], words[idx])
            for idx in keywords_idx
        ]

    elif use_maxsum:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/maxsum.py
        distances = cosine_similarity(vectors_string, vectors_keywords)
        distances_words = cosine_similarity(vectors_keywords, vectors_keywords)
        words_idx = list(distances.argsort()[0][-nr_candidates:])
        words_vals = [words[index] for index in words_idx]
        candidates = distances_words[np.ix_(words_idx, words_idx)]
        # among the candidate pool, pick the combination of `top_k` keywords
        # that are least similar to each other
        min_sim = 100_000
        candidate = None
        for combination in itertools.combinations(range(len(words_idx)), top_k):
            sim = sum(
                candidates[i][j]
                for i in combination
                for j in combination
                if i != j
            )
            if sim < min_sim:
                candidate = combination
                min_sim = sim

        ranked_sentences = [
            (distances[0][idx], words_vals[idx]) for idx in candidate
        ]

    else:
        distances = cosine_similarity(vectors_string, vectors_keywords)
        ranked_sentences = [
            (distances[0][index], words[index])
            for index in distances.argsort()[0]
        ][::-1]

    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
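# The MMR branch above balances relevance against redundancy. The toy example
# below is a self-contained illustration of that trade-off with hand-made
# vectors (numpy + scikit-learn only); all names here are illustrative and not
# part of the library API.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def _mmr_sketch(doc_vec, keyword_vecs, top_k=2, diversity=0.5):
    word_doc = cosine_similarity(keyword_vecs, doc_vec)  # relevance to document
    word_word = cosine_similarity(keyword_vecs)          # keyword-keyword redundancy
    selected = [int(np.argmax(word_doc))]
    candidates = [i for i in range(len(keyword_vecs)) if i != selected[0]]
    for _ in range(top_k - 1):
        relevance = word_doc[candidates, :]
        redundancy = np.max(word_word[candidates][:, selected], axis=1)
        mmr = (1 - diversity) * relevance - diversity * redundancy.reshape(-1, 1)
        best = candidates[int(np.argmax(mmr))]
        selected.append(best)
        candidates.remove(best)
    return selected

# two near-duplicate keywords and one distinct keyword: the second pick skips
# the near-duplicate of the first because of the diversity penalty
doc = np.array([[1.0, 1.0, 0.0]])
kws = np.array([[1.0, 0.9, 0.0], [1.0, 1.0, 0.1], [0.0, 1.0, 1.0]])
assert _mmr_sketch(doc, kws) == [0, 2]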
def attention(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    # score a candidate phrase by summing the attention weight of its words
    scores = [sum(d.get(w, 0) for w in k.split()) for k in vocab.keys()]
    total = sum(scores)

    ranked_sentences = sorted(
        [
            (scores[i] / total, s)
            for i, s in enumerate(vocab.keys())
            if vocab[s] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]
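# `model.attention([string])[0]` above is expected to return (token, weight)
# pairs; a candidate's score is the sum of its words' weights, normalised over
# all candidates. A hedged toy illustration with made-up attention weights:
from collections import defaultdict

_toy_attention = [('harga', 0.4), ('minyak', 0.35), ('naik', 0.2), ('lagi', 0.05)]
_d = defaultdict(float)
for _tok, _w in _toy_attention:
    _d[_tok] += _w

_candidates = ['harga minyak', 'naik lagi']
_scores = [sum(_d.get(w, 0) for w in c.split()) for c in _candidates]
_total = sum(_scores)
_ranked = sorted(((s / _total, c) for s, c in zip(_scores, _candidates)), reverse=True)
# -> [(0.75, 'harga minyak'), (0.25, 'naik lagi')]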
def similarity(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has a `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    distances = cosine_similarity(vectors_string, vectors_keywords)
    ranked_sentences = [
        (distances[0][index], words[index]) for index in distances.argsort()[0]
    ][::-1]
    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
def parse_bert_tagging(left, tokenizer, space_after_punct=False):
    left = transformer_textcleaning(left, space_after_punct=space_after_punct)
    bert_tokens = ['[CLS]'] + tokenizer.tokenize(left) + ['[SEP]']
    input_mask = [1] * len(bert_tokens)
    logging.debug(bert_tokens)
    return tokenizer.convert_tokens_to_ids(bert_tokens), input_mask, bert_tokens
def rake(
    string: str,
    model=None,
    top_k: int = 5,
    auto_ngram: bool = True,
    ngram_method: str = 'bow',
    ngram: Tuple[int, int] = (1, 1),
    atleast: int = 1,
    stop_words: List[str] = STOPWORDS,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using suitable ngrams.
        Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. Supported ngram generators:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    stop_words: list, (default=malaya.text.function.STOPWORDS)
        list of stop words to remove.

    Returns
    -------
    result: Tuple[float, str]
    """
    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in ('bow', 'skipgram'):
        raise ValueError("ngram_method must be in ['bow', 'skipgram']")
    if auto_ngram and not len(stop_words):
        raise ValueError('insert stop_words if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v
    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(
            string,
            ngram_method=ngram_method,
            ngram=ngram,
            stop_words=stop_words,
            **kwargs,
        )
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores
    )
    sortedKeywords = sorted(
        keywordcandidates.items(), key=operator.itemgetter(1), reverse=True
    )
    total = sum(i[1] for i in sortedKeywords)
    ranked_sentences = [
        (i[1] / total, i[0]) for i in sortedKeywords if vocab[i[0]] >= atleast
    ]
    return ranked_sentences[:top_k]
def attention(
    string: str,
    model,
    top_k: int = 5,
    auto_ngram: bool = True,
    ngram_method: str = 'bow',
    ngram: Tuple[int, int] = (1, 1),
    atleast: int = 1,
    stop_words: List[str] = STOPWORDS,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using suitable ngrams.
        Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. Supported ngram generators:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        minimum count of a candidate appearing in the string to be accepted.
    stop_words: list, (default=malaya.text.function.STOPWORDS)
        list of stop words to remove.

    Returns
    -------
    result: Tuple[float, str]
    """
    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in ('bow', 'skipgram'):
        raise ValueError("ngram_method must be in ['bow', 'skipgram']")
    if auto_ngram and not len(stop_words):
        raise ValueError('insert stop_words if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(
            string,
            ngram_method=ngram_method,
            ngram=ngram,
            stop_words=stop_words,
            **kwargs,
        )

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    scores = [sum(d.get(w, 0) for w in k.split()) for k in vocab.keys()]
    total = sum(scores)

    ranked_sentences = sorted(
        [
            (scores[i] / total, s)
            for i, s in enumerate(vocab.keys())
            if vocab[s] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]