def _predict(self, strings, beam_search=True):
    """Run the decoder graph over `strings` and decode the output ids.

    Parameters
    ----------
    strings : list of str when `self._translation_model` is set; otherwise
        whatever `self._encoder.encode` accepts directly — TODO confirm
        against callers.
    beam_search : bool, (optional=True)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result : list of decoded strings (translation model) or the raw
        `self._encoder.decode` output.
    """
    if self._translation_model:
        # Clean each string, encode it, and append the EOS id (1).
        encoded = [
            self._encoder.encode(translation_textcleaning(string)) + [1]
            for string in strings
        ]
    else:
        encoded = self._encoder.encode(strings)
    batch_x = pad_sentence_batch(encoded, 0)[0]
    output = 'beam' if beam_search else 'greedy'
    r = self._execute(
        inputs=[batch_x],
        input_labels=['Placeholder'],
        output_labels=[output],
    )
    p = r[output].tolist()
    if self._translation_model:
        # Strip PAD (0) and EOS (1) ids before decoding each row.
        result = [
            self._encoder.decode([i for i in row if i not in (0, 1)])
            for row in p
        ]
    else:
        result = self._encoder.decode(p)
    return result
def predict(self, strings: List[str], beam_search: bool = False):
    """
    Convert to target strings.

    Parameters
    ----------
    strings : List[str]
    beam_search : bool, (optional=False)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result: List[str]
    """
    output = 'beam' if beam_search else 'greedy'
    # Map each character through the source-side dictionary and append EOS (1).
    batch = [
        [self._left_dict[c] for c in self._cleaning(string, self._left_dict)]
        + [1]
        for string in strings
    ]
    batch = pad_sentence_batch(batch, 0)[0]
    r = self._execute(
        inputs=[batch],
        input_labels=['Placeholder'],
        output_labels=[output],
    )
    # NOTE: the original comprehension reused the name `r` for each row,
    # shadowing the execute-result dict above; `row` avoids that.
    # ids <= 3 are special tokens (PAD etc. — presumably; verify vocab) and
    # are dropped before mapping back to target-side characters.
    results = [
        ''.join(self._rev_right_dict[i] for i in row if i > 3)
        for row in r[output]
    ]
    return results
def stem(self, string: str, beam_search: bool = False):
    """
    Stem a string, this also include lemmatization.

    Parameters
    ----------
    string : str
    beam_search : bool, (optional=False)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result: str
    """
    tokenized = self._tokenizer(string)
    result, batch, actual, mapping = [], [], [], {}
    for no, word in enumerate(tokenized):
        lowered = word.lower()
        # Punctuation passes through untouched.
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            continue
        # Money / date / time / hashtag / url / user tokens are kept as-is.
        if any(
            re.findall(pattern, lowered)
            for pattern in (
                _money,
                _date,
                _expressions['time'],
                _expressions['hashtag'],
                _expressions['url'],
                _expressions['user'],
            )
        ):
            result.append(word)
            continue
        # Everything else is queued for the model; remember where it goes.
        mapping[len(batch)] = no
        result.append('REPLACE-ME')
        actual.append(word)
        batch.append(lowered)
    if len(batch):
        encoded = self._bpe.bpe.encode(batch, output_type=self._bpe.mode)
        encoded = [ids + [1] for ids in encoded]
        encoded = pad_sentence_batch(encoded, 0)[0]
        label = 'beam' if beam_search else 'greedy'
        r = self._execute(
            inputs=[encoded],
            input_labels=['Placeholder'],
            output_labels=[label],
        )
        rows = r[label].tolist()
        for idx, row in enumerate(rows):
            # Dedupe ids preserving order, decode, strip special tokens,
            # then restore the original word's casing.
            deduped = list(dict.fromkeys(row))
            decoded = (
                self._bpe.bpe.decode(deduped)[0]
                .replace('<EOS>', '')
                .replace('<PAD>', '')
            )
            result[mapping[idx]] = case_of(actual[idx])(decoded)
    return ' '.join(result)
def stem(self, string: str, beam_search: bool = True):
    """
    Stem a string.

    Parameters
    ----------
    string : str
    beam_search : bool, (optional=True)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    result: str
    """
    tokenized = self._tokenizer(string)
    result, batch, actual, mapping = [], [], [], {}
    for no, word in enumerate(tokenized):
        lowered = word.lower()
        # Punctuation passes through untouched.
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            continue
        # Money / date / time / hashtag / url / user tokens are kept as-is.
        if any(
            re.findall(pattern, lowered)
            for pattern in (
                _money,
                _date,
                _expressions['time'],
                _expressions['hashtag'],
                _expressions['url'],
                _expressions['user'],
            )
        ):
            result.append(word)
            continue
        # Everything else is queued for the model; remember where it goes.
        mapping[len(batch)] = no
        result.append('REPLACE-ME')
        actual.append(word)
        batch.append(lowered)
    if len(batch):
        encoded = self._bpe.encode(batch, output_type=self._subword_mode)
        padded = pad_sentence_batch([ids + [1] for ids in encoded], 0)[0]
        node = self._beam if beam_search else self._greedy
        rows = self._sess.run(node, feed_dict={self._X: padded}).tolist()
        for idx, row in enumerate(rows):
            # Dedupe ids preserving order, decode, strip EOS, then restore
            # the original word's casing.
            deduped = list(dict.fromkeys(row))
            decoded = self._bpe.decode(deduped)[0].replace('<EOS>', '')
            result[mapping[idx]] = case_of(actual[idx])(decoded)
    return ' '.join(result)
def _true_case(self, strings, beam_search=True):
    """Run the true-casing graph on encoded `strings` and decode the result.

    Parameters
    ----------
    strings : input accepted by `self._encoder.encode` — presumably a list
        of str; verify against callers.
    beam_search : bool, (optional=True)
        If True, use the beam-search output node, else the greedy one.

    Returns
    -------
    decoded output of `self._encoder.decode`.
    """
    encoded = self._encoder.encode(strings)
    node = self._beam if beam_search else self._greedy
    batch_x = pad_sentence_batch(encoded, 0)[0]
    predicted = self._sess.run(node, feed_dict={self._X: batch_x}).tolist()
    return self._encoder.decode(predicted)
def _paraphrase(self, strings, beam_search=True):
    """Paraphrase `strings` through the decoder graph.

    Parameters
    ----------
    strings : list of str
    beam_search : bool, (optional=True)
        If True, use the beam-search output node, else the greedy one.

    Returns
    -------
    result : list of str, one decoded paraphrase per input string.
    """
    # Encode each string and append the EOS id (1).
    encoded = [self._tokenizer.encode(string) + [1] for string in strings]
    node = self._beam if beam_search else self._greedy
    batch_x = pad_sentence_batch(encoded, 0)[0]
    p = self._sess.run(node, feed_dict={self._X: batch_x}).tolist()
    # Strip PAD (0) and EOS (1) ids before decoding each row.
    return [
        self._tokenizer.decode([i for i in row if i not in (0, 1)])
        for row in p
    ]
def _predict(self, strings, beam_search=True):
    """Map `strings` character-by-character through the model's dictionary,
    run the decoder graph, and map the output ids back to characters.

    Parameters
    ----------
    strings : list of str
    beam_search : bool, (optional=True)
        If True, use beam search decoder, else use greedy decoder.

    Returns
    -------
    results : list of str
    """
    # Map each cleaned character to its id and append EOS (1).
    encoded = [
        [self._left_dict[c] for c in self._cleaning(string, self._left_dict)]
        + [1]
        for string in strings
    ]
    batch_x = pad_sentence_batch(encoded, 0)[0]
    output = 'beam' if beam_search else 'greedy'
    r = self._execute(
        inputs=[batch_x],
        input_labels=['Placeholder'],
        output_labels=[output],
    )
    # NOTE: the original comprehension reused the name `r` for each row,
    # shadowing the execute-result dict above; `row` avoids that.
    # ids <= 3 are special tokens (PAD etc. — presumably; verify vocab).
    results = [
        ''.join(self._rev_left_dict[i] for i in row if i > 3)
        for row in r[output]
    ]
    return results
def stem(self, string: str):
    """
    Stem a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    string: stemmed string
    """
    token_strings = classification_textcleaning(string, True).split()
    idx = stemmer_str_idx(token_strings, self._dicts['dictionary_from'])
    predicted = self._sess.run(
        self._logits,
        feed_dict={self._x: pad_sentence_batch(idx, PAD)[0]},
    )
    # Hoist the special-token ids out of the inner filter; join each word's
    # surviving ids via the reverse target dictionary.
    special = (GO, PAD, EOS, UNK)
    results = [
        ''.join(
            self._dicts['rev_dictionary_to'][c]
            for c in word
            if c not in special
        )
        for word in predicted
    ]
    return ' '.join(results)