from typing import Dict

from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer
from overrides import overrides


class TatoebaSentenceReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False):
        super().__init__(lazy=lazy)
        self.tokenizer = CharacterTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens, label=None):
        fields = {}
        fields['tokens'] = TextField(tokens, self.token_indexers)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

    @overrides
    def _read(self, file_path: str):
        with open(file_path, "r") as text_file:
            for line in text_file:
                # Each line is a language ID and a sentence, separated by a tab
                lang_id, sent = line.rstrip().split('\t')
                tokens = self.tokenizer.tokenize(sent)
                yield self.text_to_instance(tokens, lang_id)
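# A minimal usage sketch (an assumption, not part of the original): read a
# Tatoeba-style TSV file where each line is "lang_id<TAB>sentence" and build
# a vocabulary from the resulting instances. The file path is hypothetical.
from allennlp.data.vocabulary import Vocabulary

reader = TatoebaSentenceReader()
instances = reader.read('data/tatoeba/sentences.tsv')  # hypothetical path
vocab = Vocabulary.from_instances(instances)
print(vocab.get_vocab_size('tokens'))  # number of distinct characters observed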
from typing import Iterable

import pandas as pd
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer


class CharDatasetReader(DatasetReader):
    def __init__(self) -> None:
        super().__init__(lazy=False)
        # TODO: could become args
        # 'tokens' is the namespace we're using
        self._token_indexers = {'tokens': SingleIdTokenIndexer()}
        self._tokenizer = CharacterTokenizer()

    def text_to_instance(self, sentence: str) -> Instance:
        tokenized = self._tokenizer.tokenize(sentence)
        # TODO: do you want to add "source" and "target" here?
        return Instance({"source": TextField(tokenized, self._token_indexers)})

    def _read(self, csv_file: str) -> Iterable[Instance]:
        df = pd.read_csv(csv_file)
        # Keep only the rows whose category is 'title'
        df_titles = df[df.category == 'title']
        for row in df_titles.itertuples(index=False):
            yield self.text_to_instance(row.text)
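# A quick usage sketch under stated assumptions: build a tiny CSV with the
# 'category' and 'text' columns that _read expects, then read it back.
# The file name 'titles.csv' is hypothetical.
import pandas as pd

pd.DataFrame([
    {'category': 'title', 'text': "A Midsummer Night's Dream"},
    {'category': 'body', 'text': 'Lord, what fools these mortals be!'},
]).to_csv('titles.csv', index=False)

reader = CharDatasetReader()
instances = list(reader.read('titles.csv'))
print(len(instances))  # 1 -- only the 'title' row passes the category filter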
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer


# LstmClassifier is assumed to be defined elsewhere in the surrounding code
def classify(text: str, model: LstmClassifier):
    tokenizer = CharacterTokenizer()
    token_indexers = {'tokens': SingleIdTokenIndexer()}

    tokens = tokenizer.tokenize(text)
    instance = Instance({'tokens': TextField(tokens, token_indexers)})
    # Run the model on a single instance and pick the highest-scoring label
    logits = model.forward_on_instances([instance])[0]['logits']
    label_id = np.argmax(logits)
    label = model.vocab.get_token_from_index(label_id, 'labels')
    print('text: {}, label: {}'.format(text, label))
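# Example call (a sketch): assumes `model` is a trained LstmClassifier whose
# vocabulary contains a 'labels' namespace of language IDs. The input sentence
# and printed label are illustrative only.
classify('Dies ist ein Beispiel.', model)  # might print: text: ..., label: deu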
import re
from typing import List, Set

from allennlp.data import Token
from allennlp.data.tokenizers import CharacterTokenizer


def read_shakespeare(all_chars: Set[str] = None) -> List[List[Token]]:
    tokenizer = CharacterTokenizer()
    sentences = []
    with open('data/shakespeare/hamlet.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Collapse runs of spaces into a single space
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                # Drop any characters outside the allowed set
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)
    return sentences
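# Usage sketch (assumes data/shakespeare/hamlet.txt exists at that path):
sentences = read_shakespeare()
print(len(sentences), 'non-empty lines read')
print(''.join(token.text for token in sentences[0]))  # first line, reassembled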