Example #1
from typing import Dict, List

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, Token
from overrides import overrides


class TatoebaSentenceReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer]=None, lazy=False):
        super().__init__(lazy=lazy)
        self.tokenizer = CharacterTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens: List[Token], label: str = None):
        fields = {}

        fields['tokens'] = TextField(tokens, self.token_indexers)
        # label can be omitted so the same reader works at prediction time
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields)

    @overrides
    def _read(self, file_path: str):
        with open(file_path, "r") as text_file:
            for line in text_file:
                # each line is expected to be "lang_id<TAB>sentence"
                lang_id, sent = line.rstrip().split('\t')

                tokens = self.tokenizer.tokenize(sent)

                yield self.text_to_instance(tokens, lang_id)
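
A minimal usage sketch; the TSV path below is hypothetical, and the file layout ("lang_id<TAB>sentence" per line) is inferred from the split in _read:

reader = TatoebaSentenceReader()
instances = reader.read('data/tatoeba/sentences.train.tsv')  # hypothetical path
print(instances[0])  # Instance with a 'tokens' TextField and a 'label' LabelField
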
Example #2
from typing import Iterable

import pandas as pd

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer


class CharDatasetReader(DatasetReader):
    def __init__(self) -> None:
        super().__init__(lazy=False)
        # todo: could become args
        self._token_indexers = {
            'tokens': SingleIdTokenIndexer()
        }  # 'tokens' is the namespace we're using
        self._tokenizer = CharacterTokenizer()

    def text_to_instance(
        self,
        sentence: str,
    ) -> Instance:

        tokenized = self._tokenizer.tokenize(sentence)
        # TODO: do you want to add "source" and "target" here?
        instance = Instance(
            {"source": TextField(tokenized, self._token_indexers)})
        return instance

    def _read(self, csv_file: str) -> Iterable[Instance]:

        df = pd.read_csv(csv_file)
        df_titles = df[df.category == 'title']

        for row in df_titles.itertuples(index=False):
            instance = self.text_to_instance(row.text)
            yield instance
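
A usage sketch; the CSV file name is hypothetical, and the reader only assumes the file has 'category' and 'text' columns:

reader = CharDatasetReader()
instances = reader.read('titles.csv')  # hypothetical CSV with 'category' and 'text' columns
for instance in instances[:3]:
    print(instance.fields['source'].tokens)
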
Example #3
import numpy as np

from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer


# LstmClassifier is the project's own model class, defined elsewhere
def classify(text: str, model: LstmClassifier):
    tokenizer = CharacterTokenizer()
    token_indexers = {'tokens': SingleIdTokenIndexer()}

    tokens = tokenizer.tokenize(text)
    instance = Instance({'tokens': TextField(tokens, token_indexers)})
    # run the model on a single instance and take its output logits
    logits = model.forward_on_instances([instance])[0]['logits']
    label_id = np.argmax(logits)
    label = model.vocab.get_token_from_index(label_id, 'labels')

    print('text: {}, label: {}'.format(text, label))
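
Calling it might look like the sketch below; model construction and training are elided, since classify only assumes a trained model with forward_on_instances and a 'labels' vocabulary namespace:

# `model` is a trained LstmClassifier (training code not shown)
classify('Hello, how are you?', model)
classify('Hola, ¿cómo estás?', model)
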
Example #4
import re
from typing import List, Set

from allennlp.data.tokenizers import CharacterTokenizer, Token


def read_shakespeare(all_chars: Set[str] = None) -> List[List[Token]]:
    tokenizer = CharacterTokenizer()
    sentences = []
    with open('data/shakespeare/hamlet.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # collapse runs of spaces into a single space
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)

    return sentences
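
A usage sketch; the character whitelist below is purely illustrative, and the function itself hard-codes the 'data/shakespeare/hamlet.txt' path:

# restrict the corpus to a whitelist of characters (illustrative set)
all_chars = set("abcdefghijklmnopqrstuvwxyz .,!?'-")
sentences = read_shakespeare(all_chars)
print(sentences[0])  # a list of single-character Tokens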