def test_array_stream(self):
    """stream_array yields each top-level element of a JSON array in order.

    Covers the empty array, a flat array of strings, and arrays that
    contain a nested array (trailing and middle positions).
    """
    # NOTE(review): a second test_array_stream is defined later in this
    # file and shadows this one at class-definition time -- consider
    # merging the two.
    cases = (
        ('[]', []),
        ('["People", "Places", "Things"]',
         ["People", "Places", "Things"]),
        ('["Apples", "Bananas", ["Pears", "Limes"]]',
         ["Apples", "Bananas", ["Pears", "Limes"]]),
        ('["Apples", ["Pears", "Limes"], "Bananas"]',
         ["Apples", ["Pears", "Limes"], "Bananas"]),
    )
    for document, expected in cases:
        streamed = stream_array(tokenize(StringIO(document)))
        self.assertListEqual(list(streamed), expected)
def test_array_stream(self):
    """stream_array yields each top-level element of a JSON array in order.

    Covers the empty array, a flat array, nested arrays in trailing and
    middle positions, and a nested object element.
    """
    for document, expected in (
            ('[]', []),
            ('["People", "Places", "Things"]',
             ["People", "Places", "Things"]),
            ('["Apples", "Bananas", ["Pears", "Limes"]]',
             ["Apples", "Bananas", ["Pears", "Limes"]]),
            ('["Apples", ["Pears", "Limes"], "Bananas"]',
             ["Apples", ["Pears", "Limes"], "Bananas"]),
            ('["Apples", {"key":"value"}, "Bananas"]',
             ["Apples", {"key": "value"}, "Bananas"]),
    ):
        streamed = stream_array(tokenize(StringIO(document)))
        self.assertListEqual([element for element in streamed], expected)
def test_sequence(self):
    """End-to-end tokenizer check over whole documents.

    Token tuples are (type_code, value); the codes observed here are
    0 = operator/punctuation, 1 = string, 2 = number, 3 = boolean,
    4 = null.  Also verifies that whitespace between tokens is optional
    and that adjacent tokens without a separator raise ValueError.
    """
    # Mixed literals and punctuation, separated only where needed.
    result = [token for token in tokenize(StringIO("123 \"abc\":{}"))]
    self.assertEqual(result, [(2, 123), (1, 'abc'), (0, ':'), (0, '{'), (0, '}')])
    # Borrowed from http://en.wikipedia.org/wiki/JSON
    big_file = """{ "firstName": "John", "lastName": "Smith", "isAlive": true, "age": 25, "height_cm": 167.6, "address": { "streetAddress": "21 2nd Street", "city": "New York", "state": "NY", "postalCode": "10021-3100" }, "phoneNumbers": [ { "type": "home", "number": "212 555-1234" }, { "type": "office", "number": "646 555-4567" } ], "children": [], "spouse": null }"""
    result = [token for token in tokenize(StringIO(big_file))]
    # Full expected token stream for the document above.
    expected = [(0, '{'), (1, 'firstName'), (0, ':'), (1, 'John'), (0, ','),
                (1, 'lastName'), (0, ':'), (1, 'Smith'), (0, ','),
                (1, 'isAlive'), (0, ':'), (3, True), (0, ','),
                (1, 'age'), (0, ':'), (2, 25), (0, ','),
                (1, 'height_cm'), (0, ':'), (2, 167.6), (0, ','),
                (1, 'address'), (0, ':'), (0, '{'),
                (1, 'streetAddress'), (0, ':'), (1, '21 2nd Street'), (0, ','),
                (1, 'city'), (0, ':'), (1, 'New York'), (0, ','),
                (1, 'state'), (0, ':'), (1, 'NY'), (0, ','),
                (1, 'postalCode'), (0, ':'), (1, '10021-3100'), (0, '}'), (0, ','),
                (1, 'phoneNumbers'), (0, ':'), (0, '['), (0, '{'),
                (1, 'type'), (0, ':'), (1, 'home'), (0, ','),
                (1, 'number'), (0, ':'), (1, '212 555-1234'), (0, '}'), (0, ','),
                (0, '{'), (1, 'type'), (0, ':'), (1, 'office'), (0, ','),
                (1, 'number'), (0, ':'), (1, '646 555-4567'), (0, '}'), (0, ']'), (0, ','),
                (1, 'children'), (0, ':'), (0, '['), (0, ']'), (0, ','),
                (1, 'spouse'), (0, ':'), (4, None), (0, '}')]
    self.assertListEqual(result, expected)
    # The same document with all inter-token whitespace removed must
    # produce the identical token stream.
    big_file_no_space = '{"firstName":"John","lastName":"Smith","isAlive":true,"age":25,"height_cm":167.6,"addres' \
                        's":{"streetAddress":"21 2nd Street","city":"New York","state":"NY","postalCode":"10021-3' \
                        '100"},"phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":' \
                        '"646 555-4567"}],"children":[],"spouse":null}'
    result = [token for token in tokenize(StringIO(big_file_no_space))]
    self.assertListEqual(result, expected)
    # A comma is a sufficient separator between two numbers.
    result = [token for token in tokenize(StringIO("854.6,123"))]
    self.assertEqual(result, [(2, 854.6), (0, ','), (2, 123)])
    # Two literals with no separator at all are malformed.
    self.assertRaises(ValueError, self.tokenize_sequence, "123\"text\"")
    self.assertRaises(ValueError, self.tokenize_sequence, "23.9e10true")
    self.assertRaises(ValueError, self.tokenize_sequence, "\"test\"56")
def assertStringEquals(self, expected, actual):
    """Assert that tokenizing the quoted form of *actual* yields exactly
    one STRING token whose value equals *expected*."""
    tokens = list(tokenize(StringIO('"{}"'.format(actual))))
    self.assertEqual(1, len(tokens))
    token_type, value = tokens[0]
    self.assertEqual(expected, value)
    self.assertEqual(token_type, TOKEN_TYPE.STRING)
def test_array_stream_of_documents_with_incomplete_json(self):
    """A truncated trailing document raises ValueError, but every complete
    document seen before the truncation is still streamed out."""
    collected = []
    with self.assertRaises(ValueError):
        for document in stream_array(
                tokenize(StringIO('[{"key": "value"}, {"key": "value"}, {"INCOMPLETE'))):
            collected.append(document)
    # Only the two complete documents made it out before the error.
    self.assertListEqual(collected, [{"key": "value"}, {"key": "value"}])
def load_data(filename):
    """Build train/validation/test splits for a question-vs-answer classifier.

    Streams a JSON array of documents from *filename*; each document is
    expected to carry 'question' and 'answer' keys (TODO: confirm schema
    against the producer).  Every question is labeled 1 and the first
    sentence of the corresponding answer is labeled 0.

    Args:
        filename: path to a JSON file containing an array of documents.

    Returns:
        Tuple (train_df, val_df, test_df, classes): three DataFrames with
        columns 'x' (text) and 'y' (label), plus np.array([0, 1]).
    """
    print('Loading data from:', filename)
    x = []
    y = []
    nlp = spacy.load('en_core_web_sm')

    # noinspection SpellCheckingInspection
    def handle_message(item):
        # The question itself is a positive example.
        x.append(item['question'])
        y.append(1)
        # The first sentence of the answer is a negative example.
        doc = nlp(item['answer'])
        first_sent = next(doc.sents)
        # Span.text works on spaCy 2.x and 3.x; Span.string was removed in
        # spaCy 3.  Any trailing-whitespace difference is stripped anyway.
        x.append(first_sent.text.strip())
        y.append(0)

    # Explicit encoding: JSON is UTF-8 regardless of the platform locale.
    with open(filename, 'r', encoding='utf-8') as f:
        print('Processing stream...')
        for message in stream_array(tokenize(f)):
            handle_message(message)

    # 10% held out for test, then 10% of the remainder for validation.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, shuffle=True)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.1, shuffle=True)
    train_df = pd.DataFrame({'x': x_train, 'y': y_train})
    val_df = pd.DataFrame({'x': x_val, 'y': y_val})
    test_df = pd.DataFrame({'x': x_test, 'y': y_test})
    classes = np.array([0, 1])
    print('Lengths Train: {}, Val: {}, Test: {}, Classes: {}'.format(
        len(train_df), len(val_df), len(test_df), len(classes)))
    return train_df, val_df, test_df, classes
def tokenize_sequence(self, string):
    """Tokenize *string* fully and return the tokens as a list."""
    return list(tokenize(StringIO(string)))
def assertStringEquals(self, expected, actual):
    """Check that the quoted form of *actual* tokenizes to a single
    STRING token with value *expected*."""
    # NOTE(review): an identical helper is defined earlier in this file;
    # the two could be deduplicated.
    token_list = list(tokenize(StringIO('"{}"'.format(actual))))
    self.assertEqual(1, len(token_list))
    ttype, token_value = token_list[0]
    self.assertEqual(expected, token_value)
    self.assertEqual(ttype, TOKEN_TYPE.STRING)
def test_array_stream_of_values_with_extra_invalid_json(self):
    """Junk after the closing bracket does not disturb the streamed values."""
    streamed = stream_array(tokenize(StringIO('["People", "Places"], EXTRA')))
    self.assertListEqual([value for value in streamed], ["People", "Places"])
def test_array_stream_of_values_with_incomplete_json(self):
    """An unterminated trailing string yields only the complete values."""
    token_stream = tokenize(StringIO('["People", "Places", "INCOMPLETE'))
    self.assertListEqual(list(stream_array(token_stream)),
                         ["People", "Places"])
def test_array_stream_of_documents_with_extra_invalid_json(self):
    """Content after a closed array of documents is ignored."""
    token_stream = tokenize(
        StringIO('[{"key": "value"}, {"key": "value"}] EXTRA'))
    self.assertListEqual(list(stream_array(token_stream)),
                         [{"key": "value"}, {"key": "value"}])
def test_array_stream_of_documents(self):
    """A well-formed array of objects streams each object as a dict."""
    token_stream = tokenize(StringIO('[{"key": "value"}, {"key": "value"}]'))
    documents = [doc for doc in stream_array(token_stream)]
    self.assertListEqual(documents, [{"key": "value"}, {"key": "value"}])