def test_parse_review_paginate_logs():
    """A review-paginate log is parsed into a single page row carrying review metadata."""
    nlp_service = MockNLPService()
    df = pd.DataFrame([{
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:34:52Z",
        "type": "review-paginate",
        "payload": "{\"type\":\"review-paginate\",\"reviewId\":\"test-review\",\"content\":[\"hello\",\"world\"],\"sids\":[\"first\",\"second\"],\"time\":64300,\"step\":\"last-words\",\"wordUnknowns\":[],\"targetWords\":[\"\"]}"
    }])
    df = parse_review_paginate_logs(df, nlp_service)
    df = parse_paginate_logs(df)
    out = pd.DataFrame([{
        "time": 1584718492,
        "userId": "test",
        "from": "review",
        "rcId": "test-review",
        "eltime": 64.3,
        "sids": ["first", "second"],
        "pos": ["NN", "NN"],
        "words": ["hello", "world"],
        "unknownWords": [],
        "unknownIndices": []
    }])
    pd.testing.assert_frame_equal(df, out)

def test_parse_ordinary_paginate_logs():
    """Ordinary paginate payloads resolve sentence-local word indices into page-global ones."""
    nlp_service = MockNLPService()
    content = "hello world don't\"believe\""
    book = Book(Metadata("test", "test", "", "test", ""), [
        Chapter("test", "test", "test", [
            Sentence("test", False, content),
            Sentence("test2", False, content),
        ])
    ])
    service = BookService([book])
    df = pd.DataFrame([{
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:34:52Z",
        "type": "paginate",
        "payload": "{\"type\":\"paginate\",\"sids\":[\"test\",\"test2\"],\"time\":64300,\"wordUnknowns\":[{\"word\":\"believe\",\"wordIndex\":3,\"sentenceId\":\"test\",\"time\":18500},{\"word\":\"hello\",\"wordIndex\":0,\"sentenceId\":\"test2\",\"time\":26000}],\"sentenceUnknowns\":[],\"bookId\":\"test\",\"chapterId\":\"test\"}"
    }])
    df = parse_ordinary_paginate_logs(df, nlp_service, service)
    df = parse_paginate_logs(df)
    out = pd.DataFrame([{
        "time": 1584718492,
        "userId": "test",
        "from": "reader",
        "rcId": "test",
        "eltime": 64.3,
        "sids": ["test", "test2"],
        "pos": ["NN", "NN", "NN", "NN", "NN", "NN", "NN", "NN"],
        "words": [
            "hello", "world", "don't", "believe",
            "hello", "world", "don't", "believe"
        ],
        "unknownWords": ["believe", "hello"],
        "unknownIndices": [3, 4]
    }])
    pd.testing.assert_frame_equal(df, out)
    # The unknown indices point at the page-global token positions.
    assert df.iloc[0]['words'][3] == "believe"
    assert df.iloc[0]['words'][4] == "hello"

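# Illustrative sanity check (not part of the original tests): the expected
# unknownIndices above can be derived by offsetting each sentence-local wordIndex
# by the token count of the preceding sentences. The four-tokens-per-sentence
# figure assumes the mock tokenizer splits "hello world don't\"believe\"" into
# ['hello', 'world', "don't", 'believe'].
def _global_unknown_index(sentence_position, word_index, tokens_per_sentence=4):
    # Tokens of earlier sentences come first in the flattened page word list.
    return sentence_position * tokens_per_sentence + word_index


assert _global_unknown_index(0, 3) == 3  # "believe" in sentence "test"
assert _global_unknown_index(1, 0) == 4  # "hello" in sentence "test2"
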
def test_parse_word_unknowns_invalid():
    """A word unknown whose index does not match the reported word should raise an exception."""
    nlp_service = MockNLPService()
    content = "hello world don't\"believe\""
    word_unknowns = [{
        'sentenceId': 'test',
        'word': 'hello',
        'wordIndex': 0,
        'time': 10
    }, {
        'sentenceId': 'test',
        'word': 'believe',
        'wordIndex': 2,
        'time': 10
    }]
    book = Book(
        Metadata("test", "test", "", "test", ""),
        [Chapter("test", "test", "test", [Sentence("test", False, content)])])
    with pytest.raises(Exception):
        parse_word_unknowns(nlp_service, book, ["test"], word_unknowns)

def test_parse_word_unknowns_simple():
    """parse_word_unknowns maps per-sentence unknowns onto the flat token list."""
    nlp_service = MockNLPService()
    word_unknowns = [{
        'sentenceId': 'test',
        'word': 'hello',
        'wordIndex': 0,
        'time': 10
    }, {
        'sentenceId': 'test',
        'word': 'believe',
        'wordIndex': 3,
        'time': 10
    }]
    result = parse_word_unknowns(nlp_service, ["test"],
                                 ["hello world don't\"believe\""], word_unknowns)
    assert result['unknownIndices'] == [0, 3]
    assert result['unknownWords'] == ['hello', 'believe']
    assert result['words'] == ['hello', 'world', "don't", 'believe']

def test_clean_signals_df():
    """After cleaning, only the 'world2' signal rows are expected to remain."""
    nlp_service = MockNLPService()
    df = pd.DataFrame([{
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "hello",
        "signal": 1.0,
    }, {
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world2",
        "signal": 1.0,
    }, {
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world2",
        "signal": 0.0,
    }, {
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world3",
        "signal": 1.0,
    }])
    df = clean_signals_df(df, nlp_service)
    df2 = pd.DataFrame([{
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world2",
        "signal": 1.0,
    }, {
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world2",
        "signal": 0.0,
    }])
    pd.testing.assert_frame_equal(
        df.sort_index(axis=1).reset_index(drop=True),
        df2.sort_index(axis=1))

def test_preprocess_paginate_logs():
    """Two paginate events nine seconds apart are clustered into a single page row."""
    config = Config(cluster_threshold=1,
                    max_session_hours=12,
                    cheat_eltime_threshold=12,
                    filter_wpm_threshold=1000,
                    word2vec_k=0,
                    last_session_after_hours=0,
                    skip_session_hours=0)
    nlp_service = MockNLPService()
    content = "hello world don't\"believe\""
    book = Book(Metadata("test", "test", "", "test", ""), [
        Chapter("test", "test", "test", [
            Sentence("test", False, content),
            Sentence("test2", False, content),
        ])
    ])
    service = BookService([book])
    df = pd.DataFrame([{
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:34:52Z",
        "type": "paginate",
        "payload": "{\"type\":\"paginate\",\"sids\":[\"test\",\"test2\"],\"time\":64300,\"wordUnknowns\":[{\"word\":\"believe\",\"wordIndex\":3,\"sentenceId\":\"test\",\"time\":18500},{\"word\":\"hello\",\"wordIndex\":0,\"sentenceId\":\"test2\",\"time\":26000}],\"sentenceUnknowns\":[],\"bookId\":\"test\",\"chapterId\":\"test\"}"
    }, {
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:35:01Z",
        "type": "paginate",
        "payload": "{\"type\":\"paginate\",\"sids\":[\"test\",\"test2\"],\"time\":64300,\"wordUnknowns\":[{\"word\":\"believe\",\"wordIndex\":3,\"sentenceId\":\"test\",\"time\":18500},{\"word\":\"hello\",\"wordIndex\":0,\"sentenceId\":\"test2\",\"time\":26000}],\"sentenceUnknowns\":[],\"bookId\":\"test\",\"chapterId\":\"test\"}"
    }])
    df = preprocess_paginate_logs(df, nlp_service, service, config)
    df2 = pd.DataFrame([{
        "pageId": 0,
        "time": 1584718492,
        "userId": "test",
        "session": 0,
        "eltime": 128.6,
        "cheat": True,
        "wpm": 3.7325,
        "from": "reader",
        "rcId": "test",
        "sids": ["test", "test2"],
        "pos": ["NN", "NN", "NN", "NN", "NN", "NN", "NN", "NN"],
        "words": [
            "hello", "world", "don't", "believe",
            "hello", "world", "don't", "believe"
        ],
        "unknownWords": ["believe", "hello", "believe", "hello"],
        "unknownIndices": [3, 4, 3, 4]
    }])
    pd.testing.assert_frame_equal(df.sort_index(axis=1), df2.sort_index(axis=1))

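# Illustrative check (an assumption, not taken from the original file): the
# expected eltime and wpm above appear to follow from clustering the two paginate
# events, nine seconds apart, into one page. eltime sums the two 64.3 s payload
# times, and wpm divides the eight page words by that elapsed time in minutes,
# rounded to four decimals.
def _expected_page_stats(payload_times_ms, word_count):
    eltime = sum(payload_times_ms) / 1000.0       # elapsed seconds on the page
    wpm = round(word_count / (eltime / 60.0), 4)  # words read per minute
    return eltime, wpm


assert _expected_page_stats([64300, 64300], 8) == (128.6, 3.7325)
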
def test_predict_vocab():
    """predict_vocab splits signal words into two frames using the model's predictions."""
    nlp_service = MockNLPService()
    signals = pd.DataFrame([{
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "hello",
        "signal": 1.0,
    }, {
        "pageId": 0,
        "time": 60,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "hello",
        "signal": 1.0,
    }, {
        "pageId": 0,
        "time": 30,
        "userId": "test",
        "session": 0,
        "cheat": True,
        "wpm": 30.0,
        "pos": "NN",
        "word": "world",
        "signal": 0.0,
    }])
    features = pd.DataFrame([{
        "pageId": 0,
        'oword': 'world',
        'word': 'world',
        'otime': 40,
        'signal': 1.0,
        'diff': (400000.0 / (60 * 60 * 24)),
        'csignal': (2 / 3),
        'count': 3,
        "wpm": 10.0,
        'pos': "NN",
        'time': 800000,
        'userId': 'test'
    }])

    class MockClassifier:
        def predict(self, x):
            # Predict label 1 for the single feature row.
            return np.array([1])

    class MockModel:
        def get_classifier(self):
            return MockClassifier()

    ndf, udf = predict_vocab(features, signals, MockModel(), nlp_service)
    ndf2 = pd.DataFrame([{
        "pageId": 0,
        'userId': 'test',
        'word': 'hello',
        'oword': 'hello',
        'time': 60
    }])
    udf2 = pd.DataFrame([{
        "pageId": 0,
        'userId': 'test',
        'word': 'world',
        'oword': 'world',
        'time': 40
    }])
    pd.testing.assert_frame_equal(ndf.sort_index(axis=1),
                                  ndf2.sort_index(axis=1))
    pd.testing.assert_frame_equal(udf.sort_index(axis=1),
                                  udf2.sort_index(axis=1))

def test_prepare_numeric_features2():
    """Each repeated word occurrence yields features built from its earlier signals."""
    nlp_service = MockNLPService()
    signals_df = pd.DataFrame([
        {
            "pageId": 0,
            "time": 1200030,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": "hey2",
            "signal": 0.0,
        },
        {
            "pageId": 0,
            "time": 400030,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": "hey",
            "signal": 0.0,
        },
        {
            "pageId": 0,
            "time": 30,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": "hey",
            "signal": 1.0,
        },
        {
            "pageId": 0,
            "time": 800030,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": "hey",
            "signal": 0.0,
        },
        {
            "pageId": 0,
            "time": 800030,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": "hey2",
            "signal": 1.0,
        },
    ])
    df = prepare_simple_features(signals_df, nlp_service)
    df2 = pd.DataFrame([{
        "pageId": 0,
        'oword': 'hey',
        'word': 'hey',
        'otime': 400030,
        'signal': 0.0,
        'diff': (400000.0 / (60 * 60 * 24)),
        'csignal': 1.0,
        'count': 1,
        "wpm": 10.0,
        'pos': "NN",
        'time': 400000,
        'userId': 'test'
    }, {
        "pageId": 0,
        'oword': 'hey',
        'word': 'hey',
        'otime': 800030,
        'signal': 0.0,
        'diff': (400000.0 / (60 * 60 * 24)),
        'csignal': 0.5,
        'count': 2,
        "wpm": 10.0,
        'pos': "NN",
        'time': 800000,
        'userId': 'test'
    }, {
        "pageId": 0,
        'oword': 'hey2',
        'word': 'hey2',
        'signal': 0.0,
        'otime': 1200030,
        'diff': (400000.0 / (60 * 60 * 24)),
        'csignal': 1.0,
        'count': 1,
        "wpm": 10.0,
        'pos': "NN",
        'time': 400000,
        'userId': 'test'
    }])
    pd.testing.assert_frame_equal(
        df.sort_index(axis=1).reset_index(drop=True),
        df2.sort_index(axis=1))

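# A minimal sketch of the MockNLPService fixture these tests depend on, assuming
# its interface is a tokenizer plus a POS tagger. The class and method names below
# (SketchMockNLPService, tokenize, pos_tag) are guesses for illustration only, not
# taken from the project. The tests above only require that
# "hello world don't\"believe\"" tokenizes to ['hello', 'world', "don't", 'believe']
# and that every token is tagged "NN".
import re


class SketchMockNLPService:
    _TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

    def tokenize(self, text):
        # Keep alphabetic runs and simple contractions; drop quotes and punctuation.
        return self._TOKEN_RE.findall(text)

    def pos_tag(self, tokens):
        # Tag every token as a noun, matching the expected "NN" lists in the tests.
        return ["NN" for _ in tokens]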