예제 #1
0
def test_parse_review_paginate_logs():
    """Parse one raw ``review-paginate`` log row and compare the full result.

    The row is passed through ``parse_review_paginate_logs`` and then
    ``parse_paginate_logs``. The resulting frame is compared against a
    hand-written frame without any column sorting, so the key order of the
    expected record below is part of the assertion.
    """
    mock_nlp = MockNLPService()

    raw_record = {
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:34:52Z",
        "type": "review-paginate",
        # JSON-encoded event body, kept exactly as it is logged.
        "payload":
        "{\"type\":\"review-paginate\",\"reviewId\":\"test-review\",\"content\":[\"hello\",\"world\"],\"sids\":[\"first\",\"second\"],\"time\":64300,\"step\":\"last-words\",\"wordUnknowns\":[],\"targetWords\":[\"\"]}",
    }
    raw_logs = pd.DataFrame([raw_record])

    review_logs = parse_review_paginate_logs(raw_logs, mock_nlp)
    parsed = parse_paginate_logs(review_logs)

    expected_record = {
        "time": 1584718492,
        "userId": "test",
        "from": "review",
        "rcId": "test-review",
        "eltime": 64.3,
        "sids": ["first", "second"],
        "pos": ["NN", "NN"],
        "words": ["hello", "world"],
        "unknownWords": [],
        "unknownIndices": [],
    }
    expected = pd.DataFrame([expected_record])

    pd.testing.assert_frame_equal(parsed, expected)
예제 #2
0
def test_parse_ordinary_paginate_logs():
    """Parse one raw ``paginate`` log row backed by a two-sentence book.

    Both sentences of the fixture book share the same text. The log row
    references both sentence ids and reports one unknown word in each. The
    parsed frame is compared against a hand-written frame (column order
    included), followed by two spot checks on the ``words`` list.
    """
    mock_nlp = MockNLPService()
    sentence_text = "hello world don't\"believe\""

    # Fixture book: a single chapter holding two identical sentences.
    metadata = Metadata("test", "test", "", "test", "")
    sentences = [
        Sentence("test", False, sentence_text),
        Sentence("test2", False, sentence_text),
    ]
    chapter = Chapter("test", "test", "test", sentences)
    book_service = BookService([Book(metadata, [chapter])])

    raw_record = {
        "userId": "test",
        "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
        "classId": "test2",
        "serverTime": "2020-03-20T15:34:52Z",
        "time": "2020-03-20T15:34:52Z",
        "type": "paginate",
        # JSON-encoded event body, kept exactly as it is logged.
        "payload":
        "{\"type\":\"paginate\",\"sids\":[\"test\",\"test2\"],\"time\":64300,\"wordUnknowns\":[{\"word\":\"believe\",\"wordIndex\":3,\"sentenceId\":\"test\",\"time\":18500},{\"word\":\"hello\",\"wordIndex\":0,\"sentenceId\":\"test2\",\"time\":26000}],\"sentenceUnknowns\":[],\"bookId\":\"test\",\"chapterId\":\"test\"}",
    }
    raw_logs = pd.DataFrame([raw_record])

    ordinary_logs = parse_ordinary_paginate_logs(raw_logs, mock_nlp,
                                                 book_service)
    parsed = parse_paginate_logs(ordinary_logs)

    page_words = ["hello", "world", "don't", "believe"]
    expected_record = {
        "time": 1584718492,
        "userId": "test",
        "from": "reader",
        "rcId": "test",
        "eltime": 64.3,
        "sids": ["test", "test2"],
        "pos": ["NN"] * 8,
        # The same four words appear once per sentence.
        "words": page_words + page_words,
        "unknownWords": ["believe", "hello"],
        "unknownIndices": [3, 4],
    }
    expected = pd.DataFrame([expected_record])

    pd.testing.assert_frame_equal(parsed, expected)

    # The unknown indices must point at the reported words.
    first_row_words = parsed.iloc[0]['words']
    assert first_row_words[3] == "believe"
    assert first_row_words[4] == "hello"
예제 #3
0
def test_parse_word_unknowns_invalid():
    """Expect ``parse_word_unknowns`` to raise for this set of unknown words.

    The second entry reports the word ``believe`` at ``wordIndex`` 2.
    """
    # NOTE(review): ``pytest.raises(Exception)`` accepts any exception type,
    # including one caused by a wrong call signature - confirm which specific
    # exception ``parse_word_unknowns`` is meant to raise and narrow this.
    mock_nlp = MockNLPService()
    sentence_text = "hello world don't\"believe\""

    reported = (('hello', 0), ('believe', 2))
    unknowns = [{
        'sentenceId': 'test',
        'word': word,
        'wordIndex': index,
        'time': 10
    } for word, index in reported]

    metadata = Metadata("test", "test", "", "test", "")
    chapter = Chapter("test", "test", "test",
                      [Sentence("test", False, sentence_text)])
    book = Book(metadata, [chapter])

    with pytest.raises(Exception):
        parse_word_unknowns(mock_nlp, book, ["test"], unknowns)
예제 #4
0
def test_parse_word_unknowns_simple():
    """Check the words and unknown-word fields returned for one sentence.

    ``parse_word_unknowns`` is called with one sentence id, one sentence text
    and two unknown-word entries; the returned mapping must list the sentence
    words plus the unknown words and their indices.
    """
    mock_nlp = MockNLPService()

    reported = (('hello', 0), ('believe', 3))
    unknowns = [{
        'sentenceId': 'test',
        'word': word,
        'wordIndex': index,
        'time': 10
    } for word, index in reported]

    sentence_ids = ["test"]
    sentence_texts = ["hello world don't\"believe\""]

    parsed = parse_word_unknowns(mock_nlp, sentence_ids, sentence_texts,
                                 unknowns)

    assert parsed['unknownIndices'] == [0, 3]
    assert parsed['unknownWords'] == ['hello', 'believe']
    assert parsed['words'] == ['hello', 'world', "don't", 'believe']
예제 #5
0
def test_clean_signals_df():
    """Check which signal rows ``clean_signals_df`` keeps.

    Four rows that differ only in ``word`` and ``signal`` go in; the expected
    output holds only the two ``world2`` rows. Columns are sorted by name and
    the result index is reset before comparing.
    """

    def signal_row(word, signal):
        # All rows share every field except the word and its signal.
        return {
            "pageId": 0,
            "time": 30,
            "userId": "test",
            "session": 0,
            "cheat": True,
            "wpm": 30.0,
            "pos": "NN",
            "word": word,
            "signal": signal,
        }

    mock_nlp = MockNLPService()

    signals = pd.DataFrame([
        signal_row("hello", 1.0),
        signal_row("world2", 1.0),
        signal_row("world2", 0.0),
        signal_row("world3", 1.0),
    ])
    cleaned = clean_signals_df(signals, mock_nlp)

    expected = pd.DataFrame([
        signal_row("world2", 1.0),
        signal_row("world2", 0.0),
    ])

    actual_sorted = cleaned.sort_index(axis=1).reset_index(drop=True)
    expected_sorted = expected.sort_index(axis=1)
    pd.testing.assert_frame_equal(actual_sorted, expected_sorted)
예제 #6
0
def test_preprocess_paginate_logs():
    """Run two near-identical ``paginate`` rows through preprocessing.

    The two raw rows differ only in their ``time`` field. The expected output
    is a single row; columns are sorted by name on both sides before the
    comparison.
    """
    settings = {
        "cluster_threshold": 1,
        "max_session_hours": 12,
        "cheat_eltime_threshold": 12,
        "filter_wpm_threshold": 1000,
        "word2vec_k": 0,
        "last_session_after_hours": 0,
        "skip_session_hours": 0,
    }
    config = Config(**settings)
    mock_nlp = MockNLPService()
    sentence_text = "hello world don't\"believe\""

    # Fixture book: a single chapter holding two identical sentences.
    metadata = Metadata("test", "test", "", "test", "")
    sentences = [
        Sentence("test", False, sentence_text),
        Sentence("test2", False, sentence_text),
    ]
    chapter = Chapter("test", "test", "test", sentences)
    book_service = BookService([Book(metadata, [chapter])])

    # JSON-encoded event body shared by both rows, kept exactly as logged.
    payload = "{\"type\":\"paginate\",\"sids\":[\"test\",\"test2\"],\"time\":64300,\"wordUnknowns\":[{\"word\":\"believe\",\"wordIndex\":3,\"sentenceId\":\"test\",\"time\":18500},{\"word\":\"hello\",\"wordIndex\":0,\"sentenceId\":\"test2\",\"time\":26000}],\"sentenceUnknowns\":[],\"bookId\":\"test\",\"chapterId\":\"test\"}"

    def log_row(client_time):
        # Only the client-side timestamp varies between the two rows.
        return {
            "userId": "test",
            "fireId": "8VeJWtPHdmZ4apbb3bY3ThBBFZs1",
            "classId": "test2",
            "serverTime": "2020-03-20T15:34:52Z",
            "time": client_time,
            "type": "paginate",
            "payload": payload,
        }

    raw_logs = pd.DataFrame([
        log_row("2020-03-20T15:34:52Z"),
        log_row("2020-03-20T15:35:01Z"),
    ])
    processed = preprocess_paginate_logs(raw_logs, mock_nlp, book_service,
                                         config)

    page_words = ["hello", "world", "don't", "believe"]
    expected = pd.DataFrame([{
        "pageId": 0,
        "time": 1584718492,
        "userId": "test",
        "session": 0,
        "eltime": 128.6,
        "cheat": True,
        "wpm": 3.7325,
        "from": "reader",
        "rcId": "test",
        "sids": ["test", "test2"],
        "pos": ["NN"] * 8,
        "words": page_words + page_words,
        "unknownWords": ["believe", "hello", "believe", "hello"],
        "unknownIndices": [3, 4, 3, 4],
    }])

    pd.testing.assert_frame_equal(processed.sort_index(axis=1),
                                  expected.sort_index(axis=1))
예제 #7
0
def test_predict_vocab():
    """Check the two frames returned by ``predict_vocab`` with a stub model.

    The stub classifier always returns ``np.array([1])``. With the signals
    and the single feature row below, the first returned frame must contain
    ``hello`` and the second ``world``. Columns are sorted by name before
    comparing.
    """

    class MockClassifier:
        def predict(self, x):
            return np.array([1])

    class MockModel:
        def get_classifier(self):
            return MockClassifier()

    def signal_row(time, word, signal):
        # Signal rows differ only in time, word and signal value.
        return {
            "pageId": 0,
            "time": time,
            "userId": "test",
            "session": 0,
            "cheat": True,
            "wpm": 30.0,
            "pos": "NN",
            "word": word,
            "signal": signal,
        }

    def vocab_frame(word, time):
        # Shape of each frame that predict_vocab is expected to return.
        return pd.DataFrame([{
            "pageId": 0,
            'userId': 'test',
            'word': word,
            'oword': word,
            'time': time
        }])

    mock_nlp = MockNLPService()
    signals = pd.DataFrame([
        signal_row(30, "hello", 1.0),
        signal_row(60, "hello", 1.0),
        signal_row(30, "world", 0.0),
    ])

    seconds_per_day = 60 * 60 * 24
    features = pd.DataFrame([{
        "pageId": 0,
        'oword': 'world',
        'word': 'world',
        'otime': 40,
        'signal': 1.0,
        'diff': (400000.0 / seconds_per_day),
        'csignal': (2 / 3),
        'count': 3,
        "wpm": 10.0,
        'pos': "NN",
        'time': 800000,
        'userId': 'test'
    }])

    first_frame, second_frame = predict_vocab(features, signals, MockModel(),
                                              mock_nlp)

    expected_first = vocab_frame('hello', 60)
    expected_second = vocab_frame('world', 40)

    pd.testing.assert_frame_equal(first_frame.sort_index(axis=1),
                                  expected_first.sort_index(axis=1))
    pd.testing.assert_frame_equal(second_frame.sort_index(axis=1),
                                  expected_second.sort_index(axis=1))
예제 #8
0
def test_prepare_numeric_features2():
    """Check the feature rows ``prepare_simple_features`` builds from signals.

    Five signal rows for the words ``hey`` and ``hey2`` are supplied out of
    chronological order. Three feature rows are expected. Columns are sorted
    by name and the result index is reset before comparing.
    """

    def signal_row(time, word, signal):
        # Signal rows differ only in time, word and signal value.
        return {
            "pageId": 0,
            "time": time,
            "userId": "test",
            "session": 0,
            "wpm": 10.0,
            "pos": "NN",
            "word": word,
            "signal": signal,
        }

    def feature_row(word, otime, csignal, count, time):
        # Every expected row has signal 0.0 and the same 'diff' value.
        return {
            "pageId": 0,
            'oword': word,
            'word': word,
            'otime': otime,
            'signal': 0.0,
            'diff': (400000.0 / (60 * 60 * 24)),
            'csignal': csignal,
            'count': count,
            "wpm": 10.0,
            'pos': "NN",
            'time': time,
            'userId': 'test'
        }

    mock_nlp = MockNLPService()

    signals = pd.DataFrame([
        signal_row(1200030, "hey2", 0.0),
        signal_row(400030, "hey", 0.0),
        signal_row(30, "hey", 1.0),
        signal_row(800030, "hey", 0.0),
        signal_row(800030, "hey2", 1.0),
    ])

    features = prepare_simple_features(signals, mock_nlp)

    expected = pd.DataFrame([
        feature_row('hey', 400030, 1.0, 1, 400000),
        feature_row('hey', 800030, 0.5, 2, 800000),
        feature_row('hey2', 1200030, 1.0, 1, 400000),
    ])

    actual_sorted = features.sort_index(axis=1).reset_index(drop=True)
    expected_sorted = expected.sort_index(axis=1)
    pd.testing.assert_frame_equal(actual_sorted, expected_sorted)