Пример #1
0
def test_ScaleAndShift_test_a():
    """Apply ScaleAndShift to a two-stroke recording and check the points."""
    queue = [preprocessing.ScaleAndShift()]

    # The raw recording is one JSON string holding two strokes.
    first_stroke = (
        '[{"x":232,"y":423,"time":1407885913983},'
        '{"x":267,"y":262,"time":1407885914315},'
        '{"x":325,"y":416,"time":1407885914650}]'
    )
    second_stroke = (
        '[{"x":252,"y":355,"time":1407885915675},'
        '{"x":305,"y":351,"time":1407885916361}]'
    )
    recording = HandwrittenData(f"[{first_stroke},{second_stroke}]")
    recording.preprocessing(queue)
    actual = recording.get_pointlist()

    expected_first = [
        {"y": 1.0, "x": 0.0, "time": 0},
        {"y": 0.0, "x": 0.2174, "time": 332},
        {"y": 0.9565, "x": 0.5776, "time": 667},
    ]
    expected_second = [
        {"y": 0.5776, "x": 0.1242, "time": 1692},
        {"y": 0.5528, "x": 0.4534, "time": 2378},
    ]
    expected = [expected_first, expected_second]

    # Comparison is delegated to the project helper rather than plain ``==``.
    assert testhelper.compare_pointlists(
        actual, expected
    ), f"Got: {actual}; expected {expected}"
Пример #2
0
def test_ScaleAndShift_test_a_center():
    """Apply ScaleAndShift(center=True) and check the resulting points."""
    queue = [preprocessing.ScaleAndShift(center=True)]

    # The raw recording is one JSON string holding two strokes.
    first_stroke = (
        '[{"y": 1.0, "x": -0.3655913978494625, "time": 0}, '
        '{"y": 0.0, "x": -0.1482000935016364, "time": 332}, '
        '{"y": 0.9565, "x": 0.21204835370333253, "time": 667}]'
    )
    second_stroke = (
        '[{"y": 0.5776, "x": -0.24136779536499045, "time": 1692}, '
        '{"y": 0.5528, "x": 0.08782475121886046, "time": 2378}]'
    )
    recording = HandwrittenData(f"[{first_stroke}, {second_stroke}]")
    recording.preprocessing(queue)
    actual = recording.get_pointlist()

    expected_first = [
        {"y": 1.0, "x": -0.2888198757763975, "time": 0},
        {"y": 0.0, "x": -0.07142857142857142, "time": 332},
        {"y": 0.9565, "x": 0.2888198757763975, "time": 667},
    ]
    expected_second = [
        {"y": 0.5776, "x": -0.16459627329192547, "time": 1692},
        {"y": 0.5528, "x": 0.16459627329192544, "time": 2378},
    ]
    expected = [expected_first, expected_second]

    # Comparison is delegated to the project helper rather than plain ``==``.
    assert testhelper.compare_pointlists(
        actual, expected
    ), f"Got: {actual}; expected {expected}"
Пример #3
0
def test_ScaleAndShift_test_simple_5():
    """A single point at (42, 12) must end up at the origin with time 0."""
    queue = [preprocessing.ScaleAndShift()]
    recording = HandwrittenData('[[{"x":42, "y":12, "time": 10}]]')
    recording.preprocessing(queue)

    actual = recording.get_pointlist()
    origin = {"x": 0, "y": 0, "time": 0}
    expected = [[origin]]
    assert actual == expected, f"Got: {actual}; expected {expected}"
Пример #4
0
def ScaleAndShift_test_simple_4():
    """A single point already at (0, 0) must stay there, with time reset to 0."""
    queue = [preprocessing.ScaleAndShift()]
    recording = HandwrittenData('[[{"x":0, "y":0, "time": 10}]]')
    recording.preprocessing(queue)

    actual = recording.get_pointlist()
    origin = {"x": 0, "y": 0, "time": 0}
    expected = [[origin]]
    assert actual == expected, "Got: %s; expected %s" % (actual, expected)
Пример #5
0
def dataset_preparation_test():
    """Copy the tiny raw fixture into the project and preprocess it.

    The fixture next to this file is copied to ``raw-datasets`` under the
    project root, then turned into a preprocessed dataset with a single
    ``ScaleAndShift`` step.
    """
    here = os.path.dirname(__file__)
    raw_copy = os.path.join(
        utils.get_project_root(), 'raw-datasets/unittests-tiny-raw.pickle'
    )
    fixture = os.path.join(here, 'data/unittests-tiny-raw.pickle')
    shutil.copyfile(fixture, raw_copy)

    # Resolved only after the copy, in the same order as the calls were made
    # before.
    preprocessed_path = os.path.join(
        utils.get_project_root(), 'preprocessed/small-baseline/data.pickle'
    )
    queue = [preprocessing.ScaleAndShift()]
    preprocess_dataset.create_preprocessed_dataset(
        raw_copy, preprocessed_path, queue
    )
Пример #6
0
def test_preprocessing_detection_test():
    """Check that a queue description yields one object per described step."""
    description = [
        {"ScaleAndShift": None},
        {"StrokeConnect": None},
        {"DouglasPeucker": [{"epsilon": 0.2}]},
        {"SpaceEvenly": [{"number": 100}]},
    ]
    # Hand-built counterpart of the description above.
    expected_queue = []
    expected_queue.append(preprocessing.ScaleAndShift())
    expected_queue.append(preprocessing.StrokeConnect())
    expected_queue.append(preprocessing.DouglasPeucker(epsilon=0.2))
    expected_queue.append(preprocessing.SpaceEvenly(number=100))

    detected = preprocessing.get_preprocessing_queue(description)
    # TODO: Not only compare lengths of lists but actual contents.
    expected_length = len(expected_queue)
    assert len(detected) == expected_length
Пример #7
0
def preprocessing_detection_test():
    """Check that a queue description yields one object per described step."""
    description = [
        {'ScaleAndShift': None},
        {'StrokeConnect': None},
        {'DouglasPeucker': [{'epsilon': 0.2}]},
        {'SpaceEvenly': [{'number': 100}]},
    ]
    # Hand-built counterpart of the description above.
    expected_queue = []
    expected_queue.append(preprocessing.ScaleAndShift())
    expected_queue.append(preprocessing.StrokeConnect())
    expected_queue.append(preprocessing.DouglasPeucker(epsilon=0.2))
    expected_queue.append(preprocessing.SpaceEvenly(number=100))

    detected = preprocessing.get_preprocessing_queue(description)
    # TODO: Not only compare lengths of lists but actual contents.
    detected_length = len(detected)
    expected_length = len(expected_queue)
    nose.tools.assert_equal(detected_length, expected_length)
Пример #8
0
def test_ScaleAndShift_test_all():
    """Run ScaleAndShift on every helper-provided symbol; none may end empty."""
    queue = [preprocessing.ScaleAndShift()]
    for recording in testhelper.get_all_symbols_as_handwriting():
        recording.preprocessing(queue)
        pointlist = recording.get_pointlist()
        assert len(pointlist) > 0
Пример #9
0
def main(cfg, raw_data_start_id):
    """Review raw recordings that differ strongly from already accepted ones.

    Connects to the database described by ``cfg['mysql_online']`` and loads
    all formulas whose id is greater than ``raw_data_start_id``. For every
    formula (except id 1) with at least 100 raw recordings, each non-empty
    recording is preprocessed with ``ScaleAndShift`` and compared via ``dtw``
    against the point lists accepted so far for that formula. A recording
    whose smallest distance exceeds 100 is either

    * accepted directly, when its ``administrator_edit`` field is set, or
    * shown for manual review (``B.show()``); the verdict in ``B.ok`` is
      then written back through ``update_data``.

    Parameters
    ----------
    cfg : dict
        Configuration with a ``'mysql_online'`` entry providing ``host``,
        ``user``, ``passwd`` and ``db``. It is also passed to ``update_data``.
    raw_data_start_id
        Lower (exclusive) bound for the ``wm_formula`` ids to check.
    """
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        cursor = connection.cursor()

        # Get formulas
        logger.info("Get formulas")
        print("get formulas")
        sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
               "WHERE `id` > %s ORDER BY `id`")
        cursor.execute(sql, (raw_data_start_id, ))
        formulaid2latex = {el['id']: el['formula_in_latex']
                           for el in cursor.fetchall()}

        preprocessing_queue = [preprocessing.ScaleAndShift()]

        # Parameterized like the formula query above; the driver fills in
        # the formula id instead of Python string formatting.
        raw_data_sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
                        "`wild_point_count`, `missing_line`, `has_hook`, "
                        "`has_too_long_line`, `is_image`, "
                        "`administrator_edit`, "
                        "`other_problem`, `has_interrupted_line` "
                        "FROM  `wm_raw_draw_data` "
                        "WHERE `accepted_formula_id` = %s "
                        "ORDER BY `administrator_edit` DESC, "
                        "`creation_date` ASC;")

        checked_formulas = 0
        checked_raw_data_instances = 0

        for formula_id, latex in formulaid2latex.items():
            # The browser is opened at most once per formula.
            already_shown_in_browser = False
            if formula_id == 1:
                # This formula id is for trash. No need to look at it.
                continue
            # Get data
            logger.info("Get data for formula_id %i (%s)", formula_id, latex)
            cursor.execute(raw_data_sql, (formula_id, ))
            raw_datasets = cursor.fetchall()
            logger.info("Raw datasets: %i", len(raw_datasets))
            checked_raw_data_instances += len(raw_datasets)
            checked_formulas += 1
            if len(raw_datasets) < 100:
                continue
            # Point lists accepted so far for this formula.
            As = []
            for data in raw_datasets:
                if data['data'] == "[]":
                    continue
                B = HandwrittenDataM(data['data'],
                                     data['accepted_formula_id'],
                                     data['wild_point_count'],
                                     data['missing_line'], data['has_hook'],
                                     data['has_too_long_line'],
                                     data['is_image'],
                                     data['other_problem'],
                                     data['has_interrupted_line'],
                                     data['id'],
                                     latex)
                B.preprocessing(preprocessing_queue)
                B_pll = B.get_pointlist()
                # Smallest distance to any accepted recording; stays
                # infinite while nothing has been accepted yet.
                distance = float('inf')
                for A_pll in As:
                    distance = min(distance, dtw(A_pll, B_pll))
                if distance <= 100:
                    continue
                if data['administrator_edit'] is not None:
                    As.append(B.get_pointlist())
                    continue
                if not already_shown_in_browser:
                    already_shown_in_browser = True
                    webbrowser.open(
                        "http://www.martin-thoma.de/"
                        "write-math/view/?"
                        "raw_data_id=%i" % data['id'], 2)
                B.show()
                if B.ok:
                    As.append(B.get_pointlist())
                    update_data(cfg, B)
                else:
                    update_data(cfg, B, True)
            logger.info("[Status] Checked formulas: %i of %i",
                        checked_formulas, len(formulaid2latex))
            logger.info("[Status] Checked raw_data_instances: %i",
                        checked_raw_data_instances)
        logger.info("done")
    finally:
        # Release the connection on every exit path. The guard keeps a
        # connection that is already closed from masking the original error.
        if connection.open:
            connection.close()
Пример #10
0
def main(cfg, raw_data_start_id):
    """Show recordings whose point count drops noticeably under DotReduction.

    Loads all formulas whose id is greater than ``raw_data_start_id``. For
    every formula (except id 1) with at least 100 raw recordings, each
    non-empty recording is preprocessed with ``ScaleAndShift``; a deep copy
    is additionally run through ``DotReduction(0.01)``. When the two differ,
    the line and point counts are printed, and both versions are shown if
    more than two points were removed.

    Parameters
    ----------
    cfg
        Currently ignored: it is overwritten with the result of
        ``utils.get_database_configuration()`` on entry.
    raw_data_start_id
        Lower (exclusive) bound for the ``wm_formula`` ids to check.
    """
    # NOTE(review): the ``cfg`` argument is discarded here. This is kept
    # as-is so existing callers behave the same; confirm whether the
    # override is intended.
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        cursor = connection.cursor()

        # Get formulas
        print("Get formulas")
        sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
               "WHERE `id` > %s")
        cursor.execute(sql, (raw_data_start_id, ))
        formulaid2latex = {el['id']: el['formula_in_latex']
                           for el in cursor.fetchall()}

        preprocessing_queue = [preprocessing.ScaleAndShift()]

        # Parameterized like the formula query above; the driver fills in
        # the formula id instead of Python string formatting.
        raw_data_sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
                        "`wild_point_count`, `missing_line`, `has_hook`, "
                        "`has_too_long_line`, `is_image`, "
                        "`administrator_edit`, "
                        "`other_problem`, `has_interrupted_line` "
                        "FROM  `wm_raw_draw_data` "
                        "WHERE `accepted_formula_id` = %s "
                        "ORDER BY `administrator_edit` DESC, "
                        "`creation_date` ASC;")

        checked_formulas = 0
        checked_raw_data_instances = 0

        for formula_id, latex in formulaid2latex.items():
            if formula_id == 1:
                # This formula id is for trash. No need to look at it.
                continue
            # Get data
            print("Get data for formula_id %i (%s)" % (formula_id, latex))
            cursor.execute(raw_data_sql, (formula_id, ))
            raw_datasets = cursor.fetchall()
            print("Raw datasets: %i" % len(raw_datasets))
            checked_raw_data_instances += len(raw_datasets)
            checked_formulas += 1
            if len(raw_datasets) < 100:
                continue

            for data in raw_datasets:
                if data['data'] == "[]":
                    continue
                B = HandwrittenDataM(data['data'],
                                     data['accepted_formula_id'],
                                     data['wild_point_count'],
                                     data['missing_line'], data['has_hook'],
                                     data['has_too_long_line'],
                                     data['is_image'],
                                     data['other_problem'],
                                     data['has_interrupted_line'],
                                     data['id'],
                                     latex)
                B.preprocessing(preprocessing_queue)
                # Reduce a copy so the unreduced recording stays available
                # for comparison.
                Bs = deepcopy(B)
                Bs.preprocessing([preprocessing.DotReduction(0.01)])
                if B != Bs:
                    before_pointcount = sum(
                        len(line) for line in B.get_pointlist())
                    after_pointcount = sum(
                        len(line) for line in Bs.get_pointlist())
                    print("Reduced %i lines to %i lines." %
                          (len(B.get_pointlist()), len(Bs.get_pointlist())))
                    print("Reduced %i points to %i points." %
                          (before_pointcount, after_pointcount))
                    if before_pointcount - after_pointcount > 2:
                        B.show()
                        Bs.show()

            print("[Status] Checked formulas: %i of %i" %
                  (checked_formulas, len(formulaid2latex)))
            print("[Status] Checked raw_data_instances: %i" %
                  checked_raw_data_instances)
        print("done")
    finally:
        # Release the connection on every exit path. The guard keeps a
        # connection that is already closed from masking the original error.
        if connection.open:
            connection.close()