def test_ScaleAndShift_test_a():
    """ScaleAndShift maps a two-stroke recording into the unit box."""
    queue = [preprocessing.ScaleAndShift()]
    raw_json = (
        '[[{"x":232,"y":423,"time":1407885913983},'
        '{"x":267,"y":262,"time":1407885914315},'
        '{"x":325,"y":416,"time":1407885914650}],'
        '[{"x":252,"y":355,"time":1407885915675},'
        '{"x":305,"y":351,"time":1407885916361}]]'
    )
    recording = HandwrittenData(raw_json)
    recording.preprocessing(queue)
    result = recording.get_pointlist()
    expectation = [
        [
            {"y": 1.0, "x": 0.0, "time": 0},
            {"y": 0.0, "x": 0.2174, "time": 332},
            {"y": 0.9565, "x": 0.5776, "time": 667},
        ],
        [
            {"y": 0.5776, "x": 0.1242, "time": 1692},
            {"y": 0.5528, "x": 0.4534, "time": 2378},
        ],
    ]
    assert testhelper.compare_pointlists(
        result, expectation
    ), f"Got: {result}; expected {expectation}"
def test_ScaleAndShift_test_a_center():
    """ScaleAndShift with center=True recenters the strokes horizontally."""
    queue = [preprocessing.ScaleAndShift(center=True)]
    raw_json = (
        '[[{"y": 1.0, "x": -0.3655913978494625, "time": 0}, '
        '{"y": 0.0, "x": -0.1482000935016364, "time": 332}, '
        '{"y": 0.9565, "x": 0.21204835370333253, "time": 667}], '
        '[{"y": 0.5776, "x": -0.24136779536499045, "time": 1692}, '
        '{"y": 0.5528, "x": 0.08782475121886046, "time": 2378}]]'
    )
    recording = HandwrittenData(raw_json)
    recording.preprocessing(queue)
    result = recording.get_pointlist()
    expectation = [
        [
            {"y": 1.0, "x": -0.2888198757763975, "time": 0},
            {"y": 0.0, "x": -0.07142857142857142, "time": 332},
            {"y": 0.9565, "x": 0.2888198757763975, "time": 667},
        ],
        [
            {"y": 0.5776, "x": -0.16459627329192547, "time": 1692},
            {"y": 0.5528, "x": 0.16459627329192544, "time": 2378},
        ],
    ]
    assert testhelper.compare_pointlists(
        result, expectation
    ), f"Got: {result}; expected {expectation}"
def test_ScaleAndShift_test_simple_5():
    """A single off-origin point collapses to (0, 0) with time reset to 0."""
    recording = HandwrittenData('[[{"x":42, "y":12, "time": 10}]]')
    recording.preprocessing([preprocessing.ScaleAndShift()])
    result = recording.get_pointlist()
    expected = [[{"x": 0, "y": 0, "time": 0}]]
    assert result == expected, f"Got: {result}; expected {expected}"
def ScaleAndShift_test_simple_4():
    """ScaleAndShift on a single point already at the origin is a no-op
    except that the timestamp is reset to 0.

    NOTE(review): the missing ``test_`` prefix means pytest does not
    collect this function automatically — confirm whether that is
    intentional before renaming (renaming would change the public name).
    """
    preprocessing_queue = [preprocessing.ScaleAndShift()]
    s = '[[{"x":0, "y":0, "time": 10}]]'
    a = HandwrittenData(s)
    a.preprocessing(preprocessing_queue)
    s = a.get_pointlist()
    expectation = [[{"x": 0, "y": 0, "time": 0}]]
    # f-string instead of %-formatting, for consistency with the other
    # assertion messages in this file.
    assert s == expectation, f"Got: {s}; expected {expectation}"
def dataset_preparation_test():
    """Copy the tiny raw-data fixture into the project tree and run the
    preprocessed-dataset creation on it with a ScaleAndShift step."""
    fixture_dir = os.path.dirname(__file__)
    target = os.path.join(utils.get_project_root(),
                          'raw-datasets/unittests-tiny-raw.pickle')
    shutil.copyfile(os.path.join(fixture_dir, 'data/unittests-tiny-raw.pickle'),
                    target)
    destination = os.path.join(utils.get_project_root(),
                               'preprocessed/small-baseline/data.pickle')
    preprocess_dataset.create_preprocessed_dataset(
        target, destination, [preprocessing.ScaleAndShift()])
def test_preprocessing_detection_test():
    """get_preprocessing_queue builds one preprocessing step per entry of
    the dictionary-based queue description."""
    queue_description = [
        {"ScaleAndShift": None},
        {"StrokeConnect": None},
        {"DouglasPeucker": [{"epsilon": 0.2}]},
        {"SpaceEvenly": [{"number": 100}]},
    ]
    expected_steps = [
        preprocessing.ScaleAndShift(),
        preprocessing.StrokeConnect(),
        preprocessing.DouglasPeucker(epsilon=0.2),
        preprocessing.SpaceEvenly(number=100),
    ]
    feature_list = preprocessing.get_preprocessing_queue(queue_description)
    # TODO: Not only compare lengths of lists but actual contents.
    assert len(feature_list) == len(expected_steps)
def preprocessing_detection_test():
    """get_preprocessing_queue builds one preprocessing step per entry of
    the dictionary-based queue description.

    NOTE(review): this is a near-duplicate of
    ``test_preprocessing_detection_test`` and lacks the ``test_`` prefix,
    so pytest does not collect it — confirm whether it can be removed.
    """
    preprocessing_queue = [{
        'ScaleAndShift': None
    }, {
        'StrokeConnect': None
    }, {
        'DouglasPeucker': [{
            'epsilon': 0.2
        }]
    }, {
        'SpaceEvenly': [{
            'number': 100
        }]
    }]
    correct = [
        preprocessing.ScaleAndShift(),
        preprocessing.StrokeConnect(),
        preprocessing.DouglasPeucker(epsilon=0.2),
        preprocessing.SpaceEvenly(number=100)
    ]
    feature_list = preprocessing.get_preprocessing_queue(preprocessing_queue)
    # TODO: Not only compare lengths of lists but actual contents.
    # Plain assert instead of nose.tools.assert_equal: nose is unmaintained
    # and the rest of this file uses pytest-style bare asserts.
    assert len(feature_list) == len(correct)
def test_ScaleAndShift_test_all():
    """Every symbol in the test corpus survives ScaleAndShift with a
    non-empty point list."""
    queue = [preprocessing.ScaleAndShift()]
    for handwriting in testhelper.get_all_symbols_as_handwriting():
        handwriting.preprocessing(queue)
        pointlist = handwriting.get_pointlist()
        assert len(pointlist) > 0
def main(cfg, raw_data_start_id):
    """Interactively review raw recordings whose DTW distance to every
    already-accepted recording of the same formula exceeds 100.

    Recordings edited by an administrator are accepted automatically;
    the rest are shown to the reviewer (browser page opened once per
    formula) and accepted or rejected via ``update_data``.

    NOTE(review): reconstructed from whitespace-mangled source; the exact
    nesting of ``B.show()`` relative to the one-time browser open and the
    placement of the status log lines was ambiguous — confirm against
    version control.
    """
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    # Get formulas
    logger.info("Get formulas")
    print("get formulas")
    sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` "
           "WHERE `id` > %s ORDER BY `id`")
    cursor.execute(sql, (raw_data_start_id, ))
    formulaid2latex = {}
    for el in cursor.fetchall():
        formulaid2latex[el['id']] = el['formula_in_latex']

    preprocessing_queue = [
        preprocessing.ScaleAndShift(),
        # preprocessing.Douglas_peucker(EPSILON=0.2),
        # preprocessing.Space_evenly(number=100,
        #                            kind='cubic')
    ]
    checked_formulas = 0
    checked_raw_data_instances = 0
    for formula_id in formulaid2latex.keys():
        alread_shown_in_browser = False
        if formula_id == 1:
            # This formula id is for trash. No need to look at it.
            continue

        # Get data
        logger.info("Get data for formula_id %i (%s)"
                    % (formula_id, formulaid2latex[formula_id]))
        sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
               "`wild_point_count`, `missing_line`, `has_hook`, "
               "`has_too_long_line`, `is_image`, `administrator_edit`, "
               "`other_problem`, `has_interrupted_line` "
               "FROM `wm_raw_draw_data` "
               "WHERE `accepted_formula_id` = %i "
               "ORDER BY `administrator_edit` DESC, "
               "`creation_date` ASC;") % formula_id
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        logger.info("Raw datasets: %i" % len(raw_datasets))
        checked_raw_data_instances += len(raw_datasets)
        checked_formulas += 1
        if len(raw_datasets) < 100:
            continue

        As = []  # point lists of recordings accepted so far
        for i, data in enumerate(raw_datasets):
            if data['data'] == "[]":
                continue
            B = HandwrittenDataM(data['data'],
                                 data['accepted_formula_id'],
                                 data['wild_point_count'],
                                 data['missing_line'],
                                 data['has_hook'],
                                 data['has_too_long_line'],
                                 data['is_image'],
                                 data['other_problem'],
                                 data['has_interrupted_line'],
                                 data['id'],
                                 formulaid2latex[formula_id])
            B.preprocessing(preprocessing_queue)
            B_pll = B.get_pointlist()
            # DTW distance to the closest already-accepted recording.
            distance = float('inf')
            for A_pll in As:
                distance = min(distance, dtw(A_pll, B_pll))
            if distance > 100:
                if data['administrator_edit'] is not None:
                    # Administrator-edited data is trusted as-is.
                    As.append(B.get_pointlist())
                else:
                    if not alread_shown_in_browser:
                        alread_shown_in_browser = True
                        webbrowser.open("http://www.martin-thoma.de/"
                                        "write-math/view/?"
                                        "raw_data_id=%i" % data['id'], 2)
                    B.show()
                    if B.ok:
                        As.append(B.get_pointlist())
                        update_data(cfg, B)
                    else:
                        update_data(cfg, B, True)
        logger.info("[Status] Checked formulas: %i of %i"
                    % (checked_formulas, len(formulaid2latex)))
        logger.info("[Status] Checked raw_data_instances: %i"
                    % checked_raw_data_instances)
    logger.info("done")
def main(cfg, raw_data_start_id):
    """Report (and display) recordings where a DotReduction(0.01) pass
    would change the data, i.e. candidate recordings containing stray dots.

    NOTE(review): the ``cfg`` parameter is immediately overwritten by
    ``utils.get_database_configuration()``, so callers' values are
    ignored — likely a bug, left untouched to preserve behavior.
    NOTE(review): reconstructed from whitespace-mangled source; the
    placement of the status print lines was ambiguous — confirm against
    version control.
    """
    cfg = utils.get_database_configuration()
    mysql = cfg['mysql_online']
    connection = pymysql.connect(host=mysql['host'],
                                 user=mysql['user'],
                                 passwd=mysql['passwd'],
                                 db=mysql['db'],
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()

    # Get formulas
    print("Get formulas")
    sql = ("SELECT `id`, `formula_in_latex` FROM `wm_formula` WHERE `id` > %s")
    cursor.execute(sql, (raw_data_start_id, ))
    formulaid2latex = {}
    for el in cursor.fetchall():
        formulaid2latex[el['id']] = el['formula_in_latex']

    preprocessing_queue = [
        preprocessing.ScaleAndShift(),
        # preprocessing.Douglas_peucker(EPSILON=0.2),
        # preprocessing.Space_evenly(number=100,
        #                            kind='cubic')
    ]
    checked_formulas = 0
    checked_raw_data_instances = 0
    for formula_id in formulaid2latex.keys():
        if formula_id == 1:
            # This formula id is for trash. No need to look at it.
            continue

        # Get data
        print("Get data for formula_id %i (%s)"
              % (formula_id, formulaid2latex[formula_id]))
        sql = ("SELECT `id`, `data`, `accepted_formula_id`, "
               "`wild_point_count`, `missing_line`, `has_hook`, "
               "`has_too_long_line`, `is_image`, `administrator_edit`, "
               "`other_problem`, `has_interrupted_line` "
               "FROM `wm_raw_draw_data` "
               "WHERE `accepted_formula_id` = %i "
               "ORDER BY `administrator_edit` DESC, "
               "`creation_date` ASC;") % formula_id
        cursor.execute(sql)
        raw_datasets = cursor.fetchall()
        print("Raw datasets: %i" % len(raw_datasets))
        checked_raw_data_instances += len(raw_datasets)
        checked_formulas += 1
        if len(raw_datasets) < 100:
            continue

        for i, data in enumerate(raw_datasets):
            if data['data'] == "[]":
                continue
            B = HandwrittenDataM(data['data'],
                                 data['accepted_formula_id'],
                                 data['wild_point_count'],
                                 data['missing_line'],
                                 data['has_hook'],
                                 data['has_too_long_line'],
                                 data['is_image'],
                                 data['other_problem'],
                                 data['has_interrupted_line'],
                                 data['id'],
                                 formulaid2latex[formula_id])
            B.preprocessing(preprocessing_queue)
            # Apply DotReduction on a copy and compare with the original.
            Bs = deepcopy(B)
            Bs.preprocessing([preprocessing.DotReduction(0.01)])
            if B != Bs:
                before_pointcount = sum(
                    [len(line) for line in B.get_pointlist()])
                after_pointcount = sum(
                    [len(line) for line in Bs.get_pointlist()])
                print("Reduced %i lines to %i lines."
                      % (len(B.get_pointlist()), len(Bs.get_pointlist())))
                print("Reduced %i points to %i points."
                      % (before_pointcount, after_pointcount))
                if before_pointcount - after_pointcount > 2:
                    B.show()
                    Bs.show()
        print("[Status] Checked formulas: %i of %i"
              % (checked_formulas, len(formulaid2latex)))
        print("[Status] Checked raw_data_instances: %i"
              % checked_raw_data_instances)
    print("done")