def get_point_distance_limit(data, time=False):
    """
    Builds statistics on the data by measuring the distance between the registered points in each stroke.

    :param data: Stroke set.
    :param time: Flag; whether the distances should be normalized by the delta time between samples.
    :return: Length limit that defines the outlier distance.
    """
    distances = []
    for stroke in data:
        for point_index in range(len(stroke) - 1):
            if time:
                # The 0.01 fallback guards against a zero time delta caused by faulty xml data.
                delta_time = stroke[point_index + 1][1] - stroke[point_index][1]
                distances.append(
                    util.point_2_point(stroke[point_index][0], stroke[point_index + 1][0]) /
                    (delta_time if delta_time != 0 else 0.01))
            else:
                distances.append(
                    util.point_2_point(stroke[point_index], stroke[point_index + 1]))
    q1, q2, q3 = util.get_quartiles(distances)
    return (q3 + 1.5 * (q3 - q1)) * limit_multiplier

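# The limit above is Tukey's upper fence, q3 + 1.5 * IQR, scaled by the module-level
# limit_multiplier threshold. util.get_quartiles is defined elsewhere in the project; the
# destructuring above assumes it returns the (q1, q2, q3) triple. A minimal sketch of that
# assumed behaviour, not the project's actual implementation:
#
#     import statistics
#
#     def get_quartiles(values):
#         # quantiles(n=4) yields the three cut points between the four quarters.
#         q1, q2, q3 = statistics.quantiles(values, n=4)
#         return q1, q2, q3
#
#     get_quartiles([1, 2, 3, 4, 5])  # -> (1.5, 3.0, 4.5)
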
def get_stroke_parameters(stroke, file_index, stroke_index):
    """
    Calculates the parameters of a stroke and concatenates them with the predetermined horizontal value.

    :param stroke: A single stroke of the text.
    :param file_index: The index of the file that contains the currently processed stroke.
    :param stroke_index: The index of the stroke in the StrokeSet.
    :return: The calculated parameters of the stroke. The values are None, if the stroke is too short.
    """
    h_line_avg_distance = 0
    d_line_avg_distance = 0
    stroke_length = 0
    avg_degree = 0
    # In case of commas, dots, or xml errors the stroke is marked with None values, and will be removed
    # from the training data in the next processing step.
    if len(stroke) < 3:
        return None, None, None, 0, None
    for index in range(len(stroke)):
        try:
            if index < len(stroke) - 1:
                stroke_length += util.point_2_point(stroke[index], stroke[index + 1])
                avg_degree += math.fabs(
                    util.calculate_angle(stroke[0], stroke[index + 1])) / (len(stroke) - 1)
            if 0 < index < len(stroke) - 1:
                # Average distance of the stroke's points from the line
                # that connects the first and the final point.
                d_line_avg_distance += util.point_2_line(
                    stroke[0], stroke[-1], stroke[index]) / (len(stroke) - 2)
            if index > 0:
                # Average distance of the stroke's points from the horizontal line
                # that goes through the first point.
                h_line_avg_distance += util.point_2_line(
                    stroke[0], util.Point(stroke[0].x + 1, stroke[0].y),
                    stroke[index]) / (len(stroke) - 1)
        except ZeroDivisionError:
            # A division error may occur during the calculation of the angle (due to faulty xml data);
            # ignore the point and move on to the next one.
            pass
    return (avg_degree, h_line_avg_distance, d_line_avg_distance, stroke_length,
            is_horizontal(file_index, stroke_index))

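# The geometric helpers come from the project's util module, which is not part of this
# section. Sketches of their assumed behaviour, inferred from the call sites above (not
# copied from the project):
#
#     import math
#     from collections import namedtuple
#
#     Point = namedtuple('Point', ['x', 'y'])
#
#     def point_2_point(a, b):
#         # Euclidean distance between two points.
#         return math.hypot(b.x - a.x, b.y - a.y)
#
#     def point_2_line(line_start, line_end, point):
#         # Distance from point to the line through line_start and line_end, via the
#         # cross-product area formula; raises ZeroDivisionError for a degenerate
#         # (zero-length) line, which the caller above catches.
#         numerator = math.fabs(
#             (line_end.x - line_start.x) * (line_start.y - point.y) -
#             (line_start.x - point.x) * (line_end.y - line_start.y))
#         return numerator / point_2_point(line_start, line_end)
#
#     def calculate_angle(a, b):
#         # Inclination (in degrees) of the segment from a to b; raises
#         # ZeroDivisionError for vertical segments, which the caller above catches.
#         return math.degrees(math.atan((b.y - a.y) / (b.x - a.x)))
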
def get_faulty_strokes(time_data, limit):
    """
    Finds the faulty strokes in the data by comparing their normalized length values to the limit.

    :param time_data: The stroke set, with time labels.
    :param limit: Length limit.
    :return: List of the faulty strokes' indexes.
    """
    f_strokes = []
    for element_index, element in enumerate(time_data):
        distance = 0
        for point_index, p in enumerate(element[:-1]):
            distance += util.point_2_point(p[0], element[point_index + 1][0])
        # The 0.01 fallback guards against a zero time delta.
        if distance / (element[-1][1] - element[0][1]
                       if element[-1][1] != element[0][1] else 0.01) > limit:
            f_strokes.append(element_index)
    return f_strokes

def get_stroke_length_limit(time_data):
    """
    Calculates the length limit for the strokes. Each stroke's length is divided by its delta time,
    to adjust for the unevenly sampled, faulty data.

    :param time_data: The stroke set, with time labels.
    :return: Length limit, scaled by the error threshold.
    """
    distances = []
    for element in time_data:
        distance = 0
        for point_index, p in enumerate(element[:-1]):
            distance += util.point_2_point(p[0], element[point_index + 1][0])
        # The 0.01 fallback guards against a zero time delta.
        distances.append(
            distance / (element[-1][1] - element[0][1]
                        if element[-1][1] != element[0][1] else 0.01))
    q1, q2, q3 = util.get_quartiles(distances)
    return (q3 + 1.5 * (q3 - q1)) * limit_multiplier

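# Both functions above expect the timed structure produced by build_structure(file_name,
# time=True): each stroke appears as a list of (Point, timestamp) pairs. A toy usage
# sketch with made-up coordinates and timestamps (the exact cut-off depends on the
# limit_multiplier threshold):
#
#     timed_data = [
#         [(util.Point(x, 0), 0.1 * i), (util.Point(x + 1, 0), 0.1 * i + 0.05)]
#         for i, x in enumerate(range(0, 20, 2))
#     ]
#     # One stroke whose length-to-time ratio is far too large (a corrupted coordinate).
#     timed_data.append([(util.Point(20, 0), 2.0), (util.Point(9000, 0), 2.05)])
#     limit = get_stroke_length_limit(timed_data)
#     get_faulty_strokes(timed_data, limit)  # expected: [10], the corrupted stroke
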
def get_outliers(data, file_name):
    """
    Discovers the strokes with extraordinarily large distance to registered points ratios in the
    given stroke set, then finds the points that cause the anomaly.

    :param data: The stroke set of the text.
    :param file_name: The file that will be scanned for outliers.
    :return: Ordered dictionary that maps the indexes of the faulty strokes to the indexes of their
        outlier points.
    """
    # Calculating the limit of the distance between two points in each stroke. The distances are
    # divided by the delta time between samplings, to adjust for the uneven periods.
    timed_data = build_structure(file_name, time=True)
    normalized_length_limit = get_stroke_length_limit(timed_data)
    faulty_strokes = get_faulty_strokes(timed_data, normalized_length_limit)
    # A stroke is also marked as faulty, if any single pair of its consecutive points is further
    # apart than twice the normalized limit.
    for stroke_index, stroke in enumerate(timed_data):
        for index in range(len(stroke) - 1):
            delta_time = (stroke[index + 1][1] - stroke[index][1]
                          if stroke[index + 1][1] != stroke[index][1] else 0.01)
            if (util.point_2_point(stroke[index][0], stroke[index + 1][0]) / delta_time
                    > normalized_length_limit * 2
                    and stroke_index not in faulty_strokes):
                faulty_strokes.append(stroke_index)
    # Dividing the strokes into lines.
    lines = get_lines(data, faulty_strokes, file_name)
    # Calculating the limit of the distance between two sequential points in the set of strokes.
    point_length_limit = get_point_distance_limit(data)
    # Ordered dictionary of faulty stroke indexes as keys, and lists of the faulty points' indexes
    # as values.
    points = OrderedDict()
    for stroke_index in faulty_strokes:
        points[stroke_index] = get_outlier_points(data[stroke_index],
                                                  lines[stroke_index],
                                                  point_length_limit)
    return points

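# A hedged sketch of how the returned structure could be consumed: the keys index into
# data, the values list the points to discard. This cleanup step is hypothetical; the
# section does not show the actual caller.
#
#     outlier_points = get_outliers(data, file_name)
#     for stroke_index, point_indexes in outlier_points.items():
#         data[stroke_index] = [
#             point for point_index, point in enumerate(data[stroke_index])
#             if point_index not in point_indexes
#         ]
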
def get_outlier_points(stroke, estimated_position, limit):
    """
    Finds the group of points that is closest to the stroke's estimated location, and returns the
    list of those points that are not in this group.

    :param stroke: The inspected stroke.
    :param estimated_position: The stroke's estimated position.
    :param limit: The length limit of an edge between two vertices in the graph. The points of the
        stroke are treated as the vertices of a graph by the algorithm that groups them.
    :return: Ordered list of the outlier points' indexes.
    """
    def index_to_point(indexes, point_objects):
        """
        Gets the corresponding point objects in the stroke for the given set of indexes.

        :param indexes: Indexes to be interpreted as points.
        :param point_objects: A single stroke.
        :return: List of point objects.
        """
        return [
            point for point_index, point in enumerate(point_objects)
            if point_index in indexes
        ]

    # Two vertices are connected (stored as ones in the matrix), if their points are closer to each
    # other than the limit.
    adjacency_matrix = np.ones((len(stroke), len(stroke)))
    for row in range(len(stroke)):
        for col in range(len(stroke)):
            if row == col:
                adjacency_matrix[row, col] = -1
            elif util.point_2_point(stroke[row], stroke[col]) > limit:
                adjacency_matrix[row, col] = 0
    # The matrix is converted into a dict that stores the vertex sequence numbers as keys, and the
    # corresponding connected vertices as values.
    adjacency_list = OrderedDict()
    for index, row in enumerate(adjacency_matrix):
        adjacency_list[index] = util.find_all(row, 1)
    # The connected vertices are organised into groups.
    groups = []
    while len(adjacency_list) > 0:
        group = util.dfs(adjacency_list)
        if len(group) != 0:
            groups.append(group)
        for index in group:
            if index in adjacency_list:
                del adjacency_list[index]
    # Each group is represented by its average position.
    average_positions = []
    for group in groups:
        average_positions.append(
            util.get_average_point(index_to_point(group, stroke)))
    # The distance between each average position and the predicted location is calculated.
    distances = []
    for position in average_positions:
        distances.append(
            util.point_2_point(
                position,
                util.Point(estimated_position[1], estimated_position[2])))
    # The group closest to the predicted location is chosen; every point outside it is an outlier.
    closest_group = distances.index(min(distances))
    return [
        index for index, point in enumerate(stroke)
        if index not in groups[closest_group]
    ]

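# util.find_all and util.dfs are assumed helpers; their behaviour here is inferred from
# the call sites (find_all collects the indexes where a row equals a value, dfs peels one
# connected component off the adjacency list). A minimal sketch under those assumptions:
#
#     def find_all(row, value):
#         return [index for index, element in enumerate(row) if element == value]
#
#     def dfs(adjacency_list):
#         # Iterative depth-first search from the first remaining vertex; returns the
#         # set of vertices reachable from it.
#         component, stack = set(), [next(iter(adjacency_list))]
#         while stack:
#             vertex = stack.pop()
#             if vertex not in component:
#                 component.add(vertex)
#                 stack.extend(adjacency_list.get(vertex, []))
#         return component
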
def predict_stroke_position(stroke_index, lines, strokes):
    """
    Predicts the faulty stroke's position, based on the surrounding strokes. If the stroke is not on
    the edges of a line, it will be placed at the middle of the distance between the two adjacent
    strokes. If the stroke is the first or the final one, it will be placed at a location calculated
    from the parameters of the corresponding line.

    :param stroke_index: The index of the stroke in the stroke set.
    :param lines: The structured set of strokes, organised into lines.
    :param strokes: The set of strokes.
    :return: The sequence number of the line in which the stroke has been determined to be, and the
        x and y coordinates of the position.
    """
    def get_median_distances(line_idx):
        # Distances between the consecutive strokes' median x positions within the given line.
        x_medians = [stroke[1] for stroke in lines if stroke[0] == line_idx]
        return [
            util.point_2_point(util.Point(x_median, 0),
                               util.Point(x_medians[index + 1], 0))
            for index, x_median in enumerate(x_medians[:-1])
        ]

    # The stroke is not at the first or final index, so the values of the surrounding strokes can
    # be used.
    if len(lines) > stroke_index > 0:
        # lines[stroke_index] is a tuple whose first element is the sequence number of the line.
        # If the stroke's previous and next neighbours are in the same line, the stroke is in that
        # line as well.
        if lines[stroke_index - 1][0] == lines[stroke_index][0]:
            line_index = lines[stroke_index - 1][0]
        # If they are in different lines, the stroke must be either at the end of a line or at the
        # beginning of the next one: it is placed in the line in which it is closest to its
        # possible location.
        else:
            prev_median_y = util.get_average([
                stroke[2] for stroke in lines
                if stroke[0] == lines[stroke_index - 1][0]
            ])
            next_median_y = util.get_average([
                stroke[2] for stroke in lines
                if stroke[0] == lines[stroke_index][0]
            ])
            line_index = lines[stroke_index - 1][0] if \
                util.point_2_set(util.Point(lines[stroke_index - 1][1], prev_median_y),
                                 strokes[stroke_index]) < \
                util.point_2_set(util.Point(lines[stroke_index][1], next_median_y),
                                 strokes[stroke_index]) \
                else lines[stroke_index][0]
        distances = get_median_distances(line_index)
        # If the stroke belongs to the line of the previous stroke, its x position is that line's
        # final stroke plus the average distance between the strokes' positions in the line.
        if line_index == lines[stroke_index - 1][0]:
            x_coordinate = lines[stroke_index - 1][1] + util.get_average(distances)
        # If it is in the next line, the same principle is applied in the other direction.
        else:
            x_coordinate = lines[stroke_index][1] - util.get_average(distances)
    # The stroke is the first one in the stroke set.
    elif stroke_index == 0:
        line_index = lines[stroke_index][0]
        distances = get_median_distances(line_index)
        x_coordinate = lines[stroke_index][1] - util.get_average(distances)
    # The stroke is the final one in the set.
    else:
        line_index = lines[stroke_index - 1][0]
        distances = get_median_distances(line_index)
        x_coordinate = lines[stroke_index - 1][1] + util.get_average(distances)
    y_coordinate = util.get_average(
        [stroke[2] for stroke in lines if stroke[0] == line_index])
    return line_index, x_coordinate, y_coordinate

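# util.point_2_set and util.get_average are also assumed from the call sites: get_average
# as a plain arithmetic mean, and point_2_set as the distance from a candidate position
# to the nearest registered point of a stroke. A sketch under those assumptions:
#
#     import math
#
#     def get_average(values):
#         return sum(values) / len(values)
#
#     def point_2_set(point, stroke):
#         # One plausible reading of "distance from a point to a point set".
#         return min(math.hypot(other.x - point.x, other.y - point.y)
#                    for other in stroke)
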
def get_lines(data, faulty_strokes, file_name):
    """
    Divides the stroke set into lines.

    :param data: Set of strokes.
    :param faulty_strokes: A list of strokes that have been determined to be faulty, based on their
        stroke length to number of registered points ratio.
    :param file_name: The file that will be scanned for outliers.
    :return: The data structure containing the strokes separated according to the lines of the text.
    """
    def get_nb_eol(file):
        """
        Counts the EOLs in the text of the xml.

        :param file: The file that will be scanned for outliers.
        :return: Number of EOLs.
        """
        tree = ElementTree.parse(file)
        root = tree.getroot()
        return root.find('Transcription').find('Text').text.strip().count('\n')

    # The calculation of the distances between a text's strokes. Each stroke is represented by a
    # single number: the median of its registered points' x coordinates. This step aims to find the
    # ends of lines in the written text, so the y coordinates are not needed; the outlying distances
    # show up as large jumps of the distance values at the ends of lines.
    distances = []
    # The length statistics are created only on the correct strokes, so the anomalies in the faulty
    # strokes will not interfere with the detection of the EOLs.
    correct_strokes = [
        stroke for stroke_index, stroke in enumerate(data)
        if stroke_index not in faulty_strokes
    ]
    for stroke_index, stroke in enumerate(correct_strokes):
        # get_quartiles returns (q1, q2, q3); q2 is the median.
        median_x = util.get_quartiles([point.x for point in stroke])[1]
        if stroke_index < len(correct_strokes) - 1:
            next_median_x = util.get_quartiles(
                [point.x for point in correct_strokes[stroke_index + 1]])[1]
            distances.append(
                util.point_2_point(util.Point(median_x, 0),
                                   util.Point(next_median_x, 0)))
    distances.sort()
    # The largest distances belong to the EOLs, so the smallest of the get_nb_eol(file_name) largest
    # distances, lowered by a small epsilon, serves as the distance limit.
    length_limit = distances[-get_nb_eol(file_name)] - 0.1
    lines = []
    index = 0
    # Creation of the data structure that stores the line sequence number, and the x and y medians
    # of each stroke.
    for stroke_index, stroke in enumerate(correct_strokes):
        median_x = util.get_quartiles([point.x for point in stroke])[1]
        median_y = util.get_quartiles([point.y for point in stroke])[1]
        lines.append((index, median_x, median_y))
        if stroke_index < len(correct_strokes) - 1:
            next_median_x = util.get_quartiles(
                [point.x for point in correct_strokes[stroke_index + 1]])[1]
            if util.point_2_point(util.Point(median_x, 0),
                                  util.Point(next_median_x, 0)) > length_limit:
                index += 1
    # The faulty strokes were ignored above, since the iterated data is the list of correct strokes:
    # the extraordinary values in the faulty strokes would prevent the correct calculation of the
    # strokes' locations. They are inserted back in this step, with their predicted positions.
    faulty_strokes.sort()
    for stroke_index in faulty_strokes:
        lines.insert(stroke_index,
                     predict_stroke_position(stroke_index, lines, data))
    return lines

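# For reference, a hypothetical return value for a five-stroke, two-line text in which
# stroke 3 was faulty and re-inserted with a predicted position (all values made up):
#
#     [(0, 102.5, 480.0),   # line 0
#      (0, 155.0, 478.5),
#      (1, 98.0, 560.0),    # a large backwards x jump started line 1
#      (1, 151.2, 559.5),   # faulty stroke 3, placed by predict_stroke_position
#      (1, 204.0, 561.0)]
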