示例#1
0
def get_point_distance_limit(data, time=False):
    """
    Creates the statistics on the data, by the measurement of the distance
    between the registered points in each stroke.
    :param data: Stroke set.
    :param time: Flag, whether the average distance should be normalized by the delta time.
    :return: Length limit, that defines the outlier distance.
    """
    distances = []
    for stroke in data:
        for point_index in range(len(stroke) - 1):
            if time:
                # Each element of a timed stroke is a (point, timestamp) pair.
                delta_time = (stroke[point_index + 1][1] -
                              stroke[point_index][1])
                # Duplicate timestamps (faulty xml data) would cause a zero
                # division; substitute 0.01 as the delta, consistently with
                # get_stroke_length_limit and get_faulty_strokes.
                distances.append(
                    util.point_2_point(stroke[point_index][0],
                                       stroke[point_index + 1][0]) /
                    (delta_time if delta_time != 0 else 0.01))
            else:
                distances.append(
                    util.point_2_point(stroke[point_index],
                                       stroke[point_index + 1]))

    q1, q2, q3 = util.get_quartiles(distances)

    # Upper Tukey fence (q3 + 1.5 * IQR) scaled by the module-level multiplier.
    return (q3 + 1.5 * (q3 - q1)) * limit_multiplier
示例#2
0
def get_stroke_parameters(stroke, file_index, stroke_index):
    """
    Calculates the parameters of a stroke and concatenates it with the predetermined horizontal value.
    :param stroke: A single stroke of the text.
    :param file_index: The index of the file, which contains the currently processed stroke.
    :param stroke_index: The index of the stroke in the StrokeSet.
    :return: The calculated parameters of the stroke. The values are None,
         if the stroke is too short.
    """
    h_line_avg_distance = 0
    d_line_avg_distance = 0
    stroke_length = 0
    avg_degree = 0
    # In case of comas, dots, or xml errors the stroke is marked with null values, and will be removed from the
    # training data in the next processing step.
    if len(stroke) < 3:
        return None, None, None, 0, None

    for index in range(len(stroke)):
        try:
            if index < len(stroke) - 1:
                stroke_length += util.point_2_point(
                    stroke[index], stroke[index + 1])
                # Average absolute angle between the first point and every
                # subsequent point of the stroke.
                avg_degree += math.fabs(
                    util.calculate_angle(
                        stroke[0], stroke[index + 1])) / (len(stroke) - 1)
            if 0 < index < len(stroke) - 1:
                # Average distance of the stroke's points from the line,
                # that connects the first and the final point.
                d_line_avg_distance += util.point_2_line(
                    stroke[0], stroke[-1],
                    stroke[index]) / (len(stroke) - 2)
            if index > 0:
                # Average distance of the stroke's points from the horizontal line,
                # that goes through the first point.
                h_line_avg_distance += util.point_2_line(
                    stroke[0], util.Point(stroke[0].x + 1, stroke[0].y),
                    stroke[index]) / (len(stroke) - 1)

        except ZeroDivisionError:
            # In case of division error, that occurs during the calculation of the angle (due to faulty xml data)
            # ignore the point and move to the next.
            pass

    return (avg_degree, h_line_avg_distance, d_line_avg_distance,
            stroke_length, is_horizontal(file_index, stroke_index))
示例#3
0
    def get_faulty_strokes(time_data, limit):
        """
        Finds the faulty strokes in the data, by comparing the distance values to the limit.
        :param time_data: The stroke set, with time labels.
        :param limit: Length limit.
        :return: List of the faulty strokes.
        """
        faulty = []
        for idx, stroke in enumerate(time_data):
            # Total travelled distance along the stroke; each element of a
            # timed stroke is a (point, timestamp) pair.
            total = sum(
                util.point_2_point(stroke[i][0], stroke[i + 1][0])
                for i in range(len(stroke) - 1))
            # Duplicate timestamps would cause a zero division; substitute 0.01.
            delta = stroke[-1][1] - stroke[0][1]
            if total / (delta if delta != 0 else 0.01) > limit:
                faulty.append(idx)

        return faulty
示例#4
0
    def get_stroke_length_limit(time_data):
        """
        Calculates the length limit for the strokes. Their length is divided by the delta time,
        to adjust to the faulty data.
        :param time_data: The stroke set, with time labels.
        :return: Length limit times the error threshold.
        """
        normalized_lengths = []
        for stroke in time_data:
            # Total travelled distance along the stroke; each element of a
            # timed stroke is a (point, timestamp) pair.
            length = sum(
                util.point_2_point(stroke[i][0], stroke[i + 1][0])
                for i in range(len(stroke) - 1))
            # Duplicate timestamps would cause a zero division; substitute 0.01.
            delta = stroke[-1][1] - stroke[0][1]
            normalized_lengths.append(length / (delta if delta != 0 else 0.01))

        q1, q2, q3 = util.get_quartiles(normalized_lengths)

        # Upper Tukey fence scaled by the module-level multiplier.
        return (q3 + 1.5 * (q3 - q1)) * limit_multiplier
示例#5
0
def get_outliers(data, file_name):
    """
    Discovers the strokes with extraordinarily large distance to registered points ratios in the given
    stroke set, then finds the points, that cause the anomaly.
    :param data: The stroke set of the text.
    :param file_name: The file that will be scanned for outliers.
    :return: Ordered dictionary mapping the faulty strokes' indexes to the
        lists of their outlier points' indexes.
    """
    def get_stroke_length_limit(time_data):
        """
        Calculates the length limit for the strokes. Their length is divided by the delta time,
        to adjust to the faulty data.
        :param time_data: The stroke set, with time labels.
        :return: Length limit times the error threshold.
        """
        distances = []
        for element in time_data:
            distance = 0
            for point_index, p in enumerate(element[:-1]):
                distance += util.point_2_point(p[0],
                                               element[point_index + 1][0])
            # Duplicate timestamps would cause a zero division; use 0.01.
            distances.append(
                distance /
                (element[-1][1] -
                 element[0][1] if element[-1][1] != element[0][1] else 0.01))

        q1, q2, q3 = util.get_quartiles(distances)

        # Upper Tukey fence scaled by the module-level multiplier.
        return (q3 + 1.5 * (q3 - q1)) * limit_multiplier

    def get_faulty_strokes(time_data, limit):
        """
        Finds the faulty strokes in the data, by comparing the distance values to the limit.
        :param time_data: The stroke set, with time labels.
        :param limit: Length limit.
        :return: List of the faulty strokes.
        """
        f_strokes = []
        for element_index, element in enumerate(time_data):
            distance = 0
            for point_index, p in enumerate(element[:-1]):
                distance += util.point_2_point(p[0],
                                               element[point_index + 1][0])
            if distance / (element[-1][1] - element[0][1] if
                           element[-1][1] != element[0][1] else 0.01) > limit:
                f_strokes.append(element_index)

        return f_strokes

    # Calculating the limit of the distance between two points in each stroke.
    timed_data = build_structure(file_name, time=True)
    # The distances are divided by the delta time between sampling, to adjust to the uneven periods.
    normalized_length_limit = get_stroke_length_limit(timed_data)

    faulty_strokes = get_faulty_strokes(timed_data, normalized_length_limit)
    # A stroke is also marked as faulty when any single normalized jump
    # between two consecutive points exceeds twice the limit.
    for stroke_index, stroke in enumerate(timed_data):
        for index in range(len(stroke) - 1):
            delta = (stroke[index + 1][1] - stroke[index][1]
                     if stroke[index + 1][1] != stroke[index][1] else 0.01)
            if (util.point_2_point(stroke[index][0], stroke[index + 1][0])
                    / delta > normalized_length_limit * 2
                    and stroke_index not in faulty_strokes):
                faulty_strokes.append(stroke_index)

    # Dividing the strokes into lines.
    lines = get_lines(data, faulty_strokes, file_name)

    # Calculating the limit of distance between two sequential points in the set of strokes.
    point_length_limit = get_point_distance_limit(data)

    # Ordered dictionary of faulty strokes indexes as keys, and lists of the faulty points' indexes as values.
    points = OrderedDict()
    for stroke_index in faulty_strokes:
        points[stroke_index] = get_outlier_points(data[stroke_index],
                                                  lines[stroke_index],
                                                  point_length_limit)

    return points
示例#6
0
def get_outlier_points(stroke, estimated_position, limit):
    """
    Finds the group of points, that is the closest to the stroke's estimated location,
    and returns the list of those points, which are not in this group.
    :param stroke: The inspected stroke.
    :param estimated_position: The stroke's estimated position.
    :param limit: The length limit of an edge between two vertices in the graph. The graph consists of
    the points of the stroke and it is represented as a graph for the algorithm that groups the points.
    :return: Ordered list of the outlier points' indexes.
    """
    def index_to_point(indexes, point_objects):
        """
        Gets the corresponding point objects in the stroke for the given set of indexes.
        :param indexes: Indexes to be interpreted as points.
        :param point_objects: A single stroke.
        :return: List of point objects.
        """
        return [
            point for point_index, point in enumerate(point_objects)
            if point_index in indexes
        ]

    vertex_count = len(stroke)
    # Connected vertices are marked with ones; -1 marks the diagonal and 0
    # marks pairs of points farther apart than the limit.
    adjacency_matrix = np.ones((vertex_count, vertex_count))
    for row in range(vertex_count):
        for col in range(vertex_count):
            if row == col:
                adjacency_matrix[row][col] = -1
            elif util.point_2_point(stroke[row], stroke[col]) > limit:
                adjacency_matrix[row][col] = 0

    # The matrix is converted into a dict, that stores the vertex sequence numbers as keys, and
    # the corresponding connected vertices as values.
    adjacency_list = OrderedDict(
        (index, util.find_all(row, 1))
        for index, row in enumerate(adjacency_matrix))

    # The connected vertices are organised into groups.
    groups = []
    while adjacency_list:
        group = util.dfs(adjacency_list)
        if group:
            groups.append(group)
        for vertex in group:
            adjacency_list.pop(vertex, None)

    # Each group is represented by the average position of its points.
    average_positions = [
        util.get_average_point(index_to_point(group, stroke))
        for group in groups
    ]

    # Distance between every group's average position and the predicted location.
    predicted = util.Point(estimated_position[1], estimated_position[2])
    distances = [
        util.point_2_point(position, predicted)
        for position in average_positions
    ]

    # The group closest to the predicted location is kept; every point
    # outside of it is reported as an outlier.
    closest_group = distances.index(min(distances))

    return [
        index for index in range(len(stroke))
        if index not in groups[closest_group]
    ]
示例#7
0
def predict_stroke_position(stroke_index, lines, strokes):
    """
    Predicts the faulty stroke's position, based on the surrounding strokes.
    If the stroke is not on the edges of a line, it will be placed at the middle of
    the distance between the two adjacent strokes. If the stroke is the first or the final
    one, it will be placed at a location calculated by the parameters of the corresponding
    line.
    :param stroke_index: The index of the stroke in the stroke set.
    :param lines: The structured set of strokes, organised into lines.
    :param strokes: The set of strokes.
    :return: The sequence number of the line, in which the stroke has been determined to be in.
    The x and the y coordinates of the position.
    """
    def _median_x_gaps(line_index):
        """Distances between the consecutive strokes' median x positions in the given line."""
        x_medians = [stroke[1] for stroke in lines if stroke[0] == line_index]
        return [
            util.point_2_point(util.Point(x_medians[i], 0),
                               util.Point(x_medians[i + 1], 0))
            for i in range(len(x_medians) - 1)
        ]

    # The stroke is not at the first or final index. The values of the surrounding strokes can be used.
    if len(lines) > stroke_index > 0:

        # lines[stroke_index] is a tuple, of which first element is the sequence number of the line.
        # If the stroke's previous and next neighbours are in the same line, then the stroke is in that line.
        if lines[stroke_index - 1][0] == lines[stroke_index][0]:
            line_index = lines[stroke_index - 1][0]
        # If they are in different lines, then the stroke must be either at the end of the line or at the beginning.
        else:
            prev_median_y = util.get_average([
                stroke[2] for stroke in lines
                if stroke[0] == lines[stroke_index - 1][0]
            ])
            next_median_y = util.get_average([
                stroke[2] for stroke in lines
                if stroke[0] == lines[stroke_index][0]
            ])
            # The stroke will be placed in the line, in which the stroke is closest to its possible location.
            line_index = lines[stroke_index - 1][0] if\
                util.point_2_set(util.Point(lines[stroke_index - 1][1], prev_median_y),
                                 strokes[stroke_index]) <\
                util.point_2_set(util.Point(lines[stroke_index][1], next_median_y),
                                 strokes[stroke_index]) else lines[stroke_index][0]

        distances = _median_x_gaps(line_index)

        # If the stroke was determined to be in the line of the previous stroke, then its x position is calculated by
        # adding the average of distances between the strokes' positions in that line, to the final stroke of the line.
        if line_index == lines[stroke_index - 1][0]:
            x_coordinate = lines[stroke_index -
                                 1][1] + util.get_average(distances)
        # If its in the next line, the same principle is applied.
        else:
            x_coordinate = lines[stroke_index][1] - util.get_average(distances)

    # The stroke is the first in the stroke set.
    elif stroke_index == 0:
        line_index = lines[stroke_index][0]
        x_coordinate = lines[stroke_index][1] - util.get_average(
            _median_x_gaps(line_index))

    # The stroke is the final stroke in the set.
    else:
        line_index = lines[stroke_index - 1][0]
        x_coordinate = lines[stroke_index - 1][1] + util.get_average(
            _median_x_gaps(line_index))

    y_coordinate = util.get_average(
        [stroke[2] for stroke in lines if stroke[0] == line_index])

    return line_index, x_coordinate, y_coordinate
示例#8
0
def get_lines(data, faulty_strokes, file_name):
    """
     Divides the stroke set into lines.
    :param data: Set of strokes.
    :param faulty_strokes: A list of strokes that have been determined as faulty,
    based on their stroke length to number of registered points ratio.
    :param file_name: The file that will be scanned for outliers.
    :return: The data structure containing the strokes separated according to the lines of the text.
    """
    def get_nb_eol(file):
        """
        Counts the EOLs in the text of the xml.
        :param file: The file that will be scanned for outliers.
        :return: Number of EOLs.
        """
        root = ElementTree.parse(file).getroot()

        return root.find('Transcription').find('Text').text.strip().count('\n')

    # The calculation of the distances between a text's strokes. The strokes are represented as a single number,
    # the median value of the registered points' x coordinates. This step is directed to find the end of lines
    # in the written text, hence the values of the y coordinates are not necessary, since the outlying distances
    # can be found as the large jumps of distance values at the end the of lines.

    # The length statistics are created only on the correct strokes, so the anomalies in the faulty strokes
    # will not interfere with the detection of EOL.
    correct_strokes = [
        stroke for stroke_index, stroke in enumerate(data)
        if stroke_index not in faulty_strokes
    ]

    # Median x coordinate of every correct stroke.
    median_xs = [
        util.get_quartiles([point.x for point in stroke])[2]
        for stroke in correct_strokes
    ]

    # Distances between the consecutive strokes' median x positions.
    distances = [
        util.point_2_point(util.Point(median_xs[i], 0),
                           util.Point(median_xs[i + 1], 0))
        for i in range(len(median_xs) - 1)
    ]

    # The largest distances will be the EOLs, so the last get_nb_eol(file_name)th element will be the distance limit.
    length_limit = sorted(distances)[-get_nb_eol(file_name)] - 0.1

    lines = []
    line_number = 0

    # Creation of the data structure, that stores the line sequence number, the x and the y median values of a stroke.
    for stroke_index, stroke in enumerate(correct_strokes):
        median_y = util.get_quartiles([point.y for point in stroke])[2]

        lines.append((line_number, median_xs[stroke_index], median_y))

        # A jump larger than the limit between this stroke and the next one
        # marks the end of the current line.
        if (stroke_index < len(correct_strokes) - 1
                and distances[stroke_index] > length_limit):
            line_number += 1
        # The list of faulty strokes are ignored in this step, since the iterated data is the list of correct strokes.
        # Reason for this is the extraordinary values in the faulty strokes, which prevent the correct calculation of
        # the stroke's location.

    # In-place sort, so the insertions below happen in ascending index order.
    faulty_strokes.sort()

    # The faulty strokes are inserted into the list in this step, with the predicted locations.
    for stroke_index in faulty_strokes:
        lines.insert(stroke_index,
                     predict_stroke_position(stroke_index, lines, data))

    return lines