Example #1
 def score_operation(self, operation) -> float:
     if isinstance(operation, Constant):
         return 1.0
     transformed_column = Column(values=operation.transform())
     target_column = Column(values=operation.target_token.values)
     return (values_jaccard(transformed_column, target_column) +
             syntactic_sim(transformed_column, target_column)) / 2.0
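The heuristic variant averages two signals: value overlap and syntactic similarity between the transformed column and the target column. A minimal standalone sketch of that idea, assuming values_jaccard is the usual set-based Jaccard over the column values and syntactic_sim is a character-level ratio (the project helpers may be defined differently; the _sketch names are hypothetical):

from difflib import SequenceMatcher
from typing import List


def values_jaccard_sketch(values1: List[str], values2: List[str]) -> float:
    # Jaccard index over the distinct values of the two columns.
    set1, set2 = set(values1), set(values2)
    if not set1 and not set2:
        return 0.0
    return len(set1 & set2) / len(set1 | set2)


def syntactic_sim_sketch(values1: List[str], values2: List[str]) -> float:
    # Character-level similarity of the concatenated column texts.
    return SequenceMatcher(None, " ".join(values1), " ".join(values2)).ratio()


def score_sketch(values1: List[str], values2: List[str]) -> float:
    # Average of the two signals, mirroring score_operation above.
    return (values_jaccard_sketch(values1, values2)
            + syntactic_sim_sketch(values1, values2)) / 2.0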
Example #2
 def score_operation(self, operation) -> float:
     if isinstance(operation, Constant):
         return 1.0
     transformed_column = Column(values=operation.transform())
     target_column = Column(values=operation.target_token.values)
     result = self.scoring_model.predict_similarity(transformed_column,
                                                    target_column)
     return result
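Here the score comes from a learned model instead of the heuristics. The model's interface, inferred only from the call sites in these examples (predict_similarity above, train_from_pairs in Example #3), can be sketched as a Protocol; the project's concrete model (MultiBinary appears later) may expose more than this, and Column is the project's own column type:

from typing import List, Protocol, Tuple


class ScoringModelProtocol(Protocol):
    def train_from_pairs(
        self, labeled_pairs: List[Tuple["Column", "Column", bool]]
    ) -> None:
        # Fit on (column, column, same-token?) triples.
        ...

    def predict_similarity(self, col1: "Column", col2: "Column") -> float:
        # Return a similarity score for a pair of columns.
        ...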
Example #3
    def train_scoring_model(self,
                            example_patterns_by_groups: List[List[Pattern]]):
        labeled_pairs = []

        for example_patterns in example_patterns_by_groups:
            labeled_cols: List[Tuple[str, Column]] = []
            idx = 0
            for pattern in example_patterns:
                for token in pattern.tokens:
                    if len(token.values) <= 2:
                        continue
                    idx += 1

                    # Split the token's values into two halves; both halves keep
                    # the same label because they come from the same token.
                    kf = KFold(n_splits=2)
                    for _, fold_indices in kf.split(token.values):
                        fold_values = [token.values[index] for index in fold_indices]
                        column = Column(str(idx), str(idx), fold_values)
                        labeled_cols.append((str(idx), column))

            # Columns sharing a label form positive pairs; all other pairs are negative.
            for label1, col1 in labeled_cols:
                for label2, col2 in labeled_cols:
                    labeled_pairs.append((col1, col2, label1 == label2))
        try:
            self.scoring_model.train_from_pairs(labeled_pairs)
        except Exception as e:
            logger.error(e)
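train_scoring_model turns the example patterns into training pairs: each sufficiently long token is split into two halves with KFold, both halves carry the token's label, and every pair of labelled columns becomes a positive or negative example. A minimal illustration of the splitting step, assuming scikit-learn's KFold and made-up values:

from sklearn.model_selection import KFold

values = ["a1", "a2", "a3", "a4"]
halves = []
for _, fold_indices in KFold(n_splits=2).split(values):
    halves.append([values[i] for i in fold_indices])

# halves == [["a1", "a2"], ["a3", "a4"]]; both halves share a label, so the
# pair (halves[0], halves[1]) is a positive example, while halves taken from
# different tokens would form negative pairs.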
Example #4
    def to_cols(self):
        columns = []

        for idx, token in enumerate(self.tokens):
            column = Column(str(idx), str(idx), token.values)
            columns.append(column)

        return columns
Example #5
def text_jaccard(col1: Column, col2: Column) -> float:
    col1_array = np.array(col1.text().split(" "))
    col2_array = np.array(col2.text().split(" "))

    return jaccard(col1_array, col2_array)
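text_jaccard splits each column's text() on spaces and passes the token arrays to the project's jaccard helper. Assuming that helper is the standard set-based index, the computation reduces to the following (token strings made up for illustration):

tokens1 = set("john smith jr".split(" "))   # {"john", "smith", "jr"}
tokens2 = set("jane smith".split(" "))      # {"jane", "smith"}
print(len(tokens1 & tokens2) / len(tokens1 | tokens2))  # 1 shared / 4 total = 0.25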
Example #6
def text_cosine(col1: Column, col2: Column) -> float:
    return cosine(col1.text(), col2.text())
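text_cosine hands the raw text of both columns to the project's cosine helper. A plausible standalone equivalent over word-count vectors, assuming a bag-of-words definition (the real helper may use a different representation; cosine_sketch is a hypothetical name):

from collections import Counter
from math import sqrt


def cosine_sketch(text1: str, text2: str) -> float:
    # Cosine similarity of word-count vectors built from the two texts.
    c1, c2 = Counter(text1.split()), Counter(text2.split())
    dot = sum(c1[word] * c2[word] for word in c1)
    norm = sqrt(sum(v * v for v in c1.values())) * sqrt(sum(v * v for v in c2.values()))
    return dot / norm if norm else 0.0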
    def learn(self, original_tree: PatternTree, target_tree: PatternTree):
        self.token_mapper.train_scoring_model(
            original_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
            + target_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
        )

        final_node_to_result: Dict[PatternNode, TransformedResult] = {}

        target_col = Column(values=target_tree.values)
        best_score, best_level = 0, 0

        # Try mapping at layers 3, 2 and 1 and keep the best-scoring layer.
        for layer in [3, 2, 1]:
            transformed_col = Column()

            original_nodes = sorted(
                original_tree.node_in_layers[layer],
                key=lambda x: len(x.value.tokens),
                reverse=True,
            )
            transformed_nodes = sorted(
                target_tree.node_in_layers[layer],
                key=lambda x: len(x.value.tokens),
                reverse=True,
            )

            node_to_result: Dict[PatternNode, TransformedResult] = {}

            for original_node in original_nodes:
                transformed_results = []
                for target_node in transformed_nodes:
                    result = self.token_mapper.learn(
                        original_node.value, target_node.value
                    )

                    transformed_results.append(
                        TransformedResult(
                            original_target_pairs=result[0], score=result[1]
                        )
                    )

                node_to_result[original_node] = max(
                    transformed_results, key=lambda x: x.score
                )

                value_tuples = list(
                    zip(*node_to_result[original_node].original_target_pairs)
                )
                transformed_col.extend_values(list(value_tuples[1]))

            pattern_score = self.token_mapper.scoring_model.predict_similarity(
                target_col, transformed_col
            )

            if pattern_score > best_score:
                best_score = pattern_score
                best_level = layer
                final_node_to_result = node_to_result.copy()

        validated_original_to_transformed_tuples = []

        scores = []

        for node, result in final_node_to_result.items():

            for original_value, transformed_value in result.original_target_pairs:
                validation_result = self.validator.validate_result(
                    transformed_value,
                    original_tree,
                    target_tree,
                    result.score,
                    best_level,
                )
                validated_original_to_transformed_tuples.append(
                    (original_value, transformed_value, validation_result)
                )

            scores.append(result.score)

        return validated_original_to_transformed_tuples, sum(scores) / len(scores)
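Both learn and learn_top_k (below) wrap each candidate mapping in a TransformedResult carrying the (original, transformed) value pairs and a score. A minimal sketch of that container, inferred from how it is constructed in these methods; the project's actual class may carry more:

from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class TransformedResult:
    original_target_pairs: List[Tuple[str, str]]
    score: float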
    def learn_top_k(self, original_tree: PatternTree, target_tree: PatternTree, k: int):
        self.token_mapper.train_scoring_model(
            original_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
            + target_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
        )

        final_node_to_results: Dict[PatternNode, List[TransformedResult]] = {}
        target_col = Column(values=target_tree.values)
        best_score, best_level = -1, 0

        for layer in range(len(PATTERNS_BY_LEVEL) - 1, 0, -1):
            logger.debug("Mapping level: %s" % layer)

            transformed_col = Column()

            original_nodes = sorted(
                original_tree.node_in_layers[layer],
                key=lambda x: (len(x.value.values), -len(x.value.tokens)),
                reverse=True,
            )

            transformed_nodes = sorted(
                target_tree.node_in_layers[layer],
                key=lambda x: (len(x.value.values), -len(x.value.tokens)),
                reverse=True,
            )

            node_to_results = {}
            pattern_scores = []

            logger.debug(
                "Num combinations: %s * %s = %s",
                len(original_nodes),
                len(transformed_nodes),
                len(original_nodes) * len(transformed_nodes),
            )

            # Only the first num_possible_nodes original nodes get a full mapping search.
            num_possible_nodes = 10

            for original_node in original_nodes[:num_possible_nodes]:
                token_mapping_results = []

                for target_node in transformed_nodes:
                    validated_values_by_pattern, scores_by_pattern = self.token_mapper.learn_top_k(
                        original_node.value, target_node.value, k
                    )

                    token_mapping_results.append(
                        [
                            TransformedResult(*result)
                            for result in zip(
                                validated_values_by_pattern, scores_by_pattern
                            )
                        ]
                    )

                node_to_results[original_node] = max(
                    token_mapping_results, key=lambda x: x[0].score
                )

                pattern_scores.append(
                    max([x.score for x in node_to_results[original_node]])
                )

                value_tuples = list(
                    zip(*node_to_results[original_node][0].original_target_pairs)
                )
                transformed_col.extend_values(list(value_tuples[1]))

            # Nodes beyond the candidate budget fall back to a single empty mapping.
            for original_node in original_nodes[num_possible_nodes:]:
                transform_result = TransformedResult(
                    [(str_value, "") for str_value in original_node.value.values],
                    0.0,
                )
                node_to_results[original_node] = [transform_result]
                value_tuples = list(
                    zip(*node_to_results[original_node][0].original_target_pairs)
                )
                transformed_col.extend_values(list(value_tuples[1]))

            if self.mapping_method == "sim":
                assert isinstance(self.token_mapper.scoring_model, MultiBinary)
                pattern_score = (
                    self.token_mapper.scoring_model.predict_similarity(
                        target_col, transformed_col
                    )
                    * len([x for x in transformed_col.values if x])
                    * 1.0
                    / len(transformed_col.values)
                )
            else:
                pattern_score = np.mean(pattern_scores)


            if pattern_score > best_score:
                best_score = pattern_score
                best_level = layer
                final_node_to_results = node_to_results.copy()

        validated_values_by_pattern = [[] for _ in range(len(final_node_to_results))]
        scores_by_pattern = [[] for _ in range(len(final_node_to_results))]

        full_validation_result = False

        for idx, (node, results) in enumerate(final_node_to_results.items()):
            full_validation_result = (
                full_validation_result
                or self.validator.validate_results(
                    results[0].original_target_pairs,
                    target_tree,
                    results[0].score,
                    0 if len(results) == 1 else results[1].score,
                    1,
                )[0]
            )

            for idx1, result in enumerate(results):
                for idx2, (original_value, transformed_value) in enumerate(
                    result.original_target_pairs
                ):
                    if len(results) == 1:
                        current_score = 1
                        next_score = 0
                    elif idx1 < len(results) - 1:
                        current_score = result.score
                        next_score = results[idx1 + 1].score
                    else:
                        current_score = 0
                        next_score = 0
                    validation_result = self.validator.validate_results(
                        [(original_value, transformed_value)],
                        target_tree,
                        current_score,
                        next_score,
                        1,
                    )
                    if idx1 == 0:
                        validated_values_by_pattern[idx].append(
                            (original_value, [transformed_value], [validation_result])
                        )
                    else:
                        current_values = validated_values_by_pattern[idx][idx2][0]
                        assert (
                            original_value == current_values
                        ), f"Original value should be the same: {original_value} vs {current_values}"
                        validated_values_by_pattern[idx][idx2][1].append(
                            transformed_value
                        )
                        validated_values_by_pattern[idx][idx2][2].append(
                            validation_result
                        )

                scores_by_pattern[idx].append(result.score)

        return validated_values_by_pattern, scores_by_pattern, full_validation_result
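A hedged sketch of how the three return values fit together, following the structure built above: one entry per mapped node, and for each original value a ranked list of candidate transformations plus per-candidate validation results (summarize_top_k is a hypothetical helper name):

def summarize_top_k(validated_values_by_pattern, scores_by_pattern, full_validation_result):
    # Walk the per-node results produced by learn_top_k and print the top candidate.
    for node_idx, entries in enumerate(validated_values_by_pattern):
        for original_value, candidates, validations in entries:
            best = candidates[0] if candidates else ""
            print(node_idx, repr(original_value), "->", repr(best), validations)
    print("scores by node:", scores_by_pattern)
    print("fully validated:", full_validation_result)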