def monge_elkan(pair, feature):
    """Score two products on one feature with ``simfunctions.monge_elkan``.

    ``pair`` holds two keys into the module-level ``products`` mapping.  When
    both products contain ``feature``, element ``[0]`` of each product's value
    for that feature is lower-cased, tokenized with ``tokenizers.whitespace``
    and the two token sequences are compared.  When either product lacks the
    feature, the module-level ``noneValue`` is returned instead.
    """
    # Guard clauses keep the original short-circuit order: the second product
    # is only looked up once the first is known to carry the feature.
    if feature not in products[pair[0]]:
        return noneValue
    if feature not in products[pair[1]]:
        return noneValue

    left_text = products[pair[0]].get(feature, [''])[0].lower()
    left_tokens = tokenizers.whitespace(left_text)
    right_text = products[pair[1]].get(feature, [''])[0].lower()
    right_tokens = tokenizers.whitespace(right_text)
    return simfunctions.monge_elkan(left_tokens, right_tokens)
    def test_valid_input(self):
        """Compare ``monge_elkan`` results against fixed expected scores.

        Exercises empty-string tokens, identical tokens, similar single names,
        and a multi-token affiliation pair scored without ``sim_func`` and with
        ``needleman_wunsch`` and ``affine`` passed as ``sim_func``.
        """
        expect = self.assertEqual

        # Affiliation tokens kept as tuples; every call receives fresh lists.
        affiliation_a = ('Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,',
                         'University', 'of', 'California,', 'San', 'Diego')
        affiliation_b = ('Department', 'of', 'Computer', 'Science,', 'Univ.',
                         'Calif.,', 'San', 'Diego')

        expect(monge_elkan([''], ['']), 1.0)  # need to check this

        expect(monge_elkan([''], ['a']), 0.0)
        expect(monge_elkan(['a'], ['a']), 1.0)

        expect(monge_elkan(['Niall'], ['Neal']), 0.8049999999999999)
        expect(monge_elkan(['Niall'], ['Njall']), 0.88)

        expect(monge_elkan(list(affiliation_a), list(affiliation_b)),
               0.8364448051948052)
        expect(monge_elkan(list(affiliation_a), list(affiliation_b),
                           sim_func=needleman_wunsch),
               2.0)
        expect(monge_elkan(list(affiliation_a), list(affiliation_b),
                           sim_func=affine),
               2.25)

        expect(monge_elkan(['Niall'], ['Niel']), 0.8266666666666667)
        expect(monge_elkan(['Niall'], ['Nigel']), 0.7866666666666667)
Пример #3
0
    def test_valid_input(self):
        """Compare ``monge_elkan`` results against fixed expected scores.

        Single-token inputs are checked first, then one multi-token
        affiliation pair is scored three ways (no ``sim_func``,
        ``needleman_wunsch``, ``affine``), followed by two more name pairs.
        """

        def affiliation_score(**options):
            # Builds both token lists anew on every call and forwards any
            # keyword options (e.g. sim_func) to monge_elkan unchanged.
            return monge_elkan(
                ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University',
                 'of', 'California,', 'San', 'Diego'],
                ['Department', 'of', 'Computer', 'Science,', 'Univ.',
                 'Calif.,', 'San', 'Diego'],
                **options)

        same_empty = monge_elkan([''], [''])
        self.assertEqual(same_empty, 1.0)  # need to check this

        empty_vs_letter = monge_elkan([''], ['a'])
        self.assertEqual(empty_vs_letter, 0.0)
        same_letter = monge_elkan(['a'], ['a'])
        self.assertEqual(same_letter, 1.0)

        self.assertEqual(monge_elkan(['Niall'], ['Neal']),
                         0.8049999999999999)
        self.assertEqual(monge_elkan(['Niall'], ['Njall']), 0.88)

        self.assertEqual(affiliation_score(), 0.8364448051948052)
        self.assertEqual(affiliation_score(sim_func=needleman_wunsch), 2.0)
        self.assertEqual(affiliation_score(sim_func=affine), 2.25)

        self.assertEqual(monge_elkan(['Niall'], ['Niel']),
                         0.8266666666666667)
        self.assertEqual(monge_elkan(['Niall'], ['Nigel']),
                         0.7866666666666667)
 def test_invalid_input4(self):
     """Call ``monge_elkan`` with a token list first and ``None`` second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = ['a']
     second_arg = None
     monge_elkan(first_arg, second_arg)
Пример #5
0
 def time_small_small_wo_rep(self):
     """Run ``simfunctions.monge_elkan`` on ``_small_num_tokens_wo_rep`` paired with itself."""
     tokens = _small_num_tokens_wo_rep
     simfunctions.monge_elkan(tokens, tokens)
 def test_invalid_input1(self):
     """Call ``monge_elkan`` with the integer ``1`` for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     not_a_token_list = 1
     monge_elkan(not_a_token_list, not_a_token_list)
Пример #7
0
 def test_invalid_input3(self):
     """Call ``monge_elkan`` with ``None`` for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     missing = None
     monge_elkan(missing, missing)
Пример #8
0
def _length_features(left, right):
    """Return ``[len(left), len(right), len(left) - len(right)]``."""
    left_size = len(left)
    right_size = len(right)
    return [left_size, right_size, left_size - right_size]


def _bag_similarity_features(tokens1, tokens2):
    """Return cosine, overlap-coefficient, Monge-Elkan and TF-IDF scores.

    The four scores are returned in that order for the two token lists.
    """
    return [
        simfunctions.cosine(tokens1, tokens2),
        simfunctions.overlap_coefficient(tokens1, tokens2),
        simfunctions.monge_elkan(tokens1, tokens2),
        simfunctions.tfidf(tokens1, tokens2),
    ]


def _first_token_jaro(tokens1, tokens2):
    """Return the Jaro score between the first token of each list."""
    # NOTE(review): indexing [0] raises IndexError when either list is empty
    # (e.g. a blank attribute).  Behavior kept as-is; confirm that the
    # attributes scored this way are never blank.
    return simfunctions.jaro(tokens1[0], tokens2[0])


def _common_key_features(json1, json2):
    """Return ``(count, mean_jaccard)`` over the keys present in both records.

    For every shared key, element ``[0]`` of each record's value is tokenized
    with ``tokenizers.whitespace`` and the two token lists are compared with
    Jaccard.  Returns ``(0, 0.0)`` when no key is shared, so the mean never
    divides by zero.
    """
    shared_keys = [key for key in json1 if key in json2]
    if not shared_keys:
        return 0, 0.0
    total = 0.0
    for key in shared_keys:
        total += simfunctions.jaccard(
            tokenizers.whitespace(json1[key][0]),
            tokenizers.whitespace(json2[key][0]))
    return len(shared_keys), total / len(shared_keys)


def generate_feature(file_name):
    """Build one numeric feature vector and one label per line of a file.

    Each line read by ``stage3_helper.read_file`` is parsed into two records
    and a label.  Similarity and length features are computed, in a fixed
    order, for product type, name, segment, long description and brand,
    followed by model-string features and whole-record features.

    Args:
        file_name: Path handed to ``stage3_helper.read_file``.

    Returns:
        A ``(features, labels, lines)`` tuple: ``features`` is a list with
        one feature list per line, ``labels`` holds the matching values from
        ``stage3_helper.get_01_from_label``, and ``lines`` is the input as
        read.
    """
    lines = stage3_helper.read_file(file_name)

    features = []
    labels = []

    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)

        feature = []

        # TODO: Add more features and optimize features.

        # product_type
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_type)
        string1 = string1.lower()
        string2 = string2.lower()
        # Tokenize once per attribute instead of once per similarity call.
        tokens1 = tokenizers.whitespace(string1)
        tokens2 = tokenizers.whitespace(string2)
        feature.append(simfunctions.jaccard(tokens1, tokens2))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        feature.append(_first_token_jaro(tokens1, tokens2))
        feature.extend(_bag_similarity_features(tokens1, tokens2))
        feature.extend(_length_features(string1, string2))
        feature.append(len(tokens1))
        feature.append(len(tokens2))

        # product_name (raw values are reused below for brand prediction)
        raw_name1, raw_name2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        name1 = raw_name1.lower()
        name2 = raw_name2.lower()
        tokens1 = tokenizers.whitespace(name1)
        tokens2 = tokenizers.whitespace(name2)
        feature.append(simfunctions.jaccard(tokens1, tokens2))
        feature.append(
            simfunctions.jaro_winkler(name1, name2, prefix_weight=0.1))
        # Hamming distance is only computed for equal-length strings;
        # 5 is the fixed fallback value otherwise.
        if len(name1) == len(name2):
            feature.append(simfunctions.hamming_distance(name1, name2))
        else:
            feature.append(5)
        feature.extend(_bag_similarity_features(tokens1, tokens2))
        feature.extend(_length_features(name1, name2))
        feature.append(_first_token_jaro(tokens1, tokens2))
        feature.append(len(tokens1))
        feature.append(len(tokens2))

        # product_segment
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_segment)
        string1 = string1.lower()
        string2 = string2.lower()
        tokens1 = tokenizers.whitespace(string1)
        tokens2 = tokenizers.whitespace(string2)
        feature.append(
            simfunctions.jaccard(tokenizers.qgram(string1, 3),
                                 tokenizers.qgram(string2, 3)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        feature.extend(_bag_similarity_features(tokens1, tokens2))
        feature.append(_first_token_jaro(tokens1, tokens2))
        feature.extend(_length_features(string1, string2))
        feature.append(len(tokens1))
        feature.append(len(tokens2))

        # product_long_description
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_long_description)
        if string1 is None or string2 is None:
            # Fixed placeholder values when either description is missing.
            feature.extend([0.5, 0, 0, 0, 0])
        else:
            string1 = stage3_helper.clean_stop_word(
                stage3_helper.cleanhtml(string1.lower()))
            string2 = stage3_helper.clean_stop_word(
                stage3_helper.cleanhtml(string2.lower()))
            tokens1 = tokenizers.whitespace(string1)
            tokens2 = tokenizers.whitespace(string2)
            feature.append(simfunctions.jaccard(tokens1, tokens2))
            feature.append(simfunctions.overlap_coefficient(tokens1, tokens2))
            feature.extend(_length_features(string1, string2))

        # product_brand (falls back to a brand predicted from the raw name)
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_brand)
        if string1 is None or string1 == '':
            string1 = get_predict_brand(raw_name1)
        if string2 is None or string2 == '':
            string2 = get_predict_brand(raw_name2)

        if string1 is None or string2 is None:
            feature.extend([0, 0, 0, 0, 0, 0, 0, 0])
        else:
            tokens1 = tokenizers.whitespace(string1)
            tokens2 = tokenizers.whitespace(string2)
            feature.append(simfunctions.jaccard(tokens1, tokens2))
            feature.append(
                simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
            feature.append(simfunctions.overlap_coefficient(tokens1, tokens2))
            feature.append(simfunctions.monge_elkan(tokens1, tokens2))
            feature.append(simfunctions.tfidf(tokens1, tokens2))
            feature.extend(_length_features(string1, string2))

        # Contains similar model names.
        model_strs1 = stage3_helper.find_model_str(name1)
        model_strs2 = stage3_helper.find_model_str(name2)
        if len(model_strs1) > 0 and len(model_strs2) > 0:
            feature.append(simfunctions.jaccard(model_strs1, model_strs2))
        else:
            feature.append(0.5)
        # The difference compares the two model-string counts; it previously
        # subtracted the character length of the second product name.
        feature.extend(_length_features(model_strs1, model_strs2))

        # Other features.
        common, common_score = _common_key_features(json1, json2)
        dumped1 = json.dumps(json1)
        dumped2 = json.dumps(json2)
        feature.extend(_length_features(json1, json2))
        feature.append(common)
        feature.append(common_score)
        feature.extend(_length_features(dumped1, dumped2))
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(dumped1),
                                 tokenizers.whitespace(dumped2)))

        # Add one feature and label.
        features.append(feature)
        labels.append(stage3_helper.get_01_from_label(label))

    return features, labels, lines
Пример #9
0
 def test_invalid_input4(self):
     """Call ``monge_elkan`` with a token list first and ``None`` second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     valid_tokens = ['a']
     missing = None
     monge_elkan(valid_tokens, missing)
Пример #10
0
 def test_invalid_input2(self):
     """Call ``monge_elkan`` with ``None`` first and a token list second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     missing = None
     valid_tokens = ['b']
     monge_elkan(missing, valid_tokens)
Пример #11
0
 def test_invalid_input5(self):
     """Call ``monge_elkan`` with a token list first and a plain string second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     valid_tokens = ['temp']
     bare_string = 'temp'
     monge_elkan(valid_tokens, bare_string)
Пример #12
0
 def test_invalid_input1(self):
     """Call ``monge_elkan`` with the integer ``1`` for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = 1
     second_arg = 1
     monge_elkan(first_arg, second_arg)
Пример #13
0
        matched2 = CaSe_re.search(item[1]).group(0)
    except:
        matched2 = None
    for idx, brand in enumerate(brands_re):
        if (item[1] == None):
            match = None
        else:
            match = brand.search(item[1])
        if match is not None:
            if matched2 == None:
                matched2 = match.group(0)
            else:
                matchIndex = refurbished_re.sub('',
                                                item[1]).index(match.group(0))
                matchedIndex = refurbished_re.sub('', item[1]).index(matched2)
                if matchIndex <= matchedIndex:
                    matched2 = match.group(0)
    if matched2 is None:
        try:
            matched2 = CASE_re.search(item[0]).group(0)
        except:
            matched2 = ' '
    brandList[i].append(matched2)
    i = i + 1

# Monge-Elkan score for every pair in brandList, computed on the
# whitespace tokens of the pair's first two entries.
monge_elkan_measure = [
    simfunctions.monge_elkan(tokenizers.whitespace(pair[0]),
                             tokenizers.whitespace(pair[1]))
    for pair in brandList
]
# Parenthesized single-argument form prints the same on Python 2 and is
# also valid on Python 3, unlike the bare print statement.
print(monge_elkan_measure)
 def test_invalid_input2(self):
     """Call ``monge_elkan`` with ``None`` first and a token list second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = None
     second_arg = ['b']
     monge_elkan(first_arg, second_arg)
 def test_invalid_input5(self):
     """Call ``monge_elkan`` with a token list first and a plain string second.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = ['temp']
     second_arg = 'temp'
     monge_elkan(first_arg, second_arg)
 def test_invalid_input4(self):
     """Call ``monge_elkan`` with plain strings for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     bare_string = "temp"
     monge_elkan(bare_string, bare_string)
Пример #17
0
 def time_small_medium_wi_rep(self):
     """Run ``simfunctions.monge_elkan`` on ``_small_num_tokens_wi_rep`` and ``_med_num_tokens_wi_rep``."""
     small_tokens = _small_num_tokens_wi_rep
     medium_tokens = _med_num_tokens_wi_rep
     simfunctions.monge_elkan(small_tokens, medium_tokens)
Пример #18
0
 def time_large_large_wi_rep(self):
     """Run ``simfunctions.monge_elkan`` on ``_large_num_tokens_wi_rep`` paired with itself."""
     tokens = _large_num_tokens_wi_rep
     simfunctions.monge_elkan(tokens, tokens)
Пример #19
0
 def time_medium_medium_wo_rep(self):
     """Run ``simfunctions.monge_elkan`` on ``_med_num_tokens_wo_rep`` paired with itself."""
     tokens = _med_num_tokens_wo_rep
     simfunctions.monge_elkan(tokens, tokens)
 def test_invalid_input3(self):
     """Call ``monge_elkan`` with ``None`` for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = None
     second_arg = None
     monge_elkan(first_arg, second_arg)
Пример #21
0
 def test_invalid_input4(self):
     """Call ``monge_elkan`` with plain strings for both arguments.

     NOTE(review): presumably the call is expected to raise; any
     expected-exception decorator is outside this view -- confirm.
     """
     first_arg = "temp"
     second_arg = "temp"
     monge_elkan(first_arg, second_arg)