def test_invalid_input3(self):
     # Calls levenshtein with None for both arguments.
     # NOTE(review): no assertion or expected-exception marker is visible
     # on this method -- confirm how the outcome is checked.
     missing = None
     levenshtein(missing, missing)
 def test_valid_input(self):
     # http://oldfashionedsoftware.com/tag/levenshtein-distance/
     # Each row is (first string, second string, expected distance).
     # Rows are checked in order; the first mismatch fails the test.
     expectations = [
         ('a', '', 1),
         ('', 'a', 1),
         ('abc', '', 3),
         ('', 'abc', 3),
         ('', '', 0),
         ('a', 'a', 0),
         ('abc', 'abc', 0),
         ('', 'a', 1),
         ('a', 'ab', 1),
         ('b', 'ab', 1),
         ('ac', 'abc', 1),
         ('abcdefg', 'xabxcdxxefxgx', 6),
         ('a', '', 1),
         ('ab', 'a', 1),
         ('ab', 'b', 1),
         ('abc', 'ac', 1),
         ('xabxcdxxefxgx', 'abcdefg', 6),
         ('a', 'b', 1),
         ('ab', 'ac', 1),
         ('ac', 'bc', 1),
         ('abc', 'axc', 1),
         ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
         ('example', 'samples', 3),
         ('sturgeon', 'urgently', 6),
         ('levenshtein', 'frankenstein', 6),
         ('distance', 'difference', 5),
         ('java was neat', 'scala is great', 7),
     ]
     for left, right, distance in expectations:
         self.assertEqual(levenshtein(left, right), distance)
 def test_invalid_input1(self):
     # Calls levenshtein with a valid first string and None as the second.
     # NOTE(review): no assertion or expected-exception marker is visible
     # on this method -- confirm how the outcome is checked.
     missing = None
     levenshtein('a', missing)
 def test_invalid_input2(self):
     # Calls levenshtein with None as the first argument and a valid second.
     # NOTE(review): no assertion or expected-exception marker is visible
     # on this method -- confirm how the outcome is checked.
     missing = None
     levenshtein(missing, 'b')
Пример #5
0
_LONG_DESCRIPTION = "Product Long Description"
_SHORT_DESCRIPTION = "Product Short Description"

# (attribute, measures) -- one feature per measure, emitted in this order.
# Each attribute is compared with the same attribute of the other product.
_SAME_ATTRIBUTE_FEATURES = (
    ("Product Name", ("jaccard", "jaccard3gram", "tfidf", "edit")),
    ("Manufacturer", ("jaccard", "jaccard3gram", "tfidf")),
    ("Color", ("jaccard", "jaccard3gram", "tfidf")),
    ("Product Type", ("jaccard", "jaccard3gram", "tfidf")),
    ("Product Segment", ("exact", "jaccard", "jaccard3gram")),
    ("Brand", ("jaccard", "jaccard3gram", "edit", "tfidf")),
    ("Category", ("jaccard", "jaccard3gram")),
    (_LONG_DESCRIPTION, ("tfidf", "jaccard", "jaccard3gram")),
    (_SHORT_DESCRIPTION, ("jaccard", "jaccard3gram", "tfidf")),
)

# (source attribute, target attribute) -- each entry yields two features:
# product 1's source tokens found in product 2's target text, then the
# reverse direction.
_CONTAINMENT_FEATURES = (
    ("Product Name", _LONG_DESCRIPTION),
    ("Brand", _LONG_DESCRIPTION),
    ("Manufacturer", _LONG_DESCRIPTION),
    (_SHORT_DESCRIPTION, _LONG_DESCRIPTION),
    ("Product Name", _SHORT_DESCRIPTION),
    ("Brand", _SHORT_DESCRIPTION),
    ("Manufacturer", _SHORT_DESCRIPTION),
    ("Manufacturer Part Number", _LONG_DESCRIPTION),
    ("Assembled Product Length", _LONG_DESCRIPTION),
    ("Assembled Product Width", _LONG_DESCRIPTION),
    ("Assembled Product Height", _LONG_DESCRIPTION),
    ("Type", _LONG_DESCRIPTION),
    ("Operating System", _LONG_DESCRIPTION),
    ("Screen Size", _LONG_DESCRIPTION),
)


def _iter_attribute_pairs(filename):
    """Yield one ``(attribute_id1, attribute_id2)`` tuple per line of *filename*.

    Every line is split on '?'; fields 2 and 4 are parsed as JSON.
    """
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split('?')
            # NOTE(review): the ``encoding`` keyword is accepted by Python 2's
            # json module (which this file targets) but was removed in
            # Python 3.9 -- drop it when porting.
            yield (json.loads(fields[2], encoding='latin-1'),
                   json.loads(fields[4], encoding='latin-1'))


def _pair_similarity(measure, value1, value2, corpus=None):
    """Return the similarity named by *measure* between two attribute strings."""
    if measure == "exact":
        return 1 if value1 == value2 else 0
    if measure == "edit":
        longest = max(len(value1), len(value2))
        if longest == 0:
            # Two empty strings are identical; avoids dividing by zero.
            return 1.0
        distance = simfunctions.levenshtein(value1, value2)
        # float() forces true division; plain int / int truncates on Python 2.
        return 1 - float(distance) / longest
    if measure == "jaccard3gram":
        return simfunctions.jaccard(tokenizers.qgram(value1, 3),
                                    tokenizers.qgram(value2, 3))
    tokens1 = tokenizers.delimiter(value1)
    tokens2 = tokenizers.delimiter(value2)
    if measure == "jaccard":
        return simfunctions.jaccard(tokens1, tokens2)
    if measure == "tfidf":
        if corpus is None:
            return simfunctions.tfidf(tokens1, tokens2)
        return simfunctions.tfidf(tokens1, tokens2, corpus)
    raise ValueError("unknown similarity measure: %r" % (measure,))


def _same_attribute_features(attrs1, attrs2, key, measures, corpus=None):
    """Return one score per measure for *key*, or zeros if either side lacks it."""
    if not (key in attrs1 and key in attrs2):
        return [0] * len(measures)
    value1 = attrs1[key][0]
    value2 = attrs2[key][0]
    return [_pair_similarity(m, value1, value2, corpus) for m in measures]


def _fraction_found(tokens, text):
    """Return the fraction of *tokens* occurring as substrings of *text*."""
    if not tokens:
        return 0
    hits = sum(1 for token in tokens if token in text)
    # float() forces true division; plain int / int truncates on Python 2.
    return float(hits) / len(tokens)


def _token_containment(source, source_key, target, target_key):
    """Return the fraction of source[source_key] tokens found in target[target_key].

    Returns 0 when either attribute is missing or the source has no tokens.
    """
    if not (source_key in source and target_key in target):
        return 0
    tokens = tokenizers.delimiter(source[source_key][0])
    return _fraction_found(tokens, target[target_key][0])


def _all_values_containment(source, target):
    """Return the fraction of all non-description tokens of *source* found in
    the long description of *target* (0 if *target* has no long description).
    """
    if _LONG_DESCRIPTION not in target:
        return 0
    tokens = []
    for key in source:
        # Compare by value: an identity test against a literal never excludes
        # keys produced by json.loads.
        if key != _LONG_DESCRIPTION:
            tokens.extend(tokenizers.delimiter(source[key][0]))
    return _fraction_found(tokens, target[_LONG_DESCRIPTION][0])


def generate_feature(filename):
    """Build the similarity feature matrix for the product pairs in *filename*.

    The file is read twice: once to collect the tokenized "Product Name" and
    "Brand" values that are passed as corpora to the TF-IDF measure, and once
    to compute the features.  Each line is split on '?'; fields 2 and 4 are
    JSON objects whose values are indexed with ``[0]`` to get the attribute
    string (presumably attribute name -> list of strings; confirm against the
    data producer).

    Per pair the feature vector has 57 entries, in this order:

    * 28 same-attribute similarities (see ``_SAME_ATTRIBUTE_FEATURES``);
    * 28 token-containment ratios, two directions for each entry of
      ``_CONTAINMENT_FEATURES``;
    * 1 ratio of all non-long-description tokens of product 1 found in
      product 2's long description.

    A feature whose attributes are missing is 0.

    Args:
        filename: path of the pair file described above.

    Returns:
        A list with one 57-element list of numbers per input line.
    """
    productName_courpus = []
    brand_courpus = []
    for attribute_id1, attribute_id2 in _iter_attribute_pairs(filename):
        for attributes in (attribute_id1, attribute_id2):
            if "Product Name" in attributes:
                productName_courpus.append(
                    tokenizers.delimiter(attributes["Product Name"][0]))
        for attributes in (attribute_id1, attribute_id2):
            if "Brand" in attributes:
                brand_courpus.append(
                    tokenizers.delimiter(attributes["Brand"][0]))

    # Only these two attributes use a corpus for TF-IDF.
    corpora = {"Product Name": productName_courpus, "Brand": brand_courpus}

    feature_matrix = []
    pairs = _iter_attribute_pairs(filename)
    for i, (attribute_id1, attribute_id2) in enumerate(pairs, 1):
        print('Generate features for pair %d' % i)

        instance = []
        for key, measures in _SAME_ATTRIBUTE_FEATURES:
            instance += _same_attribute_features(
                attribute_id1, attribute_id2, key, measures, corpora.get(key))

        for source_key, target_key in _CONTAINMENT_FEATURES:
            instance.append(_token_containment(
                attribute_id1, source_key, attribute_id2, target_key))
            instance.append(_token_containment(
                attribute_id2, source_key, attribute_id1, target_key))

        instance.append(_all_values_containment(attribute_id1, attribute_id2))

        feature_matrix.append(instance)

    return feature_matrix
Пример #6
0
	attribute_id2 = product_dict[id2]
	id.append([id1,id2])

	# class label
	if (match_dict[pair] == 'MATCH'):
		classlabels.append(1)
	else:
		classlabels.append(0)


	####feature: Product Name ---- Jaccard Score (word boudary, 3-gram), edit distance, tf/idf
	if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
		jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
		jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
		tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
		edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
		edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
	else:
		jaccard_productName = 0
		jaccard3gram_productName = 0
		tfidf_productName = 0
		edit_productName = 0

	####feature: Manufacturer
	if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
		jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
		jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
		tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
	else:
		jaccard_manufacturer = 0
		jaccard3gram_manufacturer = 0
Пример #7
0
 def time_short_long(self):
     """Run levenshtein on `_short_string_1` vs `_long_string_1`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_short_string_1, _long_string_1)
Пример #8
0
 def time_medium_long(self):
     """Run levenshtein on `_medium_string_1` vs `_long_string_1`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_medium_string_1, _long_string_1)
Пример #9
0
 def time_short_medium(self):
     """Run levenshtein on `_short_string_1` vs `_medium_string_1`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_short_string_1, _medium_string_1)
Пример #10
0
 def time_long_long(self):
     """Run levenshtein on `_long_string_1` vs `_long_string_2`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_long_string_1, _long_string_2)
Пример #11
0
 def time_medium_medium(self):
     """Run levenshtein on `_medium_string_1` vs `_medium_string_2`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_medium_string_1, _medium_string_2)
Пример #12
0
 def time_short_short(self):
     """Run levenshtein on `_short_string_1` vs `_short_string_2`; result discarded.

     NOTE(review): presumably a timing benchmark (``time_`` prefix) -- confirm.
     """
     simfunctions.levenshtein(_short_string_1, _short_string_2)
Пример #13
0
    else:
        classlabels.append(0)

    ####feature: Product Name ---- Jaccard Score (word boudary, 3-gram), edit distance, tf/idf
    if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
        jaccard_productName = simfunctions.jaccard(
            tokenizers.delimiter(attribute_id1["Product Name"][0]),
            tokenizers.delimiter(attribute_id2["Product Name"][0]))
        jaccard3gram_productName = simfunctions.jaccard(
            tokenizers.qgram(attribute_id1["Product Name"][0], 3),
            tokenizers.qgram(attribute_id2["Product Name"][0], 3))
        tfidf_productName = simfunctions.tfidf(
            tokenizers.delimiter(attribute_id1["Product Name"][0]),
            tokenizers.delimiter(attribute_id2["Product Name"][0]),
            productName_courpus)
        edit_productName = simfunctions.levenshtein(
            attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
        edit_productName = 1 - edit_productName / max(
            len(attribute_id1["Product Name"][0]),
            len(attribute_id2["Product Name"][0]))
    else:
        jaccard_productName = 0
        jaccard3gram_productName = 0
        tfidf_productName = 0
        edit_productName = 0

    ####feature: Manufacturer
    if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
        jaccard_manufacturer = simfunctions.jaccard(
            tokenizers.delimiter(attribute_id1["Manufacturer"][0]),
            tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
        jaccard3gram_manufacturer = simfunctions.jaccard(