def test_valid_input(self):
    """tfidf returns its documented score for each (args, expected) case."""
    cases = [
        ((['a', 'b', 'a'], ['a', 'c'],
          [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True),
         0.11166746710505392),
        ((['a', 'b', 'a'], ['a', 'c'],
          [['a', 'b', 'a'], ['a', 'c'], ['a']]),
         0.17541160386140586),
        ((['a', 'b', 'a'], ['a'],
          [['a', 'b', 'a'], ['a', 'c'], ['a']]),
         0.5547001962252291),
        ((['a', 'b', 'a'], ['a']), 0.7071067811865475),
        ((['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0),
        ((['a', 'b', 'a'], ['a']), 0.7071067811865475),
        ((['a', 'b', 'a'], ['a', 'b', 'a']), 1.0),
        (([], ['a', 'b', 'a']), 0.0),
    ]
    for args, expected in cases:
        self.assertEqual(tfidf(*args), expected)
def test_valid_input(self):
    """tfidf matches its documented scores across corpora and dampening options."""
    corpus_three = [['a', 'b', 'a'], ['a', 'c'], ['a']]
    corpus_four = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']]
    self.assertEqual(
        tfidf(['a', 'b', 'a'], ['a', 'c'], corpus_four, True),
        0.11166746710505392)
    self.assertEqual(
        tfidf(['a', 'b', 'a'], ['a', 'c'], corpus_three),
        0.17541160386140586)
    self.assertEqual(
        tfidf(['a', 'b', 'a'], ['a'], corpus_three),
        0.5547001962252291)
    self.assertEqual(tfidf(['a', 'b', 'a'], ['a']), 0.7071067811865475)
    # A corpus that shares no tokens with the inputs yields zero similarity.
    self.assertEqual(
        tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0)
    self.assertEqual(tfidf(['a', 'b', 'a'], ['a']), 0.7071067811865475)
    self.assertEqual(tfidf(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0)
    self.assertEqual(tfidf([], ['a', 'b', 'a']), 0.0)
def _parse_pair(line):
    """Decode the two JSON attribute dicts from one '?'-delimited input line.

    Columns 2 and 4 of the line carry the attribute dicts of the two
    products in the pair.
    """
    list_line = line.split('?')
    try:
        # Older Pythons: decode the latin-1 encoded byte strings explicitly.
        attribute_id1 = json.loads(list_line[2], encoding='latin-1')
        attribute_id2 = json.loads(list_line[4], encoding='latin-1')
    except TypeError:
        # Python 3.9+ removed json.loads(encoding=...); input is already text.
        attribute_id1 = json.loads(list_line[2])
        attribute_id2 = json.loads(list_line[4])
    return attribute_id1, attribute_id2


def _norm_edit(s1, s2):
    """Levenshtein distance normalized to a [0, 1] similarity score."""
    dist = simfunctions.levenshtein(s1, s2)
    # 'or 1' guards the degenerate case of two empty strings.
    denom = max(len(s1), len(s2)) or 1
    return 1 - float(dist) / denom


def _sim_triple(attr1, attr2, key):
    """[word jaccard, 3-gram jaccard, tf/idf] for attribute *key*; zeros if absent."""
    if key in attr1 and key in attr2:
        v1 = attr1[key][0]
        v2 = attr2[key][0]
        return [
            simfunctions.jaccard(tokenizers.delimiter(v1),
                                 tokenizers.delimiter(v2)),
            simfunctions.jaccard(tokenizers.qgram(v1, 3),
                                 tokenizers.qgram(v2, 3)),
            simfunctions.tfidf(tokenizers.delimiter(v1),
                               tokenizers.delimiter(v2)),
        ]
    return [0, 0, 0]


def _containment(attr_a, key_a, attr_b, key_b):
    """Fraction of attr_a[key_a]'s tokens occurring as substrings of attr_b[key_b].

    Returns 0 when either attribute is missing or the token list is empty.
    """
    if key_a not in attr_a or key_b not in attr_b:
        return 0
    tokens = tokenizers.delimiter(attr_a[key_a][0])
    if not tokens:
        # Avoid ZeroDivisionError on an empty token list.
        return 0
    text = attr_b[key_b][0]
    hits = sum(1 for tok in tokens if tok in text)
    # float() keeps this a true ratio under Python 2; the original used
    # integer division, truncating every containment feature to 0 or 1.
    return float(hits) / len(tokens)


def generate_feature(filename):
    """Build the 57-column similarity feature matrix for the pairs in *filename*.

    Two passes over the file: the first collects Product Name and Brand token
    corpora (needed by the corpus-based tf/idf features), the second computes
    one feature row per pair.  Missing attributes contribute zeros so every
    row has the same width.
    """
    long_desc = "Product Long Description"
    short_desc = "Product Short Description"

    # Pass 1: token corpora for the corpus-based tf/idf features.
    productName_courpus = []
    brand_courpus = []
    with open(filename, 'r') as f:
        for line in f:
            for attr in _parse_pair(line):
                if "Product Name" in attr:
                    productName_courpus.append(
                        tokenizers.delimiter(attr["Product Name"][0]))
                if "Brand" in attr:
                    brand_courpus.append(
                        tokenizers.delimiter(attr["Brand"][0]))

    # Pass 2: one feature row per pair.
    feature_matrix = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f, 1):
            attribute_id1, attribute_id2 = _parse_pair(line)
            print('Generate features for pair %d' % i)
            instance = []

            # Product Name: word jaccard, 3-gram jaccard, corpus tf/idf,
            # normalized edit similarity (4 features).
            if ("Product Name" in attribute_id1
                    and "Product Name" in attribute_id2):
                name1 = attribute_id1["Product Name"][0]
                name2 = attribute_id2["Product Name"][0]
                instance += [
                    simfunctions.jaccard(tokenizers.delimiter(name1),
                                         tokenizers.delimiter(name2)),
                    simfunctions.jaccard(tokenizers.qgram(name1, 3),
                                         tokenizers.qgram(name2, 3)),
                    simfunctions.tfidf(tokenizers.delimiter(name1),
                                       tokenizers.delimiter(name2),
                                       productName_courpus),
                    _norm_edit(name1, name2),
                ]
            else:
                instance += [0, 0, 0, 0]

            # Manufacturer / Color / Product Type (3 features each).
            instance += _sim_triple(attribute_id1, attribute_id2,
                                    "Manufacturer")
            instance += _sim_triple(attribute_id1, attribute_id2, "Color")
            instance += _sim_triple(attribute_id1, attribute_id2,
                                    "Product Type")

            # Product Segment: exact-match flag, word jaccard, 3-gram jaccard.
            if ("Product Segment" in attribute_id1
                    and "Product Segment" in attribute_id2):
                seg1 = attribute_id1["Product Segment"][0]
                seg2 = attribute_id2["Product Segment"][0]
                instance += [
                    1 if seg1 == seg2 else 0,
                    simfunctions.jaccard(tokenizers.delimiter(seg1),
                                         tokenizers.delimiter(seg2)),
                    simfunctions.jaccard(tokenizers.qgram(seg1, 3),
                                         tokenizers.qgram(seg2, 3)),
                ]
            else:
                instance += [0, 0, 0]

            # Brand: word jaccard, 3-gram jaccard, normalized edit,
            # corpus tf/idf (4 features).
            if "Brand" in attribute_id1 and "Brand" in attribute_id2:
                brand1 = attribute_id1["Brand"][0]
                brand2 = attribute_id2["Brand"][0]
                instance += [
                    simfunctions.jaccard(tokenizers.delimiter(brand1),
                                         tokenizers.delimiter(brand2)),
                    simfunctions.jaccard(tokenizers.qgram(brand1, 3),
                                         tokenizers.qgram(brand2, 3)),
                    _norm_edit(brand1, brand2),
                    simfunctions.tfidf(tokenizers.delimiter(brand1),
                                       tokenizers.delimiter(brand2),
                                       brand_courpus),
                ]
            else:
                instance += [0, 0, 0, 0]

            # Category: word jaccard, 3-gram jaccard (2 features).
            if "Category" in attribute_id1 and "Category" in attribute_id2:
                cat1 = attribute_id1["Category"][0]
                cat2 = attribute_id2["Category"][0]
                instance += [
                    simfunctions.jaccard(tokenizers.delimiter(cat1),
                                         tokenizers.delimiter(cat2)),
                    simfunctions.jaccard(tokenizers.qgram(cat1, 3),
                                         tokenizers.qgram(cat2, 3)),
                ]
            else:
                instance += [0, 0]

            # Long description: original order is tf/idf, word jaccard,
            # 3-gram jaccard (3 features).
            long_triple = _sim_triple(attribute_id1, attribute_id2, long_desc)
            instance += [long_triple[2], long_triple[0], long_triple[1]]

            # Short description: word jaccard, 3-gram jaccard, tf/idf.
            instance += _sim_triple(attribute_id1, attribute_id2, short_desc)

            # Containment of one record's attribute tokens in the other's
            # long description (8 features).
            for key in ("Product Name", "Brand", "Manufacturer", short_desc):
                instance += [
                    _containment(attribute_id1, key, attribute_id2, long_desc),
                    _containment(attribute_id2, key, attribute_id1, long_desc),
                ]

            # Containment in the short description (6 features).
            for key in ("Product Name", "Brand", "Manufacturer"):
                instance += [
                    _containment(attribute_id1, key, attribute_id2,
                                 short_desc),
                    _containment(attribute_id2, key, attribute_id1,
                                 short_desc),
                ]

            # Additional attribute-in-long-description containments
            # (14 features).  Bug fixed: the Operating System and Screen Size
            # checks originally tested membership against the token set
            # itself ("if op in op_set" / "if ss in ss_set"), which is always
            # true, instead of against the description text.
            for key in ("Manufacturer Part Number", "Assembled Product Length",
                        "Assembled Product Width", "Assembled Product Height",
                        "Type", "Operating System", "Screen Size"):
                instance += [
                    _containment(attribute_id1, key, attribute_id2, long_desc),
                    _containment(attribute_id2, key, attribute_id1, long_desc),
                ]

            # Fraction of all of record 1's non-description tokens occurring
            # in record 2's long description (1 feature).
            if long_desc in attribute_id2:
                all_tokens = []
                for key in attribute_id1:
                    # Bug fixed: the original used "is not" here, comparing
                    # string identity rather than equality.
                    if key != long_desc:
                        all_tokens.extend(
                            tokenizers.delimiter(attribute_id1[key][0]))
                des = attribute_id2[long_desc][0]
                if all_tokens:
                    hits = sum(1 for tok in all_tokens if tok in des)
                    all1_in_des2 = float(hits) / len(all_tokens)
                else:
                    all1_in_des2 = 0
            else:
                all1_in_des2 = 0
            instance += [all1_in_des2]

            feature_matrix.append(instance)
    return feature_matrix
def test_invalid_input1(self):
    """Passing plain ints instead of token lists should raise."""
    tfidf(1, 1)
def time_small_large_wo_rep_no_corpus_no_dampen(self):
    """Benchmark tfidf: small vs. large token lists, default corpus/dampening."""
    bag1 = _small_num_tokens_wo_rep
    bag2 = _large_num_tokens_wo_rep
    simfunctions.tfidf(bag1, bag2)
attribute_id1 = product_dict[id1] attribute_id2 = product_dict[id2] id.append([id1,id2]) # class label if (match_dict[pair] == 'MATCH'): classlabels.append(1) else: classlabels.append(0) ####feature: Product Name ---- Jaccard Score (word boudary, 3-gram), edit distance, tf/idf if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2): jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0])) jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3)) tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus) edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0]) edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0])) else: jaccard_productName = 0 jaccard3gram_productName = 0 tfidf_productName = 0 edit_productName = 0 ####feature: Manufacturer if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2): jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0])) jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3)) tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0])) else: jaccard_manufacturer = 0
def test_invalid_input2(self):
    """A None first argument should raise."""
    tfidf(None, ['b'])
def time_medium_large_wo_rep_no_dampen(self):
    """Benchmark tfidf: medium vs. large token lists with a corpus, no dampening."""
    bag1 = _med_num_tokens_wo_rep
    bag2 = _large_num_tokens_wo_rep
    simfunctions.tfidf(bag1, bag2, corpus_list=self.corpus_list)
def test_invalid_input3(self):
    """Two None arguments should raise."""
    tfidf(None, None)
def time_small_medium_wo_rep_no_dampen(self):
    """Benchmark tfidf: small vs. medium token lists with a corpus, no dampening."""
    bag1 = _small_num_tokens_wo_rep
    bag2 = _med_num_tokens_wo_rep
    simfunctions.tfidf(bag1, bag2, corpus_list=self.corpus_list)
def time_small_large_wi_rep_no_dampen(self):
    """Benchmark tfidf: small vs. large lists with repeats and a corpus, no dampening."""
    bag1 = _small_num_tokens_wi_rep
    bag2 = _large_num_tokens_wi_rep
    simfunctions.tfidf(bag1, bag2, corpus_list=self.corpus_list)
def time_medium_large_wi_rep_no_corpus(self):
    """Benchmark tfidf: medium vs. large lists *with* repeats, dampened, no corpus.

    Fixed: this benchmark previously passed the ``_wo_rep`` token lists,
    contradicting its ``wi_rep`` name and its sibling benchmarks.
    """
    simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep,
                       dampen=True)
def time_small_large_wi_rep_no_corpus(self):
    """Benchmark tfidf: small vs. large lists with repeats, dampened, no corpus."""
    bag1 = _small_num_tokens_wi_rep
    bag2 = _large_num_tokens_wi_rep
    simfunctions.tfidf(bag1, bag2, dampen=True)
def time_small_medium_wo_rep_no_corpus(self):
    """Benchmark tfidf: small vs. medium lists without repeats, dampened, no corpus."""
    bag1 = _small_num_tokens_wo_rep
    bag2 = _med_num_tokens_wo_rep
    simfunctions.tfidf(bag1, bag2, dampen=True)
def test_invalid_input4(self):
    """A None second argument should raise."""
    tfidf(['a'], None)
def time_small_medium_wi_rep(self):
    """Benchmark tfidf: small vs. medium lists with repeats, corpus and dampening."""
    bag1 = _small_num_tokens_wi_rep
    bag2 = _med_num_tokens_wi_rep
    simfunctions.tfidf(bag1, bag2, corpus_list=self.corpus_list, dampen=True)
def time_medium_large_wi_rep(self):
    """Benchmark tfidf: medium vs. large lists *with* repeats, corpus and dampening.

    Fixed: this benchmark previously passed the ``_wo_rep`` token lists,
    contradicting its ``wi_rep`` name and its sibling benchmarks.
    """
    simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep,
                       corpus_list=self.corpus_list, dampen=True)
def generate_feature(file_name):
    """Compute a feature vector and 0/1 label for every product pair in *file_name*.

    Returns ``(features, labels, lines)``: parallel lists of feature vectors
    and labels, plus the raw input lines.
    """
    lines = stage3_helper.read_file(file_name)
    features = []
    labels = []
    # NOTE(review): all_names is populated but never read afterwards —
    # presumably left over from an earlier feature; confirm and remove.
    all_names = []
    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        all_names.append(tokenizers.whitespace(string1))
        all_names.append(tokenizers.whitespace(string2))

    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)
        feature = []
        # TODO: Add more features and optimize features.

        # --- product_type similarity features ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_type)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(string1),
                                 tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        # Jaro on the first whitespace token of each value.
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # --- product_name similarity features ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(string1),
                                 tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        # Hamming distance is only defined for equal-length strings; 5 is the
        # sentinel value used when the lengths differ.
        if len(string1) == len(string2):
            feature.append(simfunctions.hamming_distance(string1, string2))
        else:
            feature.append(5)
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # --- product_segment similarity features ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_segment)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.qgram(string1, 3),
                                 tokenizers.qgram(string2, 3)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # --- product_long_description similarity features ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_long_description)
        if string1 is None or string2 is None:
            # Neutral jaccard plus zeros when either description is missing.
            feature.append(0.5)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
        else:
            string1 = string1.lower()
            string2 = string2.lower()
            string1 = stage3_helper.cleanhtml(string1)
            string2 = stage3_helper.cleanhtml(string2)
            string1 = stage3_helper.clean_stop_word(string1)
            string2 = stage3_helper.clean_stop_word(string2)
            feature.append(
                simfunctions.jaccard(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.overlap_coefficient(
                    tokenizers.whitespace(string1),
                    tokenizers.whitespace(string2)))
            feature.append(len(string1))
            feature.append(len(string2))
            feature.append(len(string1) - len(string2))

        # --- product_brand (falls back to a brand predicted from the name) ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_brand)
        string1_name, string2_name = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        if string1 is None or string1 == '':
            string1 = get_predict_brand(string1_name)
        if string2 is None or string2 == '':
            string2 = get_predict_brand(string2_name)
        if string1 is None or string2 is None:
            # Eight zero features when no brand could be determined.
            for _ in range(8):
                feature.append(0)
        else:
            feature.append(
                simfunctions.jaccard(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.jaro_winkler(string1, string2,
                                          prefix_weight=0.1))
            feature.append(
                simfunctions.overlap_coefficient(
                    tokenizers.whitespace(string1),
                    tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                         tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.tfidf(tokenizers.whitespace(string1),
                                   tokenizers.whitespace(string2)))
            feature.append(len(string1))
            feature.append(len(string2))
            feature.append(len(string1) - len(string2))

        # --- shared model-number strings found in the product names ---
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        string1 = string1.lower()
        string2 = string2.lower()
        model_strs1 = stage3_helper.find_model_str(string1)
        model_strs2 = stage3_helper.find_model_str(string2)
        if len(model_strs1) > 0 and len(model_strs2) > 0:
            feature.append(simfunctions.jaccard(model_strs1, model_strs2))
        else:
            # Neutral score when either name yields no model strings.
            feature.append(0.5)
        feature.append(len(model_strs1))
        feature.append(len(model_strs2))
        # Fixed: the original computed len(model_strs1) - len(string2),
        # subtracting the name's character count from the model-string
        # count, unlike every other "difference" feature in this function.
        feature.append(len(model_strs1) - len(model_strs2))

        # --- whole-record features ---
        common = 0
        common_score = 0.0
        for item in json1:
            if item in json2:
                common += 1
                common_score += simfunctions.jaccard(
                    tokenizers.whitespace(json1[item][0]),
                    tokenizers.whitespace(json2[item][0]))
        # Guard the average: records sharing no attributes previously
        # raised ZeroDivisionError here.
        common_score = common_score / common if common else 0.0
        feature.append(len(json1))
        feature.append(len(json2))
        feature.append(len(json1) - len(json2))
        feature.append(common)
        feature.append(common_score)
        feature.append(len(json.dumps(json1)))
        feature.append(len(json.dumps(json2)))
        feature.append(len(json.dumps(json1)) - len(json.dumps(json2)))
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(json.dumps(json1)),
                                 tokenizers.whitespace(json.dumps(json2))))

        # Add one feature row and its label.
        features.append(feature)
        labels.append(stage3_helper.get_01_from_label(label))
    return features, labels, lines
# class label if (match_dict[pair] == 'MATCH'): classlabels.append(1) else: classlabels.append(0) ####feature: Product Name ---- Jaccard Score (word boudary, 3-gram), edit distance, tf/idf if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2): jaccard_productName = simfunctions.jaccard( tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0])) jaccard3gram_productName = simfunctions.jaccard( tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3)) tfidf_productName = simfunctions.tfidf( tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus) edit_productName = simfunctions.levenshtein( attribute_id1["Product Name"][0], attribute_id2["Product Name"][0]) edit_productName = 1 - edit_productName / max( len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0])) else: jaccard_productName = 0 jaccard3gram_productName = 0 tfidf_productName = 0 edit_productName = 0 ####feature: Manufacturer if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2): jaccard_manufacturer = simfunctions.jaccard(
def generate_feature(filename): productName_courpus = [] brand_courpus = [] with open(filename, 'r') as f: for line in f: list_line = line.split('?') attribute_id1 = json.loads(list_line[2], encoding = 'latin-1') attribute_id2 = json.loads(list_line[4], encoding = 'latin-1') if "Product Name" in attribute_id1: productName_courpus.append(tokenizers.delimiter(attribute_id1["Product Name"][0])) if "Product Name" in attribute_id2: productName_courpus.append(tokenizers.delimiter(attribute_id2["Product Name"][0])) if "Brand" in attribute_id1: brand_courpus.append(tokenizers.delimiter(attribute_id1["Brand"][0])) if "Brand" in attribute_id2: brand_courpus.append(tokenizers.delimiter(attribute_id2["Brand"][0])) feature_matrix = [] with open(filename, 'r') as f: i = 1 for line in f: list_line = line.split('?') attribute_id1 = json.loads(list_line[2], encoding = 'latin-1') attribute_id2 = json.loads(list_line[4], encoding = 'latin-1') print 'Generate features for pair', i i = i+1 instance = [] #Product Name 4 if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2): jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0])) jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3)) tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus) edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0]) edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0])) else: jaccard_productName = 0 jaccard3gram_productName = 0 tfidf_productName = 0 edit_productName = 0 instance += [jaccard_productName, jaccard3gram_productName, tfidf_productName, 
edit_productName] #Manufacturer 3 if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2): jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0])) jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3)) tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0])) else: jaccard_manufacturer = 0 jaccard3gram_manufacturer = 0 tfidf_manufacturer = 0 instance += [jaccard_manufacturer, jaccard3gram_manufacturer, tfidf_manufacturer] #Color 3 if ("Color" in attribute_id1 and "Color" in attribute_id2): jaccard_color = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0])) jaccard3gram_color = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Color"][0], 3), tokenizers.qgram(attribute_id2["Color"][0], 3)) tfidf_color = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0])) else: jaccard_color = 0 jaccard3gram_color = 0 tfidf_color = 0 instance += [jaccard_color, jaccard3gram_color, tfidf_color] #Product Type 3 if ("Product Type" in attribute_id1 and "Product Type" in attribute_id2): jaccard_productType = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0])) jaccard3gram_productType = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Type"][0], 3),tokenizers.qgram(attribute_id2["Product Type"][0], 3)) tfidf_productType = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0])) else: jaccard_productType = 0 jaccard3gram_productType = 0 tfidf_productType = 0 instance += 
[jaccard_productType, jaccard3gram_productType, tfidf_productType] #Product Segment 3 if "Product Segment" in attribute_id1 and "Product Segment" in attribute_id2: jaccard_productSegment = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Segment"][0]),tokenizers.delimiter(attribute_id2["Product Segment"][0])) jaccard3gram_productSegment= simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Segment"][0], 3),tokenizers.qgram(attribute_id2["Product Segment"][0], 3)) if (attribute_id1["Product Segment"][0] == attribute_id2["Product Segment"][0]): exactMatch_productSegment = 1 else: exactMatch_productSegment = 0 else: exactMatch_productSegment = 0 jaccard_productSegment = 0 jaccard3gram_productSegment = 0 instance += [exactMatch_productSegment, jaccard_productSegment, jaccard3gram_productSegment] #Brand 4 if ("Brand" in attribute_id1 and "Brand" in attribute_id2): jaccard_brand = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Brand"][0]),tokenizers.delimiter(attribute_id2["Brand"][0])) jaccard3gram_brand = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Brand"][0], 3), tokenizers.qgram(attribute_id2["Brand"][0], 3)) edit_brand = simfunctions.levenshtein(attribute_id1["Brand"][0], attribute_id2["Brand"][0]) edit_brand = 1 - edit_brand/max(len(attribute_id1["Brand"][0]), len(attribute_id2["Brand"][0])) tfidf_brand = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Brand"][0]), tokenizers.delimiter(attribute_id2["Brand"][0]), brand_courpus) else: jaccard3gram_brand = 0 jaccard_brand = 0 edit_brand = 0 tfidf_brand = 0 instance += [jaccard_brand, jaccard3gram_brand, edit_brand, tfidf_brand] #Category 2 if ("Category" in attribute_id1 and "Category" in attribute_id2): jaccard_category = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Category"][0]), tokenizers.delimiter(attribute_id2["Category"][0])) jaccard3gram_category = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Category"][0], 3), 
tokenizers.qgram(attribute_id2["Category"][0], 3)) else: jaccard_category = 0 jaccard3gram_category = 0 instance += [jaccard_category, jaccard3gram_category] #Long Description 3 if ("Product Long Description" in attribute_id1 and "Product Long Description" in attribute_id2): tfidf_long_description = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Long Description"][0]), tokenizers.delimiter(attribute_id2["Product Long Description"][0])) jaccard_long_description = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Long Description"][0]), tokenizers.delimiter(attribute_id2["Product Long Description"][0])) jaccard3_long_description = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Long Description"][0], 3), tokenizers.qgram(attribute_id2["Product Long Description"][0], 3)) else: tfidf_long_description = 0 jaccard_long_description = 0 jaccard3_long_description = 0 instance += [tfidf_long_description, jaccard_long_description, jaccard3_long_description] #Short Description 3 if ("Product Short Description" in attribute_id1 and "Product Short Description" in attribute_id2): jaccard_short_description = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Short Description"][0]), tokenizers.delimiter(attribute_id2["Product Short Description"][0])) jaccard3_short_description = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Short Description"][0], 3), tokenizers.qgram(attribute_id2["Product Short Description"][0], 3)) tfidf_short_description = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Short Description"][0]), tokenizers.delimiter(attribute_id2["Product Short Description"][0])) else: jaccard_short_description = 0 jaccard3_short_description = 0 tfidf_short_description = 0 instance += [jaccard_short_description, jaccard3_short_description, tfidf_short_description] #Other in long 8 if ("Product Name" in attribute_id1 and "Product Long Description" in attribute_id2): name_set = 
tokenizers.delimiter(attribute_id1["Product Name"][0]) des = attribute_id2["Product Long Description"][0] count = 0 for name in name_set: if name in des: count = count+1 name1_in_des2 = count/len(name_set) else: name1_in_des2 = 0 instance += [name1_in_des2] if ("Product Long Description" in attribute_id1 and "Product Name" in attribute_id2): name_set = tokenizers.delimiter(attribute_id2["Product Name"][0]) des = attribute_id1["Product Long Description"][0] count = 0 for name in name_set: if name in des: count = count+1 name2_in_des1 = count/len(name_set) else: name2_in_des1 = 0 instance += [name2_in_des1] if ("Brand" in attribute_id1 and "Product Long Description" in attribute_id2): brand_set = tokenizers.delimiter(attribute_id1["Brand"][0]) des = attribute_id2["Product Long Description"][0] count = 0 for brand in brand_set: if brand in des: count = count+1 brand1_in_des2 = count/len(brand_set) else: brand1_in_des2 = 0 instance += [brand1_in_des2] if ("Brand" in attribute_id2 and "Product Long Description" in attribute_id1): brand_set = tokenizers.delimiter(attribute_id2["Brand"][0]) des = attribute_id1["Product Long Description"][0] count = 0 for brand in brand_set: if brand in des: count = count+1 brand2_in_des1 = count/len(brand_set) else: brand2_in_des1 = 0 instance += [brand2_in_des1] if ("Manufacturer" in attribute_id1 and "Product Long Description" in attribute_id2): manufacturer_set = tokenizers.delimiter(attribute_id1["Manufacturer"][0]) des = attribute_id2["Product Long Description"][0] count = 0 for manufacturer in manufacturer_set: if manufacturer in des: count = count+1 manufacturer1_in_des2 = count/len(manufacturer_set) else: manufacturer1_in_des2 = 0 instance += [manufacturer1_in_des2] if ("Manufacturer" in attribute_id2 and "Product Long Description" in attribute_id1): manufacturer_set = tokenizers.delimiter(attribute_id2["Manufacturer"][0]) des = attribute_id1["Product Long Description"][0] count = 0 for manufacturer in manufacturer_set: if 
manufacturer in des: count = count+1 manufacturer2_in_des1 = count/len(manufacturer_set) else: manufacturer2_in_des1 = 0 instance += [manufacturer2_in_des1] if ("Product Short Description" in attribute_id1 and "Product Long Description" in attribute_id2): short_des_set = tokenizers.delimiter(attribute_id1["Product Short Description"][0]) des = attribute_id2["Product Long Description"][0] count = 0 for short in short_des_set: if short in des: count = count+1 short1_in_des2 = count/len(short_des_set) else: short1_in_des2 = 0 instance += [short1_in_des2] if ("Product Short Description" in attribute_id2 and "Product Long Description" in attribute_id1): short_des_set = tokenizers.delimiter(attribute_id2["Product Short Description"][0]) des = attribute_id1["Product Long Description"][0] count = 0 for short in short_des_set: if short in des: count = count+1 short2_in_des1 = count/len(short_des_set) else: short2_in_des1 = 0 instance += [short2_in_des1] #Other in short 6 if ("Product Name" in attribute_id1 and "Product Short Description" in attribute_id2): name_set = tokenizers.delimiter(attribute_id1["Product Name"][0]) des = attribute_id2["Product Short Description"][0] count = 0 for name in name_set: if name in des: count = count+1 name1_in_short2 = count/len(name_set) else: name1_in_short2 = 0 instance += [name1_in_short2] if ("Product Short Description" in attribute_id1 and "Product Name" in attribute_id2): name_set = tokenizers.delimiter(attribute_id2["Product Name"][0]) des = attribute_id1["Product Short Description"][0] count = 0 for name in name_set: if name in des: count = count+1 name2_in_short1 = count/len(name_set) else: name2_in_short1 = 0 instance += [name2_in_short1] if ("Brand" in attribute_id1 and "Product Short Description" in attribute_id2): brand_set = tokenizers.delimiter(attribute_id1["Brand"][0]) des = attribute_id2["Product Short Description"][0] count = 0 for brand in brand_set: if brand in des: count = count+1 brand1_in_short2 = 
count/len(brand_set) else: brand1_in_short2 = 0 instance += [brand1_in_short2] if ("Brand" in attribute_id2 and "Product Short Description" in attribute_id1): brand_set = tokenizers.delimiter(attribute_id2["Brand"][0]) des = attribute_id1["Product Short Description"][0] count = 0 for brand in brand_set: if brand in des: count = count+1 brand2_in_short1 = count/len(brand_set) else: brand2_in_short1 = 0 instance += [brand2_in_short1] if ("Manufacturer" in attribute_id1 and "Product Short Description" in attribute_id2): manufacturer_set = tokenizers.delimiter(attribute_id1["Manufacturer"][0]) des = attribute_id2["Product Short Description"][0] count = 0 for manufacturer in manufacturer_set: if manufacturer in des: count = count+1 manufacturer1_in_short2 = count/len(manufacturer_set) else: manufacturer1_in_short2 = 0 instance += [manufacturer1_in_short2] if ("Manufacturer" in attribute_id2 and "Product Short Description" in attribute_id1): manufacturer_set = tokenizers.delimiter(attribute_id2["Manufacturer"][0]) des = attribute_id1["Product Short Description"][0] count = 0 for manufacturer in manufacturer_set: if manufacturer in des: count = count+1 manufacturer2_in_short1 = count/len(manufacturer_set) else: manufacturer2_in_short1 = 0 instance += [manufacturer2_in_short1] #new 15 if ("Manufacturer Part Number" in attribute_id1 and "Product Long Description" in attribute_id2): manu_part_number_set = tokenizers.delimiter(attribute_id1["Manufacturer Part Number"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for manu_part in manu_part_number_set: if manu_part in des_set: count = count+1 manu_part1_in_des2 = count/len(manu_part_number_set) else: manu_part1_in_des2 = 0 instance += [manu_part1_in_des2] if ("Manufacturer Part Number" in attribute_id2 and "Product Long Description" in attribute_id1): manu_part_number_set = tokenizers.delimiter(attribute_id2["Manufacturer Part Number"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 
for manu_part in manu_part_number_set: if manu_part in des_set: count = count+1 manu_part2_in_des1 = count/len(manu_part_number_set) else: manu_part2_in_des1 = 0 instance += [manu_part2_in_des1] if ("Assembled Product Length" in attribute_id1 and "Product Long Description" in attribute_id2): length_set = tokenizers.delimiter(attribute_id1["Assembled Product Length"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for length in length_set: if length in des_set: count = count+1 length1_in_des2 = count/len(length_set) else: length1_in_des2 = 0 instance += [length1_in_des2] if ("Assembled Product Length" in attribute_id2 and "Product Long Description" in attribute_id1): length_set = tokenizers.delimiter(attribute_id2["Assembled Product Length"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for length in length_set: if length in des_set: count = count+1 length2_in_des1 = count/len(length_set) else: length2_in_des1 = 0 instance += [length2_in_des1] if ("Assembled Product Width" in attribute_id1 and "Product Long Description" in attribute_id2): width_set = tokenizers.delimiter(attribute_id1["Assembled Product Width"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for width in width_set: if width in des_set: count = count+1 width1_in_des2 = count/len(width_set) else: width1_in_des2 = 0 instance += [width1_in_des2] if ("Assembled Product Width" in attribute_id2 and "Product Long Description" in attribute_id1): width_set = tokenizers.delimiter(attribute_id2["Assembled Product Width"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for width in width_set: if width in des_set: count = count+1 width2_in_des1 = count/len(width_set) else: width2_in_des1 = 0 instance += [width2_in_des1] if ("Assembled Product Height" in attribute_id1 and "Product Long Description" in attribute_id2): height_set = tokenizers.delimiter(attribute_id1["Assembled Product Height"][0]) des_set = attribute_id2["Product 
Long Description"][0] count = 0 for height in height_set: if height in des_set: count = count+1 height1_in_des2 = count/len(height_set) else: height1_in_des2 = 0 instance += [height1_in_des2] if ("Assembled Product Height" in attribute_id2 and "Product Long Description" in attribute_id1): height_set = tokenizers.delimiter(attribute_id2["Assembled Product Height"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for height in height_set: if height in des_set: count = count+1 height2_in_des1 = count/len(height_set) else: height2_in_des1 = 0 instance += [height2_in_des1] if ("Type" in attribute_id1 and "Product Long Description" in attribute_id2): type_set = tokenizers.delimiter(attribute_id1["Type"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for type in type_set: if type in des_set: count = count+1 type1_in_des2 = count/len(type_set) else: type1_in_des2 = 0 instance += [type1_in_des2] if ("Type" in attribute_id2 and "Product Long Description" in attribute_id1): type_set = tokenizers.delimiter(attribute_id2["Type"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for type in type_set: if type in des_set: count = count+1 type2_in_des1 = count/len(type_set) else: type2_in_des1 = 0 instance += [type2_in_des1] if ("Operating System" in attribute_id1 and "Product Long Description" in attribute_id2): op_set = tokenizers.delimiter(attribute_id1["Operating System"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for op in op_set: if op in op_set: count = count+1 op1_in_des2 = count/len(op_set) else: op1_in_des2 = 0 instance += [op1_in_des2] if ("Operating System" in attribute_id2 and "Product Long Description" in attribute_id1): op_set = tokenizers.delimiter(attribute_id2["Operating System"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for op in op_set: if op in op_set: count = count+1 op2_in_des1 = count/len(op_set) else: op2_in_des1 = 0 instance += [op2_in_des1] if 
("Screen Size" in attribute_id1 and "Product Long Description" in attribute_id2): ss_set = tokenizers.delimiter(attribute_id1["Screen Size"][0]) des_set = attribute_id2["Product Long Description"][0] count = 0 for ss in ss_set: if ss in ss_set: count = count+1 ss1_in_des2 = count/len(ss_set) else: ss1_in_des2 = 0 instance += [ss1_in_des2] if ("Screen Size" in attribute_id2 and "Product Long Description" in attribute_id1): ss_set = tokenizers.delimiter(attribute_id2["Screen Size"][0]) des_set = attribute_id1["Product Long Description"][0] count = 0 for ss in ss_set: if ss in ss_set: count = count+1 ss2_in_des1 = count/len(ss_set) else: ss2_in_des1 = 0 instance += [ss2_in_des1] if "Product Long Description" in attribute_id2: all_set = [] for key in attribute_id1: if key is not "Product Long Description": value_list = tokenizers.delimiter(attribute_id1[key][0]) for v in value_list: all_set.append(v) des = attribute_id2["Product Long Description"][0] count = 0 for a in all_set: if a in des: count += 1 all1_in_des2 = count/len(all_set) else: all1_in_des2 = 0 instance += [all1_in_des2] feature_matrix.append(instance) return feature_matrix
def time_small_medium_wi_rep_no_corpus_no_dampen(self):
    """Benchmark tfidf on the small vs. medium with-repetition token lists,
    with no corpus and no dampening (library defaults)."""
    bag_small = _small_num_tokens_wi_rep
    bag_medium = _med_num_tokens_wi_rep
    simfunctions.tfidf(bag_small, bag_medium)