def monge_elkan(pair, feature):
    """Monge-Elkan score of one feature for a pair of products.

    ``pair`` holds two keys into the module-level ``products`` mapping.
    When both products carry ``feature``, the first entry of each value is
    lower-cased, split on whitespace and handed to
    ``simfunctions.monge_elkan``.  If either product lacks the feature,
    the module-level ``noneValue`` is returned instead.
    """
    # Guard clause: bail out as soon as one side is missing the feature.
    if feature not in products[pair[0]] or feature not in products[pair[1]]:
        return noneValue

    left_text = products[pair[0]].get(feature, [''])[0].lower()
    left_tokens = tokenizers.whitespace(left_text)

    right_text = products[pair[1]].get(feature, [''])[0].lower()
    right_tokens = tokenizers.whitespace(right_text)

    return simfunctions.monge_elkan(left_tokens, right_tokens)
def test_valid_input(self):
    # Checks monge_elkan against known scores for valid token lists.
    #
    # The non-trivial expected values are computed floats (for example
    # 0.8049999999999999), so they are compared with assertAlmostEqual
    # (default: 7 decimal places) instead of exact equality, which would
    # fail on harmless rounding differences between platforms or versions.

    # Token lists shared by the three multi-token cases below.
    affiliation_a = [
        'Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,',
        'University', 'of', 'California,', 'San', 'Diego',
    ]
    affiliation_b = [
        'Department', 'of', 'Computer', 'Science,',
        'Univ.', 'Calif.,', 'San', 'Diego',
    ]

    # Degenerate inputs: the expected scores are exact.
    self.assertEqual(monge_elkan([''], ['']), 1.0)  # need to check this
    self.assertEqual(monge_elkan([''], ['a']), 0.0)
    self.assertEqual(monge_elkan(['a'], ['a']), 1.0)

    # Single-token comparisons with the default secondary similarity.
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Neal']),
                           0.8049999999999999)
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Njall']), 0.88)

    # Multi-token comparisons: default, then explicit sim_func choices.
    self.assertAlmostEqual(monge_elkan(affiliation_a, affiliation_b),
                           0.8364448051948052)
    self.assertAlmostEqual(
        monge_elkan(affiliation_a, affiliation_b,
                    sim_func=needleman_wunsch),
        2.0)
    self.assertAlmostEqual(
        monge_elkan(affiliation_a, affiliation_b, sim_func=affine),
        2.25)

    self.assertAlmostEqual(monge_elkan(['Niall'], ['Niel']),
                           0.8266666666666667)
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Nigel']),
                           0.7866666666666667)
def test_valid_input(self):
    # Checks monge_elkan against known scores for valid token lists.
    #
    # The non-trivial expected values are computed floats (for example
    # 0.8049999999999999), so they are compared with assertAlmostEqual
    # (default: 7 decimal places) instead of exact equality, which would
    # fail on harmless rounding differences between platforms or versions.

    # Token lists shared by the three multi-token cases below.
    affiliation_a = [
        'Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,',
        'University', 'of', 'California,', 'San', 'Diego',
    ]
    affiliation_b = [
        'Department', 'of', 'Computer', 'Science,',
        'Univ.', 'Calif.,', 'San', 'Diego',
    ]

    # Degenerate inputs: the expected scores are exact.
    self.assertEqual(monge_elkan([''], ['']), 1.0)  # need to check this
    self.assertEqual(monge_elkan([''], ['a']), 0.0)
    self.assertEqual(monge_elkan(['a'], ['a']), 1.0)

    # Single-token comparisons with the default secondary similarity.
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Neal']),
                           0.8049999999999999)
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Njall']), 0.88)

    # Multi-token comparisons: default, then explicit sim_func choices.
    self.assertAlmostEqual(monge_elkan(affiliation_a, affiliation_b),
                           0.8364448051948052)
    self.assertAlmostEqual(
        monge_elkan(affiliation_a, affiliation_b,
                    sim_func=needleman_wunsch),
        2.0)
    self.assertAlmostEqual(
        monge_elkan(affiliation_a, affiliation_b, sim_func=affine),
        2.25)

    self.assertAlmostEqual(monge_elkan(['Niall'], ['Niel']),
                           0.8266666666666667)
    self.assertAlmostEqual(monge_elkan(['Niall'], ['Nigel']),
                           0.7866666666666667)
def test_invalid_input4(self):
    # Calls monge_elkan with a valid token list and None as the second bag.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    first_bag, second_bag = ['a'], None
    monge_elkan(first_bag, second_bag)
def time_small_small_wo_rep(self):
    """Run monge_elkan on ``_small_num_tokens_wo_rep`` against itself.

    The return value is discarded; only the call itself matters here.
    """
    tokens = _small_num_tokens_wo_rep
    simfunctions.monge_elkan(tokens, tokens)
def test_invalid_input1(self):
    # Calls monge_elkan with plain integers in place of both token bags.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    not_a_bag = 1
    monge_elkan(not_a_bag, not_a_bag)
def test_invalid_input3(self):
    # Calls monge_elkan with None for both token bags.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    missing = None
    monge_elkan(missing, missing)
def _lowercased_attribute_pair(json1, json2, attribute):
    """Fetch ``attribute`` from both records and lower-case the two values."""
    string1, string2 = stage3_helper.get_attribute_from_jsons(
        json1, json2, attribute)
    return string1.lower(), string2.lower()


def _length_features(string1, string2):
    """Return [len(string1), len(string2), their difference]."""
    return [len(string1), len(string2), len(string1) - len(string2)]


def _token_bag_scores(tokens1, tokens2):
    """Return [cosine, overlap coefficient, Monge-Elkan, TF-IDF] scores."""
    return [
        simfunctions.cosine(tokens1, tokens2),
        simfunctions.overlap_coefficient(tokens1, tokens2),
        simfunctions.monge_elkan(tokens1, tokens2),
        simfunctions.tfidf(tokens1, tokens2),
    ]


def _product_type_features(json1, json2):
    """Build the 12 similarity/length features for the product_type attribute."""
    string1, string2 = _lowercased_attribute_pair(json1, json2, product_type)
    tokens1 = tokenizers.whitespace(string1)
    tokens2 = tokenizers.whitespace(string2)

    scores = [
        simfunctions.jaccard(tokens1, tokens2),
        simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1),
        # Jaro on the first whitespace token of each value.
        simfunctions.jaro(tokens1[0], tokens2[0]),
    ]
    scores.extend(_token_bag_scores(tokens1, tokens2))
    scores.extend(_length_features(string1, string2))
    scores.extend([len(tokens1), len(tokens2)])
    return scores


def _product_name_features(json1, json2):
    """Build the 13 similarity/length features for the product_name attribute."""
    string1, string2 = _lowercased_attribute_pair(json1, json2, product_name)
    tokens1 = tokenizers.whitespace(string1)
    tokens2 = tokenizers.whitespace(string2)

    scores = [
        simfunctions.jaccard(tokens1, tokens2),
        simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1),
    ]
    # Hamming distance is only computed for equal-length strings; 5 is the
    # placeholder value used when the lengths differ.
    if len(string1) == len(string2):
        scores.append(simfunctions.hamming_distance(string1, string2))
    else:
        scores.append(5)
    scores.extend(_token_bag_scores(tokens1, tokens2))
    scores.extend(_length_features(string1, string2))
    scores.append(simfunctions.jaro(tokens1[0], tokens2[0]))
    scores.extend([len(tokens1), len(tokens2)])
    return scores


def _product_segment_features(json1, json2):
    """Build the 12 similarity/length features for the product_segment attribute."""
    string1, string2 = _lowercased_attribute_pair(
        json1, json2, product_segment)
    tokens1 = tokenizers.whitespace(string1)
    tokens2 = tokenizers.whitespace(string2)

    scores = [
        # Unlike the other attributes, Jaccard here runs on 3-grams.
        simfunctions.jaccard(tokenizers.qgram(string1, 3),
                             tokenizers.qgram(string2, 3)),
        simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1),
    ]
    scores.extend(_token_bag_scores(tokens1, tokens2))
    scores.append(simfunctions.jaro(tokens1[0], tokens2[0]))
    scores.extend(_length_features(string1, string2))
    scores.extend([len(tokens1), len(tokens2)])
    return scores


def _long_description_features(json1, json2):
    """Build the 5 features for product_long_description (placeholders if missing)."""
    string1, string2 = stage3_helper.get_attribute_from_jsons(
        json1, json2, product_long_description)
    if string1 is None or string2 is None:
        # Placeholder row used when either description is absent.
        return [0.5, 0, 0, 0, 0]

    string1 = stage3_helper.clean_stop_word(
        stage3_helper.cleanhtml(string1.lower()))
    string2 = stage3_helper.clean_stop_word(
        stage3_helper.cleanhtml(string2.lower()))
    tokens1 = tokenizers.whitespace(string1)
    tokens2 = tokenizers.whitespace(string2)

    scores = [
        simfunctions.jaccard(tokens1, tokens2),
        simfunctions.overlap_coefficient(tokens1, tokens2),
    ]
    scores.extend(_length_features(string1, string2))
    return scores


def _product_brand_features(json1, json2):
    """Build the 8 brand features, predicting a brand from the name if absent."""
    string1, string2 = stage3_helper.get_attribute_from_jsons(
        json1, json2, product_brand)
    name1, name2 = stage3_helper.get_attribute_from_jsons(
        json1, json2, product_name)

    # A missing or empty brand is replaced by one predicted from the name.
    if string1 is None or string1 == '':
        string1 = get_predict_brand(name1)
    if string2 is None or string2 == '':
        string2 = get_predict_brand(name2)

    if string1 is None or string2 is None:
        # Placeholder row used when a brand is still unknown.
        return [0] * 8

    tokens1 = tokenizers.whitespace(string1)
    tokens2 = tokenizers.whitespace(string2)
    scores = [
        simfunctions.jaccard(tokens1, tokens2),
        simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1),
        simfunctions.overlap_coefficient(tokens1, tokens2),
        simfunctions.monge_elkan(tokens1, tokens2),
        simfunctions.tfidf(tokens1, tokens2),
    ]
    scores.extend(_length_features(string1, string2))
    return scores


def _model_string_features(json1, json2):
    """Build the 4 features comparing model strings found in the product names."""
    string1, string2 = _lowercased_attribute_pair(json1, json2, product_name)
    model_strs1 = stage3_helper.find_model_str(string1)
    model_strs2 = stage3_helper.find_model_str(string2)

    if len(model_strs1) > 0 and len(model_strs2) > 0:
        similarity = simfunctions.jaccard(model_strs1, model_strs2)
    else:
        # Neutral score when either name yields no model string.
        similarity = 0.5

    return [
        similarity,
        len(model_strs1),
        len(model_strs2),
        # Difference between the two model-string counts (previously this
        # subtracted the character length of the second product name).
        len(model_strs1) - len(model_strs2),
    ]


def _record_level_features(json1, json2):
    """Build the 9 features that compare the two records as a whole."""
    common = 0
    common_score = 0.0
    for key in json1:
        if key in json2:
            common += 1
            common_score += simfunctions.jaccard(
                tokenizers.whitespace(json1[key][0]),
                tokenizers.whitespace(json2[key][0]))
    # Average over the shared keys; stay at 0.0 when nothing is shared
    # instead of dividing by zero.
    if common:
        common_score = common_score / common

    dump1 = json.dumps(json1)
    dump2 = json.dumps(json2)
    return [
        len(json1),
        len(json2),
        len(json1) - len(json2),
        common,
        common_score,
        len(dump1),
        len(dump2),
        len(dump1) - len(dump2),
        simfunctions.jaccard(tokenizers.whitespace(dump1),
                             tokenizers.whitespace(dump2)),
    ]


def generate_feature(file_name):
    """Build feature vectors and 0/1 labels for every record pair in a file.

    Each line read by ``stage3_helper.read_file`` is parsed into two JSON
    records plus a label.  One feature vector is produced per line by
    concatenating, in this fixed order, the features for: product_type,
    product_name, product_segment, product_long_description,
    product_brand, model strings found in the names, and whole-record
    statistics.

    Args:
        file_name: Path handed to ``stage3_helper.read_file``.

    Returns:
        A tuple ``(features, labels, lines)`` where ``features`` is a list
        of feature lists, ``labels`` holds the matching values from
        ``stage3_helper.get_01_from_label`` and ``lines`` is the raw input.

    NOTE(review): an earlier pre-pass collected the tokenized product names
    of every line but never used the result; it has been dropped.  If it
    was meant as a TF-IDF corpus, that still needs to be wired in.
    """
    lines = stage3_helper.read_file(file_name)
    features = []
    labels = []

    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)

        # TODO: Add more features and optimize features.
        # The order of the groups fixes the column layout; do not reorder.
        feature = []
        feature.extend(_product_type_features(json1, json2))
        feature.extend(_product_name_features(json1, json2))
        feature.extend(_product_segment_features(json1, json2))
        feature.extend(_long_description_features(json1, json2))
        feature.extend(_product_brand_features(json1, json2))
        feature.extend(_model_string_features(json1, json2))
        feature.extend(_record_level_features(json1, json2))

        # Add one feature and label.
        features.append(feature)
        labels.append(stage3_helper.get_01_from_label(label))

    return features, labels, lines
def test_invalid_input2(self):
    # Calls monge_elkan with None as the first bag and a valid token list
    # as the second.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    first_bag, second_bag = None, ['b']
    monge_elkan(first_bag, second_bag)
def test_invalid_input5(self):
    # Calls monge_elkan with a valid token list and a bare string as the
    # second bag.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    first_bag, second_bag = ['temp'], 'temp'
    monge_elkan(first_bag, second_bag)
matched2 = CaSe_re.search(item[1]).group(0) except: matched2 = None for idx, brand in enumerate(brands_re): if (item[1] == None): match = None else: match = brand.search(item[1]) if match is not None: if matched2 == None: matched2 = match.group(0) else: matchIndex = refurbished_re.sub('', item[1]).index(match.group(0)) matchedIndex = refurbished_re.sub('', item[1]).index(matched2) if matchIndex <= matchedIndex: matched2 = match.group(0) if matched2 is None: try: matched2 = CASE_re.search(item[0]).group(0) except: matched2 = ' ' brandList[i].append(matched2) i = i + 1 monge_elkan_measure = [] for pair in brandList: measure = simfunctions.monge_elkan(tokenizers.whitespace(pair[0]), tokenizers.whitespace(pair[1])) monge_elkan_measure.append(measure) print monge_elkan_measure
def test_invalid_input4(self):
    # Calls monge_elkan with bare strings in place of both token bags.
    # NOTE(review): presumably an exception is expected here through a
    # decorator or wrapper that is not visible in this view -- confirm.
    raw_text = "temp"
    monge_elkan(raw_text, raw_text)
def time_small_medium_wi_rep(self):
    """Run monge_elkan on ``_small_num_tokens_wi_rep`` vs ``_med_num_tokens_wi_rep``.

    The return value is discarded; only the call itself matters here.
    """
    left_tokens = _small_num_tokens_wi_rep
    right_tokens = _med_num_tokens_wi_rep
    simfunctions.monge_elkan(left_tokens, right_tokens)
def time_large_large_wi_rep(self):
    """Run monge_elkan on ``_large_num_tokens_wi_rep`` against itself.

    The return value is discarded; only the call itself matters here.
    """
    tokens = _large_num_tokens_wi_rep
    simfunctions.monge_elkan(tokens, tokens)
def time_medium_medium_wo_rep(self):
    """Run monge_elkan on ``_med_num_tokens_wo_rep`` against itself.

    The return value is discarded; only the call itself matters here.
    """
    tokens = _med_num_tokens_wo_rep
    simfunctions.monge_elkan(tokens, tokens)