def match_business(text, list_of_company_names, min_token_set_score=0.75):
    """Find the company name that best matches a block of petitioner-name text.

    Candidates are scanned in order:

    * If a company name is a literal substring of ``text`` the scan stops
      immediately and ``(1, company)`` is returned.
    * Otherwise the candidate must pass a token-set pre-filter
      (``fuzz.token_set_ratio(text, company) > min_token_set_score``) and is
      then ranked by ``jaro_winkler.normalized_similarity``.

    Args:
        text: The text to search for a company name.
        list_of_company_names: Iterable of candidate company-name strings.
        min_token_set_score: Exclusive lower bound the token-set score must
            exceed for a candidate to be ranked. Defaults to the historical
            value ``0.75``.
            NOTE(review): if ``fuzz`` is fuzzywuzzy/thefuzz/rapidfuzz, the
            score is on a 0-100 scale, so ``0.75`` lets through almost every
            candidate; a value such as ``75`` may be what was intended.
            Confirm before changing the default.

    Returns:
        A ``(score, company_name)`` tuple. ``(1, company)`` for a substring
        hit; otherwise the best Jaro-Winkler score and its company, or
        ``(0, np.nan)`` when no candidate passed the pre-filter.
    """
    best_score, best_company = 0, np.nan
    for company in list_of_company_names:
        # A literal substring hit is treated as a perfect match.
        if company in text:
            return 1, company
        # Pre-filter on the token-set score before ranking by Jaro-Winkler.
        if fuzz.token_set_ratio(text, company) > min_token_set_score:
            score_jw = jaro_winkler.normalized_similarity(text, company)
            if score_jw > best_score:
                best_score, best_company = score_jw, company
    # Progress output kept from the original implementation.
    print('1 done', best_score, best_company)
    return best_score, best_company
def produce_similarity_score_text(text_1, text_2, metric='edit_distance'):
    """Score two strings against each other with the selected metric.

    Supported ``metric`` values and the call each one maps to:

    * ``'edit_distance'`` - ``nltk.edit_distance(text_1, text_2)``. As the
      name says this is a distance, so (unlike the other metrics) a smaller
      value presumably means the strings are closer.
    * ``'bleu'`` - ``bleu.sentence_bleu`` with ``text_1`` (whitespace-split)
      as the single reference, ``text_2`` (whitespace-split) as the
      hypothesis, and ``SmoothingFunction().method4`` smoothing.
    * ``'jaro'`` - ``jaro_winkler.normalized_similarity``.
    * ``'jaccard'`` - ``jaccard.normalized_similarity``.
    * ``'monge_elkan'`` - ``monge_elkan.normalized_similarity``.
    * ``'overlap'`` - ``overlap.normalized_similarity``.

    Args:
        text_1: First string.
        text_2: Second string.
        metric: Name of the metric to use (see list above).

    Returns:
        The value returned by the selected metric function.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    if metric == 'edit_distance':
        return nltk.edit_distance(text_1, text_2)
    if metric == 'bleu':
        reference_tokens = text_1.split()
        hypothesis_tokens = text_2.split()
        # method4 smoothing is the one the original code chose for short sentences.
        short_sentence_smoother = SmoothingFunction().method4
        return bleu.sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            smoothing_function=short_sentence_smoother,
        )
    if metric == 'jaro':
        return jaro_winkler.normalized_similarity(text_1, text_2)
    if metric == 'jaccard':
        return jaccard.normalized_similarity(text_1, text_2)
    if metric == 'monge_elkan':
        return monge_elkan.normalized_similarity(text_1, text_2)
    if metric == 'overlap':
        return overlap.normalized_similarity(text_1, text_2)
    raise ValueError(
        "Unknown metric {!r}; expected one of: edit_distance, bleu, jaro, "
        "jaccard, monge_elkan, overlap".format(metric)
    )
def get_street_score(s1, s2):
    """Return the Jaro-Winkler normalized similarity of ``s1`` and ``s2``.

    Both inputs are upper-cased first, so the comparison ignores case.
    """
    similarity = jaro_winkler.normalized_similarity
    return similarity(s1.upper(), s2.upper())
def get_building_score(b1, b2):
    """Compare ``b1`` and ``b2`` case-insensitively with Jaro-Winkler.

    Returns the value of ``jaro_winkler.normalized_similarity`` applied to the
    upper-cased inputs.
    """
    compare = jaro_winkler.normalized_similarity
    first_upper = b1.upper()
    second_upper = b2.upper()
    return compare(first_upper, second_upper)
def words_similarity(self, w1, w2):
    """Return the mean of two normalized similarities for ``w1`` and ``w2``.

    The two components are ``levenshtein.normalized_similarity`` and
    ``jaro_winkler.normalized_similarity``, weighted equally.
    """
    lev_score = levenshtein.normalized_similarity(w1, w2)
    jw_score = jaro_winkler.normalized_similarity(w1, w2)
    return 0.5 * (lev_score + jw_score)
def get(self, page_id):
    """Return one page of items that match the query-string filters.

    Query parameters read from ``request.args``:

    * ``page_size`` - passed through ``filter_page_size`` with default 18.
    * ``order_method`` - one of ``view``, ``name``, ``price``, ``relevancy``
      (anything else falls back to ``view``).
    * ``order`` - ``asc`` or ``desc`` (anything else falls back to ``asc``).
    * ``price_min`` / ``price_max`` - passed through ``filter_price`` with
      defaults 0 and 10000.
    * ``keyword`` - optional; when present, items are scored against it with
      Jaro-Winkler on lower-cased names and only scores above 0.65 are kept.
      The scored list is sorted by similarity (descending) only when
      ``order_method`` is ``relevancy``.
    * ``cpu``, ``storage``, ``memory``, ``graphic``, ``screen`` - multi-valued
      bucket indexes, validated by ``filter_param`` and turned into SQL
      conditions by ``configure_conds``.
    * ``status`` - ``0``, ``1`` or ``2`` (any other value means "both 0 and
      1"); supplying it requires an ``Authorization`` header whose token
      resolves to an identity with ``role == 0``. Defaults to 1.

    Args:
        page_id: Requested page; normalised by ``filter_page_id``. The result
            slice treats it as zero-based.

    Returns:
        ``(result, 200)`` where ``result`` has ``current_page``,
        ``page_count`` and ``data``; or a ``(message, status)`` tuple with
        403 (missing/invalid token), 400 (bad ``status``) or 404 (requested
        page is past the last result).

    Raises:
        Whatever ``abort`` raises: 400 when ``price_max < price_min`` and 500
        when anything inside the query/packing section fails.
    """
    # Function-scope import so the block does not depend on the file's
    # import section, which is outside the reviewed range.
    from contextlib import closing

    page_id = filter_page_id(page_id)
    page_size = filter_page_size(request.args.get("page_size"), 18)

    # Only these whitelisted values are ever interpolated into ORDER BY.
    order_method = request.args.get("order_method")
    if order_method not in ("view", "name", "price", "relevancy"):
        order_method = "view"

    order = request.args.get("order")
    if order not in ("asc", "desc"):
        order = "asc"

    price_min = filter_price(request.args.get("price_min"), 0)
    price_max = filter_price(request.args.get("price_max"), 10000)
    if price_max < price_min:
        abort(400, "Price max should > price min")

    # All SQL conditions; joined with AND further down.
    # NOTE(review): the price bounds are formatted into the SQL text, so this
    # relies on filter_price returning a plain number - confirm.
    conds = [
        "(item.price >= {} AND item.price <= {})".format(price_min, price_max)
    ]

    # Keyword search; may be empty.
    keyword = request.args.get("keyword")

    # Multi-valued attributes: each value is an index into the matching
    # *_conds list below.
    cpu = filter_param(request.args.getlist("cpu"), ["0", "1"])
    storage = filter_param(request.args.getlist("storage"), ["0", "1", "2", "3"])
    memory = filter_param(request.args.getlist("memory"), ["0", "1", "2"])
    graphic = filter_param(request.args.getlist("graphic"), ["0", "1", "2"])
    screen = filter_param(request.args.getlist("screen"), ["0", "1", "2", "3"])

    cpu_conds = [
        "lower(laptop.cpu_prod) LIKE '%intel%'",
        "lower(laptop.cpu_prod) LIKE '%amd%'",
    ]
    storage_conds = [
        "CAST(laptop.primary_storage_cap AS INTEGER) <= 256",
        "(CAST(laptop.primary_storage_cap AS INTEGER) > 256 AND CAST(laptop.primary_storage_cap AS INTEGER) <= 512)",
        "(CAST(laptop.primary_storage_cap AS INTEGER) > 512 AND CAST(laptop.primary_storage_cap AS INTEGER) <= 1024)",
        "CAST(laptop.primary_storage_cap AS INTEGER) > 1024",
    ]
    memory_conds = [
        "CAST(laptop.memory_size AS INTEGER) <= 8",
        "(CAST(laptop.memory_size AS INTEGER) > 8 AND CAST(laptop.memory_size AS INTEGER) <= 16)",
        "CAST(laptop.memory_size AS INTEGER) > 16",
    ]
    graphic_conds = [
        "laptop.gpu_model LIKE '%GTX 1%'",
        "laptop.gpu_model LIKE '%RTX 2%'",
        "laptop.gpu_model LIKE '%RTX 3%'",
    ]
    screen_conds = [
        "CAST(laptop.display_size AS REAL) <= 13.3",
        "(CAST(laptop.display_size AS REAL) > 13.3 AND CAST(laptop.display_size AS REAL) <= 15.6)",
        "CAST(laptop.display_size AS REAL) > 15.6",
    ]

    # configure_conds may yield None (filtered out below) when an attribute
    # contributes no condition.
    conds.append(configure_conds(cpu, cpu_conds))
    conds.append(configure_conds(storage, storage_conds))
    conds.append(configure_conds(memory, memory_conds))
    conds.append(configure_conds(graphic, graphic_conds))
    conds.append(configure_conds(screen, screen_conds))

    # Status filter: defaults to 1; overriding it needs a role-0 token.
    status = 1
    requested_status = request.args.get("status")
    if requested_status:
        auth_header = request.headers.get("Authorization")
        if not auth_header:
            return "No authorization token exist when you try to access parameter 'status'", 403

        identity = Token().check(auth_header)
        if (not identity) or (identity['role'] != 0):
            return "Wrong token when you try to access parameter 'status'", 403

        if requested_status not in ("0", "1", "2"):
            return "Wrong status parameter", 400

        status = int(requested_status)

    if status == 0:
        conds.append("(item.status = 0)")
    elif status == 1:
        conds.append("(item.status = 1)")
    else:
        # All items. Column is qualified like the sibling conditions so it
        # cannot become ambiguous across the join.
        conds.append("(item.status = 0 OR item.status = 1)")

    conds = [cond for cond in conds if cond is not None]

    try:
        # conds always holds at least the price and status conditions, so
        # the WHERE clause is never empty.
        sql = (
            "SELECT item.item_id, item.name "
            "FROM item LEFT OUTER JOIN laptop "
            "ON item.item_id = laptop.item_id "
            "WHERE " + " AND ".join(conds)
        )
        if order_method != "relevancy":
            sql += " ORDER BY {} {}".format(order_method, order)

        # sqlite3's own context manager only manages the transaction; it does
        # not close the connection, so closing() is used to release it.
        with closing(sqlite3.connect(os.environ.get("DB_FILE"))) as conn:
            # Rows come back as {column_name: value} dicts.
            conn.row_factory = lambda C, R: {
                c[0]: R[i] for i, c in enumerate(C.description)
            }
            cur = conn.cursor()
            cur.execute(sql)
            item_id_name_list = cur.fetchall()

        # Index of the first item on the requested (zero-based) page.
        first_index = page_id * page_size

        # abort() cannot be used here: it would be swallowed by the broad
        # except below, so a (message, status) tuple is returned instead.
        # "<=" because a list of exactly first_index items has nothing left
        # for this page.
        if (not item_id_name_list) or (len(item_id_name_list) <= first_index):
            return "No more pages", 404

        result_id_list = item_id_name_list

        if keyword:
            keyword = keyword.lower()
            for item in item_id_name_list:
                item['similarity'] = jaro_winkler.normalized_similarity(
                    keyword, item['name'].lower())

            # Relevancy ordering is always descending.
            if order_method == "relevancy":
                item_id_name_list = sorted(
                    item_id_name_list,
                    key=lambda d: d['similarity'],
                    reverse=True)

            THRESHOLD = 0.65
            result_id_list = [
                d for d in item_id_name_list if d['similarity'] > THRESHOLD
            ]

        # Re-check after keyword filtering, which can shrink the list.
        if (not result_id_list) or (len(result_id_list) <= first_index):
            return "No more pages", 404

        result = {
            'current_page': page_id,
            'page_count': get_page_count(len(result_id_list), page_size),
            'data': get_all_profiles(
                result_id_list[first_index:first_index + page_size])
        }
        return result, 200

    except Exception as e:
        print(e)
        abort(500, "Internal server error")