示例#1
0
def match_business(text, list_of_company_names):
    """Find the company name that best matches a block of petitioner text.

    Args:
        text: the petitioner-name text to search.
        list_of_company_names: iterable of candidate company names.

    Returns:
        A ``(score, company)`` tuple. If a non-empty company name occurs
        verbatim inside ``text`` the result is ``(1, company)`` for the first
        such name. Otherwise it is the highest
        ``jaro_winkler.normalized_similarity`` score among candidates that
        pass the token-set pre-filter, together with that candidate. When no
        candidate qualifies the result is ``(0, np.nan)``.
    """
    # fuzz.token_set_ratio reports its score on a 0-100 scale, so the
    # pre-filter cutoff has to be 75 rather than 0.75 to filter anything.
    token_set_cutoff = 75
    max_score_jw, max_company_name_jw = 0, np.nan
    for company in list_of_company_names:
        # An empty name is a substring of every text and would otherwise be
        # reported as a perfect match.
        if not company:
            continue
        if company in text:
            return 1, company
        score = fuzz.token_set_ratio(text, company)
        if score > token_set_cutoff:
            score_jw = jaro_winkler.normalized_similarity(text, company)
            if score_jw > max_score_jw:
                max_score_jw, max_company_name_jw = score_jw, company
    print('1 done', max_score_jw, max_company_name_jw)
    return max_score_jw, max_company_name_jw
def produce_similarity_score_text(text_1, text_2, metric='edit_distance'):
    """Score two strings against each other with the selected metric.

    Args:
        text_1: first string (used as the reference for ``'bleu'``).
        text_2: second string (used as the hypothesis for ``'bleu'``).
        metric: one of ``'edit_distance'``, ``'bleu'``, ``'jaro'``,
            ``'jaccard'``, ``'monge_elkan'`` or ``'overlap'``.

    Returns:
        The value produced by the selected metric.
        NOTE(review): ``'edit_distance'`` returns ``nltk.edit_distance``,
        which by its name is a distance rather than a similarity, so its
        scale presumably runs opposite to the ``normalized_similarity``
        metrics -- confirm callers account for that.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    if metric == 'edit_distance':
        return nltk.edit_distance(text_1, text_2)
    if metric == 'bleu':
        # BLEU works on token lists; the inputs are split on whitespace.
        tokenized_1 = text_1.split()
        tokenized_2 = text_2.split()
        short_sentence_smoother = SmoothingFunction().method4
        return bleu.sentence_bleu(
            [tokenized_1],
            tokenized_2,
            smoothing_function=short_sentence_smoother)
    if metric == 'jaro':
        return jaro_winkler.normalized_similarity(text_1, text_2)
    if metric == 'jaccard':
        return jaccard.normalized_similarity(text_1, text_2)
    if metric == 'monge_elkan':
        return monge_elkan.normalized_similarity(text_1, text_2)
    if metric == 'overlap':
        return overlap.normalized_similarity(text_1, text_2)
    # Previously an unknown metric fell through to the return statement and
    # failed with an UnboundLocalError that did not name the bad argument.
    raise ValueError(
        "Unknown metric {!r}; expected one of 'edit_distance', 'bleu', "
        "'jaro', 'jaccard', 'monge_elkan', 'overlap'".format(metric))
示例#3
0
def get_street_score(s1, s2):
    """Compare two street strings case-insensitively.

    Both inputs are upper-cased and passed to
    ``jaro_winkler.normalized_similarity``; its result is returned as is.
    """
    similarity = jaro_winkler.normalized_similarity
    first = s1.upper()
    second = s2.upper()
    return similarity(first, second)
示例#4
0
def get_building_score(b1, b2):
    """Compare two building strings case-insensitively.

    Both inputs are upper-cased and passed to
    ``jaro_winkler.normalized_similarity``; its result is returned as is.
    """
    similarity = jaro_winkler.normalized_similarity
    first = b1.upper()
    second = b2.upper()
    return similarity(first, second)
示例#5
0
	def words_similarity(self, w1, w2):
		"""Return the mean of two similarity scores for a pair of words.

		The result averages ``levenshtein.normalized_similarity`` and
		``jaro_winkler.normalized_similarity`` computed on ``w1`` and ``w2``.
		"""
		lev_score = levenshtein.normalized_similarity(w1, w2)
		jw_score = jaro_winkler.normalized_similarity(w1, w2)
		return 0.5 * (lev_score + jw_score)
示例#6
0
    def get(self, page_id):
        """Return one page of items matching the query-string filters.

        Query parameters read from ``request.args``:
            page_size: page length, passed through ``filter_page_size``
                with 18 as its second argument.
            order_method: ``view`` (default), ``name``, ``price`` or
                ``relevancy``; any other value falls back to ``view``.
            order: ``asc`` (default) or ``desc``.
            price_min / price_max: passed through ``filter_price`` with
                0 and 10000 as the second argument respectively.
            keyword: optional text compared against item names with
                ``jaro_winkler.normalized_similarity``; only items scoring
                above 0.65 are kept.
            cpu / storage / memory / graphic / screen: multi-valued indexes
                into the condition lists defined below.
            status: ``0``, ``1`` or ``2``; supplying it requires an
                ``Authorization`` header whose identity has ``role == 0``.

        Args:
            page_id: page index; the slice taken at the end of this method
                treats it as zero-based after ``filter_page_id``.

        Returns:
            ``(result, 200)`` on success, where ``result`` holds
            ``current_page``, ``page_count`` and ``data``. Otherwise a
            ``(message, status)`` pair with 400, 403 or 404. ``abort`` is
            called with 400 for an inverted price range and with 500 for
            any exception raised while querying.
        """
        # deal with the page_id, page_size first
        page_id = filter_page_id(page_id)
        page_size = filter_page_size(request.args.get("page_size"), 18)

        order_method = "view"
        if request.args.get("order_method") in [
                "view", "name", "price", "relevancy"
        ]:
            order_method = request.args.get("order_method")

        order = "asc"
        if request.args.get("order") in ["asc", "desc"]:
            order = request.args.get("order")

        price_min = filter_price(request.args.get("price_min"), 0)
        price_max = filter_price(request.args.get("price_max"), 10000)

        if price_max < price_min:
            abort(400, "Price max should > price min")

        # SQL conditions, joined with AND below. The price bounds come from
        # the request, so they are bound as parameters instead of being
        # formatted into the statement text.
        conds = ["(item.price >= ? AND item.price <= ?)"]
        params = [price_min, price_max]

        # keyword search, may be empty
        # NOTE(review): an earlier comment said a keyword search should
        # default to order_method = relevancy and order = desc; nothing in
        # this method applies such a default.
        keyword = request.args.get("keyword")

        # multi-valued attributes
        cpu = filter_param(request.args.getlist("cpu"), ["0", "1"])
        storage = filter_param(request.args.getlist("storage"),
                               ["0", "1", "2", "3"])
        memory = filter_param(request.args.getlist("memory"), ["0", "1", "2"])
        graphic = filter_param(request.args.getlist("graphic"),
                               ["0", "1", "2"])
        screen = filter_param(request.args.getlist("screen"),
                              ["0", "1", "2", "3"])

        cpu_conds = [
            "lower(laptop.cpu_prod) LIKE '%intel%'",
            "lower(laptop.cpu_prod) LIKE '%amd%'",
        ]

        storage_conds = [
            "CAST(laptop.primary_storage_cap AS INTEGER) <= 256",
            "(CAST(laptop.primary_storage_cap AS INTEGER) > 256 AND CAST(laptop.primary_storage_cap AS INTEGER) <= 512)",
            "(CAST(laptop.primary_storage_cap AS INTEGER) > 512 AND CAST(laptop.primary_storage_cap AS INTEGER) <= 1024)",
            "CAST(laptop.primary_storage_cap AS INTEGER) > 1024",
        ]

        memory_conds = [
            "CAST(laptop.memory_size AS INTEGER) <= 8",
            "(CAST(laptop.memory_size AS INTEGER) > 8 AND CAST(laptop.memory_size AS INTEGER) <= 16)",
            "CAST(laptop.memory_size AS INTEGER) > 16",
        ]

        graphic_conds = [
            "laptop.gpu_model LIKE '%GTX 1%'",
            "laptop.gpu_model LIKE '%RTX 2%'",
            "laptop.gpu_model LIKE '%RTX 3%'",
        ]

        screen_conds = [
            "CAST(laptop.display_size AS REAL) <= 13.3",
            "(CAST(laptop.display_size AS REAL) > 13.3 AND CAST(laptop.display_size AS REAL) <= 15.6)",
            "CAST(laptop.display_size AS REAL) > 15.6",
        ]

        # configure_conds builds one condition per attribute (presumably
        # OR-joining the selected entries -- see that helper); a None result
        # is dropped further down.
        conds.append(configure_conds(cpu, cpu_conds))
        conds.append(configure_conds(storage, storage_conds))
        conds.append(configure_conds(memory, memory_conds))
        conds.append(configure_conds(graphic, graphic_conds))
        conds.append(configure_conds(screen, screen_conds))

        # at last, check the status = 0 / 1 / 2
        # default to on sell items
        status = 1

        if request.args.get("status"):
            auth_header = request.headers.get("Authorization")
            if not auth_header:
                return "No authorization token exist when you try to access parameter 'status'", 403

            T = Token()
            identity = T.check(auth_header)

            if (not identity) or (identity['role'] != 0):
                return "Wrong token when you try to access parameter 'status'", 403

            status_list = ["0", "1", "2"]

            if request.args.get("status") not in status_list:
                return "Wrong status parameter", 400

            status = int(request.args.get("status"))

        # add condition for status
        if status == 0:
            conds.append("(item.status = 0)")
        elif status == 1:
            conds.append("(item.status = 1)")
        else:
            # all items; the column is qualified like the other branches so
            # it cannot be ambiguous in the join
            conds.append("(item.status = 0 OR item.status = 1)")

        # remove all None
        conds = [cond for cond in conds if cond is not None]

        # get both item_id and name into the list
        sql = """SELECT item.item_id, item.name 
            FROM item LEFT OUTER JOIN laptop 
            ON item.item_id = laptop.item_id 
        """
        if conds:
            sql += "WHERE " + " \nAND ".join(conds) + " \n"

        # order_method and order can only hold the whitelisted values
        # checked above, so formatting them into the statement is safe.
        if order_method != "relevancy":
            sql += "ORDER BY {} {}".format(order_method, order)

        try:
            # sqlite3's connection context manager only manages the
            # transaction and does not close the connection, so close it
            # explicitly once the rows have been fetched.
            conn = sqlite3.connect(os.environ.get("DB_FILE"))
            try:
                # return each row as a {column_name: value} dict
                conn.row_factory = lambda C, R: {
                    c[0]: R[i]
                    for i, c in enumerate(C.description)
                }
                cur = conn.cursor()
                cur.execute(sql, params)
                item_id_name_list = cur.fetchall()
            finally:
                conn.close()

            # index of the first row on the requested page
            offset = page_id * page_size

            # if no result, or the id list does not reach this page id
            # here cannot use abort, it will be caught in the exception
            if (not item_id_name_list) or (len(item_id_name_list) <= offset):
                return "No more pages", 404

            # if there is a keyword in the request, then we fetch all item names and compare
            # the keyword will not have %20 inside
            result_id_list = item_id_name_list

            if keyword:
                keyword = keyword.lower()

                for item in item_id_name_list:
                    name = item['name'].lower()
                    item['similarity'] = jaro_winkler.normalized_similarity(
                        keyword, name)

                # if the keyword search asks for order by similarity
                # use descending order by default
                if order_method == "relevancy":
                    item_id_name_list = sorted(
                        item_id_name_list,
                        key=lambda d: d['similarity'],
                        reverse=True)

                # keep only items whose name is similar enough to the keyword
                THRESHOLD = 0.65
                result_id_list = [
                    d for d in item_id_name_list
                    if d['similarity'] > THRESHOLD
                ]

            # again, check if no results after the keyword filter
            if (not result_id_list) or (len(result_id_list) <= offset):
                return "No more pages", 404

            # pack the result
            result = {
                'current_page':
                page_id,
                'page_count':
                get_page_count(len(result_id_list), page_size),
                'data':
                get_all_profiles(result_id_list[offset:offset + page_size])
            }

            return result, 200

        except Exception as e:
            print(e)
            abort(500, "Internal server error")