Пример #1
0
def indexing():
    """Create the lookup indexes on the business and vector collections.

    The business collection is indexed on ``business_id``; the vector
    collection is indexed on ``id`` and on ``type``.
    """
    mongodb_helper.get_coll(settings.BUSINESS_COLL).create_index("business_id")

    vectors = mongodb_helper.get_coll(settings.VECTOR_COLL)
    for field in ("id", "type"):
        vectors.create_index(field)
Пример #2
0
def get_knn(type_, id_, k=10, approach='hin2vec'):
    """Return the ``k`` businesses closest to the item ``id_``.

    Args:
        type_: name of the measure: 'euclidean', 'manhattan', 'inner',
            'sigmoid' or 'cosine'.
        id_: ``id`` of the query document in the vector collection.
        k: maximum number of neighbours to return.
        approach: which embedding collection to search; must be a key of
            ``coll_mapping`` below.

    Returns:
        A list of ``(score, id)`` tuples, best match first. Distances
        ('euclidean', 'manhattan') are ordered ascending, similarities
        ('inner', 'sigmoid', 'cosine') descending. An empty list is
        returned when ``id_`` has no vector.

    Raises:
        KeyError: if ``approach`` is not a known embedding.
        ValueError: if ``type_`` is not a supported measure.
    """
    # TODO: refactor
    coll_mapping = {
        'hin2vec': settings.VECTOR_COLL,
        'deepwalk': settings.VECTOR_DEEPWALK_COLL,
        'pte': settings.VECTOR_PTE_COLL,
        'esim': settings.VECTOR_ESIM_COLL,
    }
    vector_coll = mongodb_helper.get_coll(coll_mapping[approach])
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []

    v = rest['v']
    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Exclude the query item explicitly. Dropping "whatever sorts first"
        # is wrong for inner products (the item need not score highest with
        # itself), for tied scores, and when the query is not a business.
        if business['id'] == id_:
            continue

        v2 = business['v']
        if type_ == 'euclidean':
            distance = by_euclidean_distance(v, v2)
        elif type_ == 'manhattan':
            distance = by_manhattan_distance(v, v2)
        elif type_ == 'inner':
            distance = np.inner(v, v2)
        elif type_ == 'sigmoid':
            distance = by_sigmoid_inner_product(v, v2)
        elif type_ == 'cosine':
            distance = by_cosine(v, v2)
        else:
            raise ValueError('unsupported similarity type: %r' % (type_,))
        distances.append((distance, business['id']))

    # Similarities rank high-to-low, distances low-to-high.
    descending = type_ in ('inner', 'sigmoid', 'cosine')
    return sorted(distances, reverse=descending)[:k]
Пример #3
0
def extract_all_review():
    """Return the ``text`` field of every document in the review collection.

    Returns:
        A list of review texts, in the order the collection yields them.
    """
    reviews = mongodb_helper.get_coll(settings.REVIEW_COLL).find({})
    return [review['text'] for review in reviews]
Пример #4
0
def insert_db(bid, keywords):
    """Store the keywords of one business in the business-keyword collection.

    Args:
        bid: value written to the ``id`` field of the stored document.
        keywords: mapping of word -> score.

    The stored document has the form
    ``{'id': bid, 'keywords': [{'word': ..., 'score': ...}, ...]}``.
    """
    coll = mongodb_helper.get_coll(settings.BUSINESS_KEYWORD_COLL)
    entries = [{'word': term, 'score': weight}
               for term, weight in keywords.items()]
    coll.insert_one({'id': bid, 'keywords': entries})
Пример #5
0
def main(k):
    '''\
    %prog [options] <k>
    '''
    # NOTE(review): the docstring above looks like a command-line usage
    # template (``%prog``); it is kept verbatim in case it is read at runtime.
    #
    # Rebuilds the business-keyword collection: computes TF-IDF over all
    # reviews, stores the top ``k`` keywords per business, then indexes the
    # collection on ``id``. Returns 0.
    k = int(k)

    bid2indexes, texts = extract_all_review()
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('Business count: %d' % len(bid2indexes))
    print('Review count: %d' % len(texts))

    words, _counts, tfidfs = compute_tfidf(texts)
    print('Distinct word count: %d' % len(words))
    # Position of a word in ``words`` -> the word itself.
    seq2word = dict(enumerate(words))

    coll = mongodb_helper.get_coll(settings.BUSINESS_KEYWORD_COLL)
    # Start from an empty collection so reruns do not accumulate duplicates.
    coll.drop()
    for ith, (bid, indexes) in enumerate(bid2indexes.items()):
        keywords = get_topk_keywords(k, indexes, tfidfs, seq2word)
        insert_db(bid, keywords)
        if ith % 100 == 0:
            # Progress indicator: every 100th business.
            print(ith)
    coll.create_index('id')
    return 0
Пример #6
0
def search(request):
    """Render ``se.html`` with the Solr hits for the ``q`` GET parameter.

    Only hits whose first ``business_id`` has a document in the vector
    collection are kept; each kept hit gets a ``review_count`` entry with the
    number of reviews stored for that business. Without a ``q`` parameter
    the page is rendered with an empty result list.
    """
    if 'q' not in request.GET:
        return render(request, 'se.html', {'rests': []})

    solr_url = 'http://%s:%d/solr/%s/' % (
        settings.SOLR_HOST, settings.SOLR_PORT, settings.SOLR_CORE)
    solr = pysolr.Solr(solr_url, timeout=10)
    hits = solr.search(request.GET['q'], rows=1000)

    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
    matches = []
    for hit in hits:
        business_id = hit['business_id'][0]
        if vector_coll.find_one({'id': business_id}) is None:
            # No vector stored for this business: leave it out.
            continue
        hit['review_count'] = review_coll.count({'business_id': business_id})
        matches.append(hit)

    return render(request, 'se.html', {'rests': matches})
Пример #7
0
def detail(request, rest_id):
    """Render ``rest.html`` for one business and its nearest neighbours.

    The template context holds the business document (``rest_info``), its
    vector document (``rest_vec``), the neighbours' business documents
    (``knn_infos``) and the neighbours' category distribution
    (``knn_cat_dist``) as ``(category, score, shared)`` triples, where
    ``shared`` tells whether the business itself has that category.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    rest_info = business_coll.find_one({'business_id': rest_id})
    rest_vec = mongodb_helper.get_coll(settings.VECTOR_COLL).find_one(
        {'id': rest_id})

    knn_ids = [neighbour_id
               for _, neighbour_id in knn.by_euclidean_distance(rest_id)]
    knn_infos = []
    for neighbour_id in knn_ids:
        knn_infos.append(business_coll.find_one({'business_id': neighbour_id}))

    own_categories = rest_info['categories']
    knn_cat_dist = [
        (cat, score, cat in own_categories)
        for cat, score in distribution.category_distribution(knn_ids)
    ]

    context = {
        'rest_info': rest_info,
        'rest_vec': rest_vec,
        'knn_infos': knn_infos,
        'knn_cat_dist': knn_cat_dist,
    }
    return render(request, 'rest.html', context)
Пример #8
0
def by_sigmoid_inner_product(id_, k=10):
    """Return the ``k`` businesses most similar to ``id_``.

    The score of a candidate is ``sigmoid(<v, v2>)``, where ``v`` is the
    vector stored for ``id_`` and ``v2`` the candidate's vector. A larger
    score means a closer match.

    Args:
        id_: ``id`` of the query document in the vector collection.
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(score, id)`` tuples ordered from most to least similar,
        or an empty list when ``id_`` has no vector.
    """
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []

    v = rest['v']
    scores = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Leave the query item out by id; its own score is not guaranteed to
        # be the largest, so it cannot be dropped by position.
        if business['id'] == id_:
            continue
        inner = sum(a * b for a, b in zip(v, business['v']))
        scores.append((1 / (1 + exp(-inner)), business['id']))

    # This is a similarity, not a distance: the best matches have the
    # highest scores, so sort descending.
    return sorted(scores, reverse=True)[:k]
Пример #9
0
def by_manhattan_distance(id_, k=10):
    """Return the ``k`` businesses nearest to ``id_`` by Manhattan distance.

    Args:
        id_: ``id`` of the query document in the vector collection.
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(distance, id)`` tuples ordered from nearest to farthest,
        or an empty list when ``id_`` has no vector.
    """
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []

    v = rest['v']
    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Skip the query item by id instead of discarding the first sorted
        # entry: the query is looked up without a type filter, so it may not
        # be among the candidates, and a tie could sort ahead of it.
        if business['id'] == id_:
            continue
        distance = sum(abs(a - b) for a, b in zip(v, business['v']))
        distances.append((distance, business['id']))
    return sorted(distances)[:k]
Пример #10
0
def import_yelp_data():
    """Load each dataset file into its MongoDB collection.

    For every ``(collection, file)`` pair below, the batches produced by
    ``load_dataset`` are inserted into the collection, then the collection
    name and its document count are printed.
    """
    colls = [
        (settings.BUSINESS_COLL, settings.BUSINESS_FILE),
        (settings.USER_COLL, settings.USER_FILE),
        (settings.REVIEW_COLL, settings.REVIEW_FILE),
        (settings.TIP_COLL, settings.TIP_FILE),
        (settings.CHECKIN_COLL, settings.CHECKIN_FILE),
    ]
    for coll_name, fpath in colls:
        coll = mongodb_helper.get_coll(coll_name)
        for sub_dataset in load_dataset(fpath):
            # pymongo's insert_many raises on an empty batch; skip those.
            if sub_dataset:
                coll.insert_many(sub_dataset)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s' % (coll_name, coll.count()))
Пример #11
0
def extract_review_text(ids):
    """Collect the review texts of each given business.

    Args:
        ids: iterable of business ids.

    Returns:
        A dict mapping each id to the list of ``text`` fields of the reviews
        whose ``business_id`` equals that id. Texts are returned as stored,
        without preprocessing.
    """
    review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
    return {
        business_id: [
            review['text']
            for review in review_coll.find({'business_id': business_id})
        ]
        for business_id in ids
    }
Пример #12
0
def search(request):
    """Render ``se.html`` with the Solr hits for the ``q`` GET parameter.

    Only hits whose first ``business_id`` has a document in the vector
    collection are passed to the template. Without a ``q`` parameter the
    page is rendered with an empty result list.
    """
    if 'q' not in request.GET:
        return render(request, 'se.html', {'rests': []})

    solr = pysolr.Solr('http://localhost:8983/solr/gettingstarted/',
                       timeout=10)
    hits = solr.search(request.GET['q'])

    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    matches = [
        hit for hit in hits
        if vector_coll.find_one({'id': hit['business_id'][0]}) is not None
    ]
    return render(request, 'se.html', {'rests': matches})
Пример #13
0
def category_distribution(ids):
    """Compute how often each category occurs among the given businesses.

    Every business contributes ``1 / len(ids)`` to each of its categories.
    Businesses that have no stored document, or whose ``categories`` field
    is ``None``, contribute nothing.

    Args:
        ids: sequence of business ids (must support ``len``).

    Returns:
        A list of ``(category, share)`` tuples sorted by share, largest
        first.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    total = len(ids)
    # Contribution of one business to each of its categories.
    share = 1.0 / total if total else 0.0

    cat_dist = {}
    for id_ in ids:
        business = business_coll.find_one({'business_id': id_})
        if business is None:
            # find_one returns None for an unknown id; subscripting it
            # would raise TypeError.
            continue
        cats = business['categories']
        if cats is None:
            continue

        for cat in cats:
            cat_dist[cat] = cat_dist.get(cat, 0.0) + share
    return sorted(cat_dist.items(), key=lambda x: x[1], reverse=True)
Пример #14
0
def extract_all_review():
    """Load every review and group review positions by business.

    Returns:
        A ``(bid2indexes, text_lists)`` pair. ``text_lists`` holds the
        preprocessed text of every review in collection order, and
        ``bid2indexes`` maps each ``business_id`` to the positions of its
        reviews in ``text_lists``.
    """
    review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
    text_lists = []
    bid2indexes = {}
    # enumerate keeps the index in lock-step with text_lists. A hand-kept
    # counter that is not advanced for the first review of each business
    # would make later indexes point at the wrong texts.
    for index, record in enumerate(review_coll.find({})):
        text_lists.append(preprocess_review_text(record['text']))
        bid2indexes.setdefault(record['business_id'], []).append(index)

        processed = index + 1
        if processed % 1000 == 0:
            # Progress indicator: every 1000th review.
            print(processed)
    return bid2indexes, text_lists
Пример #15
0
def get_knn(type_, id_, k=10, approach='hin2vec'):
    """Return the ``k`` businesses closest to the item ``id_``.

    Args:
        type_: name of the measure: 'euclidean', 'manhattan', 'inner',
            'sigmoid' or 'cosine'.
        id_: ``id`` of the query document in the vector collection.
        k: maximum number of neighbours to return.
        approach: which embedding collection to search; must be a key of
            ``coll_mapping`` below.

    Returns:
        A list of ``(score, id)`` tuples, best match first. Distances
        ('euclidean', 'manhattan') are ordered ascending, similarities
        ('inner', 'sigmoid', 'cosine') descending. The query item itself is
        never included. An empty list is returned when ``id_`` has no
        vector.

    Raises:
        KeyError: if ``approach`` is not a known embedding.
        ValueError: if ``type_`` is not a supported measure.
    """
    # TODO: refactor
    coll_mapping = {
        'hin2vec': settings.VECTOR_COLL,
        'deepwalk': settings.VECTOR_DEEPWALK_COLL,
        'pte': settings.VECTOR_PTE_COLL,
        'esim': settings.VECTOR_ESIM_COLL,
    }
    vector_coll = mongodb_helper.get_coll(coll_mapping[approach])
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []

    v = rest['v']
    # Dimensions zeroed out of hin2vec vectors before comparison.
    # NOTE(review): presumably the dimensions that encode meta-path
    # information -- confirm. Use an empty list to disable the masking.
    hin2vec_path_dim = [
        2, 15, 24, 31, 46, 52, 58, 68, 78, 85, 87, 92, 93, 98, 122, 125
    ]
    masked_dims = hin2vec_path_dim if approach == 'hin2vec' else []
    for i in masked_dims:
        v[i] = 0

    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        if business['id'] == id_:
            continue

        v2 = business['v']
        for i in masked_dims:
            v2[i] = 0

        if type_ == 'euclidean':
            distance = by_euclidean_distance(v, v2)
        elif type_ == 'manhattan':
            distance = by_manhattan_distance(v, v2)
        elif type_ == 'inner':
            distance = np.inner(v, v2)
        elif type_ == 'sigmoid':
            distance = by_sigmoid_inner_product(v, v2)
        elif type_ == 'cosine':
            distance = by_cosine(v, v2)
        else:
            # Without this branch an unknown measure hits an unbound
            # ``distance`` with a confusing error.
            raise ValueError('unsupported similarity type: %r' % (type_,))
        distances.append((distance, business['id']))

    # Similarities rank high-to-low, distances low-to-high.
    descending = type_ in ('inner', 'sigmoid', 'cosine')
    return sorted(distances, reverse=descending)[:k]
Пример #16
0
def fetch_business_data(ids, key, data_type_flag):
    """Collect one attribute (or a coordinate pair) of the given businesses.

    Args:
        ids: iterable of business ids.
        key: when ``data_type_flag == GEO_COORDS``, a pair of field names
            ``(longitude_field, latitude_field)``; otherwise a single field
            name.
        data_type_flag: ``GEO_COORDS`` selects coordinate mode.

    Returns:
        In coordinate mode, a list of ``(longitude, latitude, id)`` tuples.
        Otherwise a flat list of field values, where list-valued fields are
        expanded item by item. Businesses without a stored document, or
        whose requested value is ``None``, are skipped.

    Raises:
        KeyError: if a stored document lacks a requested field.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    want_coords = data_type_flag == GEO_COORDS
    result_list = []
    for id_ in ids:
        # One lookup per business, also in coordinate mode.
        business = business_coll.find_one({'business_id': id_})
        if business is None:
            continue

        if want_coords:
            longitude = business[key[0]]
            latitude = business[key[1]]
            # Test the coordinates themselves: a freshly built tuple is
            # never None, so checking the tuple would skip nothing.
            if longitude is None or latitude is None:
                continue
            result_list.append((longitude, latitude, id_))
            continue

        result = business[key]
        if result is None:
            continue
        if isinstance(result, list):
            result_list.extend(result)
        else:
            result_list.append(result)
    return result_list
Пример #17
0
def get_keywords(bid):
    """Return the keyword strings stored for business ``bid``.

    Looks up the document with ``id == bid`` in the business-keyword
    collection and returns the ``word`` of each entry in its ``keywords``
    list, or an empty list when no such document exists.
    """
    record = mongodb_helper.get_coll(
        settings.BUSINESS_KEYWORD_COLL).find_one({'id': bid})
    entries = [] if record is None else record['keywords']
    return [entry['word'] for entry in entries]
Пример #18
0
def _mark_selected(options, selected):
    """Set the flag of the first ``[key, label, flag]`` option whose key equals *selected*."""
    for option in options:
        if option[0] == selected:
            option[2] = True
            break
    return options


def _flag_rows(rows, is_own):
    """Turn ``(label, score)`` rows into ``(label, score, is_own(label))`` triples."""
    return [(label, score, is_own(label)) for label, score in rows]


def _plot_div(traces):
    """Render *traces* with ``plot`` as an HTML div given a fixed 500px height."""
    return plot(traces, output_type="div").replace(
        "<div>", "<div style='height:500px'>")


def detail(request, rest_id):
    """Render ``rest.html`` for one business and its nearest neighbours.

    GET parameters:
        similarity: measure passed to ``knn.get_knn`` (default 'euclidean').
        approach: embedding passed to ``knn.get_knn`` (default 'hin2vec').

    The template context contains the business and its vector, a Google Maps
    embed URL, the neighbours (annotated with shared-customer figures and
    their score), keyword / category / city distributions of the neighbours
    with a flag marking values shared with the business, bar and pie charts
    of the category and city distributions, and one network entry per
    neighbour (at most ten).
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    # NOTE(review): find_one returns None for an unknown rest_id, in which
    # case the subscripting below raises TypeError -- consider a 404.
    rest_info = business_coll.find_one({'business_id': rest_id})
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest_vec = vector_coll.find_one({'id': rest_id})

    # generate google map search string
    # NOTE(review): the API key is committed in source; it should probably
    # live in configuration. The address/city are only space-escaped, not
    # URL-encoded.
    query = "https://www.google.com/maps/embed/v1/place?key=AIzaSyC0woDjDcggf1PhuX9POXxTO0F059_JpjU"
    query += "&q=" + "+".join(rest_info['address'].split(" "))
    query += "," + "+".join(rest_info['city'].split(" "))

    # [key, label, selected] rows for the two option lists in the template.
    selected_sim_type = request.GET.get('similarity', 'euclidean')
    similarity_types = _mark_selected(
        [['euclidean', 'Euclidean distance', False],
         ['manhattan', 'Manhattan distance', False],
         ['inner', 'Inner product', False],
         ['cosine', 'Cosine', False]],
        selected_sim_type)

    selected_approach = request.GET.get('approach', 'hin2vec')
    approaches = _mark_selected(
        [['hin2vec', 'HIN2Vec', False],
         ['deepwalk', 'DeepWalk', False],
         ['pte', 'PTE', False],
         ['esim', 'Esim', False]],
        selected_approach)

    knn_result = knn.get_knn(selected_sim_type,
                             rest_id,
                             approach=selected_approach)
    knn_ids = [id_ for _, id_ in knn_result]
    knn_infos = [
        business_coll.find_one({'business_id': id_}) for id_ in knn_ids
    ]
    # knn_infos is built from knn_result, so the two lists line up.
    for (score, _), b in zip(knn_result, knn_infos):
        b['co_user_count'] = co_customers.get_number_com_customers(
            rest_id, b['business_id'])
        b['co_user_ratio'] = co_customers.get_ratio_com_customers(
            rest_id, b['business_id'])
        b['score'] = score

    rest_info['keywords'] = views_helper.get_keywords(rest_id)
    own_keywords = rest_info['keywords']
    knn_keyword_dist = _flag_rows(
        distribution.keyword_distribution(knn_ids),
        lambda word: word in own_keywords)

    knn_lon_lat = [[row['longitude'], row['latitude']] for row in knn_infos]

    categories = rest_info['categories']
    knn_cat_dist = _flag_rows(
        distribution.category_distribution(knn_ids),
        lambda cat: cat in categories)

    barchart_cat = _plot_div([
        go.Bar(x=[row[0] for row in knn_cat_dist],
               y=[row[1] for row in knn_cat_dist])
    ])

    piechart_data_cat = [
        go.Pie(labels=[row[0] for row in knn_cat_dist],
               values=[row[1] for row in knn_cat_dist])
    ]
    piechart_cat = _plot_div(piechart_data_cat)

    city = rest_info['city']
    knn_city_dist = _flag_rows(
        distribution.city_distribution(knn_ids),
        lambda c: c == city)

    barchart_city = _plot_div([
        go.Bar(x=[row[0] for row in knn_city_dist],
               y=[row[1] for row in knn_city_dist])
    ])
    # NOTE(review): looks like a debugging leftover (written on every
    # request, relative to the working directory) -- confirm before removing.
    # ``with`` guarantees the handle is closed even if the write fails.
    with open("tmp.html", "w") as f:
        f.write(barchart_city)

    piechart_city = _plot_div([
        go.Pie(labels=[row[0] for row in knn_city_dist],
               values=[row[1] for row in knn_city_dist])
    ])

    # network generation
    # Iterate over the neighbours actually returned (at most ten) rather
    # than a fixed range(10), which raises IndexError when fewer exist.
    max_networks = 10
    network_div = []
    rest_id1 = rest_info['business_id']
    for rest_id2, neighbour in zip(knn_ids[:max_networks],
                                   knn_infos[:max_networks]):
        meta_paths_tmp = graph_db.get_meta_path_count(rest_id1, rest_id2, 2)
        # Shortest meta-paths first.
        meta_paths = [
            ('B-%s-B' % ('-'.join(mp)), count)
            for mp, count in sorted(meta_paths_tmp.items(),
                                    key=lambda x: len(x[0]))
        ]

        nodes, edges = graph_db.get_paths(rest_id1, rest_id2, 2)

        title = rest_info["name"] + " v.s. " + neighbour["name"]
        if len(nodes) == 0:
            # No connecting path: keep the entry but with empty markup.
            network_markup = ''
        else:
            network_markup = draw_network(create_network(nodes, edges))
        network_div.append([title, network_markup, meta_paths])

    # added knn_lon_lat for google map display
    return render(
        request, 'rest.html', {
            'rest_info': rest_info,
            'rest_vec': rest_vec,
            'query': query,
            'knn_infos': knn_infos,
            'knn_cat_dist': knn_cat_dist,
            'knn_keyword_dist': knn_keyword_dist,
            'knn_lon_lat': knn_lon_lat,
            'barchart_cat': barchart_cat,
            'piechart_data_cat': piechart_data_cat,
            'piechart_cat': piechart_cat,
            'knn_city_dist': knn_city_dist,
            'barchart_city': barchart_city,
            'piechart_city': piechart_city,
            'network_div': network_div,
            'similarity_types': similarity_types,
            'approaches': approaches,
        })
Пример #19
0
def imoprt_vectors():
    """Insert every batch produced by ``load_vectors`` into the vector collection.

    The function name is misspelled; it is kept for existing callers and the
    correctly spelled alias ``import_vectors`` is provided below.
    """
    coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    for sub_vectors in load_vectors():
        # pymongo's insert_many raises on an empty batch; skip those.
        if sub_vectors:
            coll.insert_many(sub_vectors)


# Correctly spelled alias; the original name stays available.
import_vectors = imoprt_vectors