Пример #1
0
def update_hotel_reviewer_score(path):
    driver = get_webdriver()
    result = OrderedDict()
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)

    lines = json.loads(data, object_hook=json_util.object_hook)['hotel_info']
    reviews = json.loads(data,
                         object_hook=json_util.object_hook)['review_info']
    reviewers = json.loads(data,
                           object_hook=json_util.object_hook)['reviewer_info']
    for index, reviewer in enumerate(reviewers[1:]):
        print reviewer
        while len(reviewer) < 18:
            reviewer.append('')
        if True:  #reviewer[11]!='' and reviewer[12]!='':
            link = reviewer[2]
            # print reviewer
            print link
            try:
                scores = get_reviewer_scores(driver, link)
                print scores
                # break

                reviewer[11] = scores[0]
                reviewer[12] = scores[1]
                reviewer[13] = scores[2]
                reviewer[14] = scores[3]
                reviewer[15] = scores[4]
            except:
                pass
    result.update({"hotel_info": lines})
    result.update({"reviewer_info": reviewers})
    result.update({"review_info": reviews})
    save_data(path, result)
Пример #2
0
def crawl_hotel_reviewer_name(path, reviewer_path):
    print path
    result = OrderedDict()
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)
    # print data
    lines = json.loads(data, object_hook=json_util.object_hook)['hotel_info']
    reviews = json.loads(data,
                         object_hook=json_util.object_hook)['review_info']

    # reviewer_result = OrderedDict()
    # reviewer_xls = get_data(reviewer_path)
    # reviewer_data = json.dumps(reviewer_xls, default=json_serial)
    # reviewers=json.loads(reviewer_data, object_hook=json_util.object_hook)['reviewer_info']

    # reviewers_id_set=[reviewer[1] for reviewer in reviewers[1:] if len(reviewer) > 1]
    reviewers_id_set = []
    # print reviewers_id_set
    # exit(0)

    driver = get_webdriver()

    for index, review in enumerate(reviews[1:]):
        print index
        #  review[5],review[8]
        try:
            url = review[5]
            reviewer_data = hotel_reviewer_name_crawler(
                driver, url, reviewers_id_set)
            reviewer_id = reviewer_data[0]
            reviewer_name = reviewer_data[1]
            reviewer_exist = reviewer_data[2]
            print "Get reviewer: ", reviewer_name
            # if not reviewer_id or not reviewer_name:
            #     print 'Passed'
            #     continue

            reviews[index + 1][1] = reviewer_id
            reviews[index + 1][2] = reviewer_name

            if not reviewer_exist:
                print "Add new reviewer: ", reviewer_id, reviewer_name
                temp = []
                temp.append(reviewer_id)
                temp.append(reviewer_name)
                temp.append(url)

                # print len(reviewers)
                # reviewers.append(temp)
                # print len(reviewers)

        except Exception as e:
            print e
            driver = get_new_webdriver(driver)

    closeDriver(driver)

    result.update({"hotel_info": lines})
    result.update({"review_info": reviews})
    save_data(path, result)
Пример #3
0
def get_hotel_tripadvisor_reviews(path, output_path=None):
    if not output_path:
        output_path = path

    content = read_xls(path)
    sheets = content.keys()
    if 'review_info' in sheets:
        content['review_info'] = content['review_info'][:1]
        reviews = content['review_info']
    if 'hotel_info' in sheets:
        hotel_info = content['hotel_info']

    try:
        for line in hotel_info[1:]:
            driver = get_webdriver()
            hotel_id = line[0]
            hotel_link = line[5]
            print hotel_id, hotel_link
            try:
                reviews = hotel_reviews_crawler(hotel_link, hotel_id, reviews,
                                                driver)
                driver.close()
                driver.quit()
            except Exception as e:
                print line
                print e
            # break
        if len(content['review_info']) > 1:
            write_xls(output_path, content)
            return True

    except Exception as e:
        print e

    return False
Пример #4
0
def update_hotel_missed_reviewer(reviewer_path):
    reviewer_result = OrderedDict()
    reviewer_xls = get_data(reviewer_path)
    reviewer_data = json.dumps(reviewer_xls, default=json_serial)
    reviewers = json.loads(reviewer_data,
                           object_hook=json_util.object_hook)['reviewer_info']
    # print reviewers[0]
    # driver = get_webdriver()
    for i, reviewer in enumerate(reviewers):
        try:
            print i
            reviewer_id = reviewer[0]
            if len(reviewer) > 2 and (not reviewer[2]
                                      or len(reviewer[2]) == 0):
                review_url = reviewer[-1]
                info = get_reviewer_profile_link(review_url, get_webdriver(),
                                                 reviewer[1])
                temp = []
                temp.append(reviewer_id)
                temp.append(info.get('reviewer_name', ''))
                temp.append(info.get('reviewer_link', ''))
                temp.append(info.get('reviewer_location', ''))
                temp.append(info.get('reviewer_level', ''))
                temp.append(info.get('reviewer_num_reviews', ''))
                temp.append(info.get('reviewer_num_hotel_reviews', ''))
                temp.append(info.get('reviewer_num_helpful_votes', ''))
                temp.append(info.get('reviewer_firstmonth', ''))
                temp.append(info.get('reviewer_gender', ''))
                temp.append(info.get('reviewer_age', ''))
                temp.append(info.get('reviewer_num_1', ''))
                temp.append(info.get('reviewer_num_2', ''))
                temp.append(info.get('reviewer_num_3', ''))
                temp.append(info.get('reviewer_num_4', ''))
                temp.append(info.get('reviewer_num_5', ''))
                temp.append(info.get('reviewer_description', ''))
                temp.append(info.get('reviewer_readership', ''))
                temp.append(info.get('review_url', ''))
                # print temp
                reviewers[i] = temp[:]
        except Exception as e:
            print e
    # driver.close()
    reviewer_result.update({"reviewer_info": reviewers})
    save_data(reviewer_path, reviewer_result)
Пример #5
0
def update_hotel_reviewer(path, reviewer_path):
    print path
    result = OrderedDict()
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)
    # print data
    lines = json.loads(data, object_hook=json_util.object_hook)['hotel_info']
    reviews = json.loads(data,
                         object_hook=json_util.object_hook)['review_info']

    reviewer_result = OrderedDict()
    reviewer_xls = get_data(reviewer_path)
    reviewer_data = json.dumps(reviewer_xls, default=json_serial)
    reviewers = json.loads(reviewer_data,
                           object_hook=json_util.object_hook)['reviewer_info']

    reviewers_id_set = [
        reviewer[1] for reviewer in reviewers[1:] if len(reviewer) > 1
    ]
    # print reviewers_id_set
    # exit(0)
    previous_reviewers_info = [reviewer for reviewer in reviewers[1:]]
    new_reviewers_info = []

    driver = get_webdriver()

    for index, review in enumerate(reviews[1:]):
        print index
        #  review[5],review[8]
        try:
            reviewer_id, reviewer_name = hotel_reviewer_profile_crawler(
                driver, review[5], review[8], reviewers_id_set,
                new_reviewers_info, previous_reviewers_info)

            print reviewer_id, reviewer_name
            reviews[index + 1][1] = reviewer_id
            reviews[index + 1][2] = reviewer_name
        except Exception as e:
            print e
            driver = get_new_webdriver(driver)
        # break
    try:
        driver.close()
    except:
        pass
    for info in new_reviewers_info:
        # previous_reviewers_id=[reviewer[1] for reviewer in reviewers[1:] if len]
        added = False
        try:
            print info
            temp = []
            temp.append(info.get('Reviewer_ID', ''))
            temp.append(info.get('reviewer_name', ''))
            temp.append(info.get('reviewer_link', ''))
            temp.append(info.get('reviewer_location', ''))
            temp.append(info.get('reviewer_level', ''))
            temp.append(info.get('reviewer_num_reviews', ''))
            temp.append(info.get('reviewer_num_hotel_reviews', ''))
            temp.append(info.get('reviewer_num_helpful_votes', ''))
            temp.append(info.get('reviewer_firstmonth', ''))
            temp.append(info.get('reviewer_gender', ''))
            temp.append(info.get('reviewer_age', ''))
            temp.append(info.get('reviewer_num_1', ''))
            temp.append(info.get('reviewer_num_2', ''))
            temp.append(info.get('reviewer_num_3', ''))
            temp.append(info.get('reviewer_num_4', ''))
            temp.append(info.get('reviewer_num_5', ''))
            temp.append(info.get('reviewer_description', ''))
            temp.append(info.get('reviewer_readership', ''))
            temp.append(info.get('review_url', ''))
            for i in xrange(len(reviewers)):
                reviewer = reviewers[i]
                if reviewer[1] == temp[1]:
                    if reviewer[0] != temp[0]:
                        temp[0] = reviewer[0]
                    reviewers[i] = temp
                    added = True
                    break
            if not added:
                reviewers.append(temp)
        except:
            pass
    result.update({"hotel_info": lines})
    result.update({"review_info": reviews})
    save_data(path, result)

    reviewer_result.update({"reviewer_info": reviewers})
    save_data(reviewer_path, reviewer_result)