Пример #1
0
def review_result_is_valid(hotel_id):
    """Verify that every review recorded for *hotel_id* is fully stored.

    Returns True when the hotel has no reviews, or when every review id in
    the hotel record maps to a stored review whose HTML contains the
    matching ``review_<id>`` div; False on any missing or corrupted data.
    """
    with taDB(common.TA_DB) as db:
        record = db.read_a_hotel(hotel_id)
    if record is None:
        return False
    # record[3] = review count, record[4] = stringified list of review ids
    # (column layout inferred from usage here — TODO confirm against schema)
    # FIX: normalize the count once; the original applied int() in one
    # comparison but compared the raw value in the other.
    rno = int(record[3])
    if rno == 0:
        logger.info('[hotel {}] PASSED: no reviews'.format(hotel_id))
        return True
    rid_str = record[4]
    rids = ast.literal_eval(rid_str)
    if rno < len(rids):
        return False

    with taDB(common.TA_DB) as db:
        for rid in rids:
            rrecord = db.read_a_review(rid)
            if rrecord is None:
                return False
            html = rrecord[1]
            if html is None:
                logger.info('[hotel {}] FAILED: HTML is absent'.format(hotel_id))
                return False
            rec_soup = common.load_soup_string(html)
            # every stored review page must contain its own review_<id> div
            if rec_soup.find('div', id=''.join(['review_', rid])) is None:
                print(html)
                logger.info('[hotel {}] FAILED: corrupted HTML'.format(hotel_id))
                return False
    logger.info('[hotel {}] PASSED: verified'.format(hotel_id))
    return True
Пример #2
0
    def gather_reviews(title):
        """Worker loop: fetch and persist review HTML for queued hotel ids.

        Pulls hotel ids from the shared queue until a ``None`` sentinel
        arrives; *title* is used only for log labelling.
        """
        def gen_review_url(rid):
            # Expanded-review AJAX endpoint; *rid* may be a single id or a
            # comma-separated batch of ids.
            return TA_ROOT + 'OverlayWidgetAjax?' + '&'.join(
                ['Mode=EXPANDED_HOTEL_REVIEWS',
                 'metaReferer=Hotel_Review',
                 'reviews=' + rid])

        while True:
            # logger.info('[worker {}] running'.format(title))
            hotel_id = que.get()
            if hotel_id is None:
                # None is the shutdown sentinel
                logger.info('[worker {}] shutting down'.format(title))
                break
            with taDB(common.TA_DB) as db:
                record = db.read_a_hotel(hotel_id)
            if record is None:
                continue
            # record[4] holds a stringified list of review ids
            rid_str = record[4]
            rids = ast.literal_eval(rid_str)
            new_rids = []
            # fetch reviews in batches of REVIEW_CHUNK_SIZE ids per request
            slice_num = math.ceil(
                len(rids) / common.REVIEW_CHUNK_SIZE)
            for slicePos in range(slice_num):
                time.sleep(common.SLEEP_TIME)
                spos = slicePos * common.REVIEW_CHUNK_SIZE
                epos = (slicePos + 1) * common.REVIEW_CHUNK_SIZE \
                    if slicePos + 1 < slice_num else len(rids)
                id_string = ','.join(rids[spos: epos])
                logger.info('\t[hotel {}] from {} to {}'
                            .format(hotel_id, spos + 1, epos))
                url = gen_review_url(id_string)
                web_data = requests.get(url)
                web_text = web_data.text
                # save_reviews persists the batch and returns the ids found
                new_rids.extend(save_reviews(web_text))
            # reconcile: ids we expected but did not get back
            diff_flag = False
            diff_set = set(rids).difference(set(new_rids))
            for diff in diff_set:
                print('found diff')
                url = gen_review_url(diff)
                web_data = requests.get(url)
                # re-fetch the single id: does it still yield a review div?
                blank = re.findall(
                    '(?<=id=\")review_\d+(?=\")',
                    web_data.text, re.IGNORECASE)
                if len(blank) > 0:
                    # at least one missing id is still live -> retry later
                    diff_flag = True
                    logger.info('{} is not empty'.format(diff))
                    break
            if not diff_flag:
                if diff_set:
                    # all missing ids came back empty: shrink the stored index
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.update_review_list_in_hotel(
                                hotel_id, len(new_rids), str(new_rids))
                    logger.info('\t[hotel {}] review indexes updated'
                                .format(hotel_id))
            else:
                # some reviews are live but unfetched: requeue the hotel
                logger.info('\ttry again later')
                logger.info('\t{}'.format(diff_set))
                que.put(hotel_id)
            que.task_done()
Пример #3
0
    def gather_profiles(title):
        """Worker loop: fetch member-overlay + full-profile HTML for user ids.

        Pulls user ids from the shared queue until a ``None`` sentinel
        arrives; *title* is used only for log labelling.
        """
        while True:
            # logger.info('[worker {}] running'.format(title))
            uid = que.get()
            if uid is None:
                # None is the shutdown sentinel
                logger.info('[worker {}] shutting down'.format(title))
                break
            logger.info('[user {}]'.format(uid))
            url = TA_ROOT + 'MemberOverlay?uid=' + uid
            simple_soup = common.load_soup_online(url).find(
                'div', class_='memberOverlay')
            # the overlay links to the full profile in one of two URL forms
            profile_url = re.search('(?<=")/members/.+(?=")', str(simple_soup))
            if profile_url is None:
                profile_url = re.search(
                    '(?<=")/MemberProfile-a_uid.[A-Z0-9]+(?=")',
                    str(simple_soup))

            result = []
            if profile_url is not None:
                profile_url = TA_ROOT + profile_url.group(0).strip()
                result.append(simple_soup.prettify())
                detail_soup = common.load_soup_online(profile_url)
                member_soup = detail_soup.find('div',
                                               id='MODULES_MEMBER_CENTER')
                if member_soup is not None:
                    # persist overlay + member-center HTML joined by CRLF
                    result.append(member_soup.prettify())
                    record = [uid, '\r\n'.join(result)]
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.insert_a_user(record)
                else:
                    if '404' in detail_soup.find('title').string:
                        # profile no longer exists: purge the id from reviews
                        with lock:
                            with taDB(common.TA_DB) as db:
                                db.remove_user_id_in_review(uid)
                                logger.info('\tuser id removed')
                    else:
                        # transient failure: requeue for another attempt
                        logger.info('\tfailed to fetch full profile')
                        que.put(uid)
            else:
                logger.info('\tno profile url')
                que.put(uid)

            time.sleep(common.SLEEP_TIME)
            que.task_done()
Пример #4
0
 def user_is_valid(uid):
     """Return True when a stored profile for *uid* exists and parses cleanly.

     A profile is valid when its cached HTML is present and still contains
     the MODULES_MEMBER_CENTER container div.
     """
     with taDB(common.TA_DB) as db:
         user_record = db.read_a_user(uid)
     # no record at all, or record without cached HTML -> invalid
     if user_record is None or user_record[0] is None:
         return False
     soup = common.load_soup_string(user_record[0])
     if soup.find('div', id='MODULES_MEMBER_CENTER') is not None:
         logger.info('[user {}] PASSED: verified'.format(uid))
         return True
     logger.info('[user {}] FAILED: corrupted'.format(uid))
     return False
def start(gid):
    """Collect and store review ids for every hotel in location *gid*.

    Spawns DETAIL_THREAD_NUM worker threads, enqueues every hotel whose
    cached review index fails validation, waits for completion, then shuts
    the workers down with ``None`` sentinels.
    """
    def gather_review_ids(title):
        # Worker loop: pull {hotel_id: relative_url} pairs off the queue
        # until a None sentinel arrives.
        while True:
            logger.info('[worker {}] running'.format(title))
            cur_pair = que.get()
            if cur_pair is None:
                logger.info('[worker {}] shutting down'.format(title))
                break
            hid, hurl = next(iter(cur_pair.items()))
            hurl = TA_ROOT + hurl
            logger.info('[hotel {}] {}'.format(hid, hurl))
            html, rno, rid_list = find_review_ids(hid, hurl)
            if rid_list is not None:
                record = [hid, html, gid, rno, str(rid_list)]
                with lock:
                    with taDB(common.TA_DB) as idb:
                        idb.insert_a_hotel(record)
            else:
                # transient failure: requeue for another attempt
                logger.info('\ttry again later')
                que.put(cur_pair)
            que.task_done()

    que = queue.Queue()

    with taDB(common.TA_DB) as iodb:
        hid_pairs = iodb.get_hotel_url_pairs(gid)

    threads = []
    thread_size = common.DETAIL_THREAD_NUM
    for j in range(thread_size):
        # FIX: args must be a tuple — (str(j + 1)) was a bare string, which
        # Thread iterates character by character (breaks for 2-digit titles).
        t = threading.Thread(target=gather_review_ids, args=(str(j + 1),))
        t.start()
        threads.append(t)

    logger.info('hid_pairs -- {}'.format(len(hid_pairs)))
    # enqueue only hotels whose cached review index fails validation
    # (plain loop instead of a side-effect list comprehension)
    for key in hid_pairs:
        if not review_index_is_valid(key):
            que.put({key: hid_pairs[key]})

    que.join()

    # one sentinel per worker, then wait for all of them to exit
    for k in range(thread_size):
        que.put(None)
    for t in threads:
        t.join()

    logger.info('all review ids are ready')
Пример #6
0
def save_reviews(web_data):
    """Extract every review div from *web_data* HTML and persist it.

    Stores (review_id, prettified HTML, user_id-or-None) tuples via
    insert_many_reviews and returns the list of review ids found.
    """
    web_soup = common.load_soup_string(web_data)
    # FIX: raw strings — '\d' in a plain literal is an invalid escape
    # sequence (warning in modern Python); the pattern itself is unchanged.
    review_soups = web_soup.find_all('div', id=re.compile(r'review_\d+'))
    records = []
    any_rids = []
    for x in review_soups:
        # strip the 'review_' prefix: len('review_') == 7
        any_rid = x['id'][7:]
        any_html = x.prettify()
        # user ids appear as 32-char uppercase alphanumerics in the HTML
        any_uid = re.search(r'[A-Z0-9]{32}', any_html)
        if any_uid is not None:
            any_uid = any_uid.group(0)
        any_rids.append(any_rid)
        records.append((any_rid, any_html, any_uid))
    with lock:
        with taDB(common.TA_DB) as db:
            db.insert_many_reviews(records)
    return any_rids
 def gather_review_ids(title):
     """Worker loop: resolve and store review ids for queued hotels.

     Pulls single-entry ``{hotel_id: relative_url}`` dicts from the shared
     queue until a ``None`` sentinel arrives; *title* labels log lines.
     """
     while True:
         # logger.info('[worker {}] running'.format(title))
         cur_pair = que.get()
         if cur_pair is None:
             # None is the shutdown sentinel
             logger.info('[worker {}] shutting down'.format(title))
             break
         # unpack the single {hotel_id: relative_url} entry
         hid, hurl = next(iter(cur_pair.items()))
         hurl = TA_ROOT + hurl
         logger.info('[hotel {}] {}'.format(hid, hurl))
         html, rno, rid_list = find_review_ids(hid, hurl)
         if rid_list is not None:
             record = [hid, html, gid, rno, str(rid_list)]
             with lock:
                 with taDB(common.TA_DB) as idb:
                     idb.insert_a_hotel(record)
         else:
             # transient failure: requeue for another attempt
             logger.info('\ttry again later')
             que.put(cur_pair)
         que.task_done()
def review_index_is_valid(hid):
    """Validate the cached review index stored for hotel *hid*.

    The index is valid when the stored id list has no duplicates and holds
    at least as many ids as the recorded review count. Returns False when
    the hotel record is missing or the index is corrupt.
    """
    with taDB(common.TA_DB) as db:
        record = db.read_a_hotel(hid)
    if record is None:
        return False
    # record[3] = expected review count, record[4] = stringified id list
    # FIX: normalize the count to int before comparing (DB may return text)
    rno = int(record[3])
    rids = ast.literal_eval(record[4])

    is_having = len(rids)
    if rno > is_having or is_having != len(set(rids)):
        # fewer ids than expected, or duplicated ids -> index is corrupt
        # FIX: routed the stray debug print through the logger for
        # consistency with the rest of the module
        logger.info('should_have {}, is_having {}'.format(rno, is_having))
        logger.info('[hotel {}] FAILED: corrupted'.format(hid))
        return False
    if rno < is_having:
        logger.info('[hotel {}] PASSED: extra reviews'.format(hid))
        return True
    logger.info('[hotel {}] PASSED: verified'.format(hid))
    return True
Пример #9
0
        _.SLEEP_TIME, _.SNIPPET_THREAD_NUM, _.DETAIL_THREAD_NUM,
        _.REVIEW_THREAD_NUM, _.USER_THREAD_NUM))

    # location list
    url_list = config['LOCATION']['List'].split(';')
    logging.info('{} locations found'.format(len(url_list)))
    return url_list


if __name__ == "__main__":

    init_logger()

    # create the SQLite database with all tables on first run
    fn = common.TA_DB
    if not isfile(fn):
        with taDB(common.TA_DB) as db:
            db.create_tables()
            logging.info('database {} created.'.format(fn))

    # read the location URL list from the config file
    urls = load_config()
    # crawl pipeline (currently disabled):
    # snippets -> hotels -> reviews -> users
    # for url in urls:
    #     gid = re.sub('\D', '', url)
    #     crawlSnippets.start(gid, url.strip())
    #     crawlHotels.start(gid)
    #     crawlReviews.start(gid)
    # crawlUsers.start()

    # post-processing / extraction step (currently disabled):
    # with taDB(common.TA_DB) as db:
    #     db.extract_hotel_info()
    #     db.extract_review_info()
    #     db.extract_user_info()
Пример #10
0
    def gather_hotels(title):
        """Worker loop: scrape one listing page per queued page index.

        Pulls 0-based page indexes from the shared queue until a ``None``
        sentinel arrives; *title* is used only for log labelling.
        """
        def calc_max_page(soup_container):
            # pages needed to cover every hotel at HOTEL_PER_PAGE per page
            return math.ceil(find_num_hotels(soup_container) / HOTEL_PER_PAGE)

        def find_hotel_ids(url_str):
            # Return a list of single-entry {hotel_id: relative_url} dicts
            # scraped from the listing page at *url_str*.
            soup_container = common.load_soup_online(url_str)
            hdr = soup_container.find('div', class_='hdrTxt')
            if num_page == 1 and hdr is not None:
                # single-page result: hotel divs sit before the header div
                divs_soup = hdr.find_all_previous(
                    'div', id=re.compile('^HOTELDEAL\d+'))
            else:
                divs_soup = soup_container.find_all(
                    'div', id=re.compile('^HOTELDEAL\d+'))

            page_pairs = []
            for link in divs_soup:
                # strip the 'HOTELDEAL' prefix: len('HOTELDEAL') == 9
                pair_hid = link['id'][9:]
                pair_url = link.find('div',
                                     class_='listing_title').find('a')['href']
                # drop the leading '/' from the href
                page_pairs.append({pair_hid: pair_url[1:]})
            return page_pairs

        def update_hotel_ids(new_pairs, pair_list):
            # Merge new pairs into the shared dict, skipping known ids.
            for new_pair in new_pairs:
                pair_key, pair_value = next(iter(new_pair.items()))
                # if hotel id not duplicate
                if pair_key not in pair_list:
                    pair_list[pair_key] = pair_value

        while True:
            # logger.info('[worker {}] running'.format(title))
            pid = que.get()
            if pid is None:
                # None is the shutdown sentinel
                logger.info('[worker {}] shutting down'.format(title))
                break
            # query string for page *pid* of the hotel listing
            paras = '&'.join([
                'seen=0', 'sequence=1', 'geo=' + gid,
                'requestingServlet=Hotels', 'refineForm=true', 'hs=',
                'adults=2', 'rooms=1', 'o=a' + str(pid * HOTEL_PER_PAGE),
                'pageSize=&rad=0', 'dateBumped=NONE',
                'displayedSortOrder=popularity'
            ])
            page_url = ''.join([init_url, '?', paras])
            logger.info('[page {}] {}'.format(pid + 1, page_url))
            # print('aa')
            hotels = find_hotel_ids(page_url)
            # print('bb')
            if hotels is None:
                que.put(pid)
            elif len(hotels) < HOTEL_PER_PAGE and pid < num_page - 1:
                # non-final page came back short: retry it
                que.put(pid)
            elif pid == num_page - 1 \
                    and len(hotels) < num_hotel % HOTEL_PER_PAGE:
                # final page short of the expected remainder: retry it
                que.put(pid)
            else:
                with lock:
                    update_hotel_ids(hotels, hid_pairs)
                    logger.info('\t#{}, totaling {}'.format(
                        pid, len(hid_pairs)))
                    # checkpoint the accumulated pairs for this location
                    with taDB(common.TA_DB) as db:
                        record = [gid, str(hid_pairs)]
                        db.insert_a_location(record)

            time.sleep(common.SLEEP_TIME)
            que.task_done()
Пример #11
0
def start(gid, init_url):
    """Collect every hotel id for location *gid*, starting from *init_url*.

    Repeatedly spawns SNIPPET_THREAD_NUM worker threads to scrape all
    listing pages until the local cache holds at least as many hotels as
    the site reports, checkpointing results to the database as it goes.
    """
    def gather_hotels(title):
        # Worker loop: scrape one listing page per queued page index.
        def calc_max_page(soup_container):
            # pages needed to cover every hotel at HOTEL_PER_PAGE per page
            return math.ceil(find_num_hotels(soup_container) / HOTEL_PER_PAGE)

        def find_hotel_ids(url_str):
            # Return single-entry {hotel_id: relative_url} dicts scraped
            # from the listing page at *url_str*.
            soup_container = common.load_soup_online(url_str)
            hdr = soup_container.find('div', class_='hdrTxt')
            # FIX: raw strings for the regex — '\d' is an invalid escape
            # sequence in a plain literal (warning in modern Python)
            if num_page == 1 and hdr is not None:
                # single-page result: hotel divs sit before the header div
                divs_soup = hdr.find_all_previous(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))
            else:
                divs_soup = soup_container.find_all(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))

            page_pairs = []
            for link in divs_soup:
                # strip the 'HOTELDEAL' prefix: len('HOTELDEAL') == 9
                # (original comment wrongly said 6)
                pair_hid = link['id'][9:]
                pair_url = link.find('div',
                                     class_='listing_title').find('a')['href']
                # drop the leading '/' from the href
                page_pairs.append({pair_hid: pair_url[1:]})
            return page_pairs

        def update_hotel_ids(new_pairs, pair_list):
            # Merge new pairs into the shared dict, skipping known ids.
            for new_pair in new_pairs:
                pair_key, pair_value = next(iter(new_pair.items()))
                # if hotel id not duplicate
                if pair_key not in pair_list:
                    pair_list[pair_key] = pair_value

        while True:
            pid = que.get()
            if pid is None:
                # None is the shutdown sentinel
                logger.info('[worker {}] shutting down'.format(title))
                break
            # query string for page *pid* of the hotel listing
            paras = '&'.join([
                'seen=0', 'sequence=1', 'geo=' + gid,
                'requestingServlet=Hotels', 'refineForm=true', 'hs=',
                'adults=2', 'rooms=1', 'o=a' + str(pid * HOTEL_PER_PAGE),
                'pageSize=&rad=0', 'dateBumped=NONE',
                'displayedSortOrder=popularity'
            ])
            page_url = ''.join([init_url, '?', paras])
            logger.info('[page {}] {}'.format(pid + 1, page_url))
            hotels = find_hotel_ids(page_url)
            if hotels is None:
                que.put(pid)
            elif len(hotels) < HOTEL_PER_PAGE and pid < num_page - 1:
                # non-final page came back short: retry it
                que.put(pid)
            elif pid == num_page - 1 \
                    and len(hotels) < num_hotel % HOTEL_PER_PAGE:
                # final page short of the expected remainder: retry it
                que.put(pid)
            else:
                with lock:
                    update_hotel_ids(hotels, hid_pairs)
                    logger.info('\t#{}, totaling {}'.format(
                        pid, len(hid_pairs)))
                    # checkpoint the accumulated pairs for this location
                    with taDB(common.TA_DB) as db:
                        record = [gid, str(hid_pairs)]
                        db.insert_a_location(record)

            time.sleep(common.SLEEP_TIME)
            que.task_done()

    # location name sits between the geo id and the trailing '-' in the URL
    # (removed unused local 's1 = gid')
    loc_name = init_url[init_url.index(gid) + len(gid) +
                        1:init_url.rindex('-')]
    logger.info('[location {}] {}'.format(gid, loc_name.replace('_', ' ')))
    soup = common.load_soup_online(init_url)
    num_page = find_max_page(soup)
    logger.info('hotels Pages {} '.format(num_page))

    num_hotel = find_num_hotels(soup)
    logger.info('{} hotels in {} pages'.format(num_hotel, num_page))

    with taDB(common.TA_DB) as iodb:
        hid_pairs = iodb.get_hotel_url_pairs(gid)
    logger.info('{} hotels in local cache'.format(len(hid_pairs)))

    # collecting hotel ids might take multiple iterations
    while len(hid_pairs) < num_hotel:
        que = queue.Queue()

        threads = []
        thread_size = common.SNIPPET_THREAD_NUM
        for j in range(thread_size):
            # FIX: args must be a tuple — (str(j + 1)) was a bare string,
            # which Thread iterates char by char (breaks 2-digit titles)
            t = threading.Thread(target=gather_hotels, args=(str(j + 1),))
            t.start()
            threads.append(t)

        # set start value to math.ceil(len(hid_pairs) / HOTEL_PER_PAGE)
        # rather than 0 if the hotels are ordered in the list
        for x in range(num_page):
            que.put(x)

        que.join()

        # one sentinel per worker, then wait for all of them to exit
        for k in range(thread_size):
            que.put(None)
        for t in threads:
            t.join()

    # (dropped the no-op .format() call — the message has no placeholder)
    logger.info('all hotel ids are ready')
Пример #12
0
def start():
    """Fetch and store profile HTML for every user id found in reviews.

    Generates the unique-user table, spawns USER_THREAD_NUM workers,
    enqueues every user whose cached profile fails validation, then shuts
    the workers down with ``None`` sentinels.
    """
    def user_is_valid(uid):
        # True when a stored profile exists and still contains the
        # MODULES_MEMBER_CENTER container div.
        with taDB(common.TA_DB) as db:
            user_record = db.read_a_user(uid)
        if user_record is None:
            return False
        html = user_record[0]
        if html is None:
            return False
        soup = common.load_soup_string(html)
        if soup.find('div', id='MODULES_MEMBER_CENTER') is None:
            logger.info('[user {}] FAILED: corrupted'.format(uid))
            return False
        else:
            logger.info('[user {}] PASSED: verified'.format(uid))
            return True

    def gather_profiles(title):
        # Worker loop: fetch overlay + full profile HTML per queued user id.
        while True:
            uid = que.get()
            if uid is None:
                # None is the shutdown sentinel
                logger.info('[worker {}] shutting down'.format(title))
                break
            logger.info('[user {}]'.format(uid))
            url = TA_ROOT + 'MemberOverlay?uid=' + uid
            simple_soup = common.load_soup_online(url).find(
                'div', class_='memberOverlay')
            # the overlay links to the full profile in one of two URL forms
            profile_url = re.search('(?<=")/members/.+(?=")', str(simple_soup))
            if profile_url is None:
                profile_url = re.search(
                    '(?<=")/MemberProfile-a_uid.[A-Z0-9]+(?=")',
                    str(simple_soup))

            result = []
            if profile_url is not None:
                profile_url = TA_ROOT + profile_url.group(0).strip()
                result.append(simple_soup.prettify())
                detail_soup = common.load_soup_online(profile_url)
                member_soup = detail_soup.find('div',
                                               id='MODULES_MEMBER_CENTER')
                if member_soup is not None:
                    # persist overlay + member-center HTML joined by CRLF
                    result.append(member_soup.prettify())
                    record = [uid, '\r\n'.join(result)]
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.insert_a_user(record)
                else:
                    if '404' in detail_soup.find('title').string:
                        # profile no longer exists: purge id from reviews
                        with lock:
                            with taDB(common.TA_DB) as db:
                                db.remove_user_id_in_review(uid)
                                logger.info('\tuser id removed')
                    else:
                        # transient failure: requeue for another attempt
                        logger.info('\tfailed to fetch full profile')
                        que.put(uid)
            else:
                logger.info('\tno profile url')
                que.put(uid)

            time.sleep(common.SLEEP_TIME)
            que.task_done()

    # extract unique user ids from reviews
    logger.info('retrieving users...')
    with taDB(common.TA_DB) as iodb:
        iodb.generate_unique_users()
        uids = iodb.read_all_user_ids()
    logger.info('{} users found'.format(len(uids)))

    que = queue.Queue()

    threads = []
    thread_size = common.USER_THREAD_NUM
    for j in range(thread_size):
        # FIX: args must be a tuple — (str(j + 1)) was a bare string, which
        # Thread iterates character by character (breaks 2-digit titles)
        t = threading.Thread(target=gather_profiles, args=(str(j + 1),))
        t.start()
        threads.append(t)

    # enqueue only users whose cached profile fails validation
    # (plain loop instead of a side-effect list comprehension)
    for x in uids:
        if not user_is_valid(x):
            que.put(x)

    que.join()

    # one sentinel per worker, then wait for all of them to exit
    for k in range(thread_size):
        que.put(None)
    for t in threads:
        t.join()

    logger.info('all user ids are ready')