import logging
from datetime import datetime

import flask
import telegram

# constants, commands, the User model and the module-level `bot` instance are
# assumed to come from the surrounding project.


def webhook_handler():
    if flask.request.method == "POST":
        # Retrieve the message in JSON and then transform it to a Telegram object
        update = telegram.Update.de_json(flask.request.get_json(force=True), bot)
        if update.message:
            # Regular message
            text = update.message.text
            user_id = update.message.from_user.id
            chat_id = update.message.chat_id
            username = update.message.from_user.username
            message_id = None
        elif update.callback_query:
            # Callback query
            text = update.callback_query.data
            user_id = update.callback_query.from_user.id
            chat_id = update.callback_query.message.chat_id
            username = update.callback_query.from_user.username
            message_id = update.callback_query.message.message_id
        else:
            logging.error("Received unknown update!")
            return constants.RESPONSE_OK

        # The user must have a username
        if not username:
            bot.sendMessage(chat_id, constants.ERROR_NO_USERNAME)
            return constants.RESPONSE_OK

        # Retrieve or create the user
        user = User.get_by_id(user_id)
        if not user:
            # New user
            logging.info("User %s not found! Creating new user...", user_id)
            user = User(id=user_id, chat_id=chat_id, username=username)
            user.put()
        else:
            # Existing user
            user.last_activity_date = datetime.now()
            if username != user.username:
                logging.debug("User %s has changed username from %s to %s",
                              user_id, user.username, username)
                user.username = username
            user.put()

        commands.handle_input(user, text, message_id)
    return constants.RESPONSE_OK
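A minimal wiring sketch, assuming the handler above lives in a Flask app built on an older python-telegram-bot release (the camelCase `sendMessage`/`setWebhook` style it already uses). `app`, `TOKEN`, `WEBHOOK_URL` and `set_webhook` are hypothetical placeholders; `bot` mirrors the module-level instance the handler references.

import flask
import telegram

TOKEN = "123456:ABC-hypothetical-token"        # placeholder, not a real token
WEBHOOK_URL = "https://example.com/" + TOKEN   # hypothetical deployment URL

app = flask.Flask(__name__)
bot = telegram.Bot(token=TOKEN)

# Deliver POSTed Telegram updates to the handler defined above
app.add_url_rule("/" + TOKEN, "webhook", webhook_handler, methods=["POST"])


def set_webhook():
    # One-off call (e.g. at deploy time) telling Telegram where to send updates
    bot.setWebhook(WEBHOOK_URL)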
import urllib2
from Queue import Empty
from time import sleep

import lxml.html as H

# The XPath constants, the regex patterns (shop_id_pattern, number_pattern,
# pageno_pattern), the Comment/User models and the log()/fill_date() helpers
# are assumed to be defined elsewhere in the crawler module.


def crawler_comment_thread(requester, queue, index):
    fout = file('./tmp/comment.part.%d' % index, 'w')
    ferr = file('./tmp/comment.err.part.%d' % index, 'w')
    failure_count = 1
    while not queue.empty():
        try:
            url, count = queue.get(True, 30)
            if count == 10:
                log('10:' + url + '\n', ferr)
                continue
            html = requester.get(url)
            log('%d:%d:%s' % (queue.qsize(), index, url), fout)
            page = H.document_fromstring(html)
            shop_id = shop_id_pattern.search(url).group()
            comment_list_node = page.xpath(COMMENT_LIST_XPATH)
            for comment_block in comment_list_node:
                comment = Comment()
                user = User()
                comment.comment_id = comment_block.xpath(COMMENT_ID_XPATH)[0]
                comment.shop_id = shop_id
                user.user_id = comment_block.xpath(USER_ID_XPATH)[0]
                comment.user_id = user.user_id
                username_node = comment_block.xpath(USER_NAME_XPATH)
                if len(username_node) < 1:
                    log('no username:%s' % url, ferr)
                    continue
                user.username = username_node[0].text_content()
                score_node = comment_block.xpath(SCORE_XPATH)
                comment.star = int(number_pattern.search(
                    score_node[0].split(' ')[1]).group()) / 10 if len(score_node) > 0 else 0
                average_node = comment_block.xpath(AVERAGE_PER_XPATH)
                comment.average = number_pattern.search(
                    average_node[0].text_content()).group() if len(average_node) > 0 else 0
                comment.date = fill_date(
                    str(comment_block.xpath(DATE_XPATH)[0].text_content()
                        .encode('utf-8')).split('\xc2\xa0')[0],
                    DATE_FORMAT)
                content_extra_node = comment_block.xpath(CONTENT_EXTRA_XPATH)
                if len(content_extra_node) > 0:
                    comment.content = content_extra_node[0].text_content().strip()
                else:
                    comment.content = comment_block.xpath(CONTENT_XPATH)[0].text_content().strip()
                other_score_node = comment_block.xpath(OTHER_SCORE_XPATH)
                comment.taste_score = 0
                comment.envir_score = 0
                comment.service_score = 0
                for each_node in other_score_node:
                    if TASTE_TAG in each_node.text_content():
                        comment.taste_score = each_node.text_content()[2]
                    elif ENVIR_TAG in each_node.text_content():
                        comment.envir_score = each_node.text_content()[2]
                    elif SERVICE_TAG in each_node.text_content():
                        comment.service_score = each_node.text_content()[2]
                #has_other_score = len(other_score_node) > 0
                #comment.taste_score = number_pattern.search(other_score_node[0].text_content()).group() if has_other_score else 0
                #comment.envir_score = number_pattern.search(other_score_node[1].text_content()).group() if has_other_score else 0
                #comment.service_score = number_pattern.search(other_score_node[2].text_content()).group() if has_other_score else 0
                user.save()
                comment.save()
            next_page_node = page.xpath(COMMENT_NEXT_XPATH)
            if len(next_page_node) > 0:
                pageno = next_page_node[0]
                comment_url_prefix = pageno_pattern.sub('', url)
                next_url = comment_url_prefix + pageno
                queue.put((next_url, 1))
            failure_count = 1
        except Empty, e:
            log('%d:Empty' % index, fout)
            break
        except urllib2.HTTPError, e:
            if e.code != 404:
                # 403 forbidden: requeue the URL and back off before retrying
                queue.put((url, count))
                sleep(10 * failure_count)
                failure_count += 1
                if failure_count == 10:
                    log('%d:403:%s' % (index, url), fout)
            else:
                log('%d:404:error:%s' % (index, url), fout)
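A minimal driver sketch, assuming the worker above is meant to run as several threads sharing one queue of `(url, count)` tuples. `THREAD_NUM`, `seed_urls` and `start_comment_crawlers` are hypothetical names; only the tuple shape and the Python 2 `Queue` module follow from the worker itself.

from Queue import Queue
from threading import Thread

THREAD_NUM = 4
seed_urls = ['http://example.com/shop/12345/comments']  # hypothetical seed page


def start_comment_crawlers(requester):
    queue = Queue()
    for url in seed_urls:
        queue.put((url, 1))  # (url, count) tuples, as consumed by the worker
    threads = []
    for index in range(THREAD_NUM):
        t = Thread(target=crawler_comment_thread, args=(requester, queue, index))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()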