def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one forum post and store it if it is newer than the last crawl.

    The post page is requested, its ``<meta name="description">`` summary and
    its last-edit time (or, failing that, its publication time) are parsed
    out of the HTML, and a row is inserted into the ``posts`` table when that
    time is later than ``time_last_time``.

    Args:
        post_url: Post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: Post category; written to the ``post_type`` column.
        post_name: Post title; written to the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, comparable with
            ``datetime.timestamp()``; only strictly later posts are stored.

    Returns:
        The ``Post`` built for a newly stored post, or ``None`` when the page
        could not be parsed or the post is not newer than ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    # Request the post page by URL.
    post_request = HttpRequest(post_full_url, None, post_headers)
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    if not post_resp_content:
        # NOTE(review): get_resp_content() is defined elsewhere; presumably an
        # empty/None body means the request failed -- confirm.
        logging.warning('Request failed.')
        return None

    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError:
        logging.exception('Decode post response content failed.')
        # Fall back to a lossy decode: running the str patterns below against
        # undecoded bytes would raise TypeError.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # Parse the post summary out of the page's meta description.
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the last-edit time embedded in the summary ("于 <time> 编辑").
    # Non-greedy so a later " 编辑" in the summary text is not swallowed.
    result_update_time = re.search(r'''于 (.+?) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        # Otherwise fall back to the publication time ("发表于 <time>") in the
        # page body; the first match on the page is used.
        result_create_time = re.search(
            r'''<em id="[^"]+">发表于 (.+?)</em>''', post_resp_content)
        if result_create_time is None:
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    try:
        post_create_time_datetime = datetime.strptime(
            post_create_time, '%Y-%m-%d %H:%M')
    except ValueError:
        logging.warning('Parse post time[%s] failed.', post_create_time)
        return None
    # The parsed datetime is naive, so timestamp() interprets it as local time.
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts strictly newer than the previous crawl are stored.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.',
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with an explicit INSERT rather than via the Post
    # object; try/finally releases cursor and connection even on failure.
    conn = mysql.connector.connect(user=db_user, password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [post_id, post_type, post_name, 'hipda', post_content,
                 post_full_url, post_create_time_stamp,
                 post_create_time_stamp])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause after each stored post (presumably to throttle the crawl).
    time.sleep(1)
    return post
get_post_time_last_time_stamp = 0 else: get_post_time_last_time = datetime.strptime( get_post_time_last_time, '%Y-%m-%d %H:%M:%S' ) get_post_time_last_time_stamp = get_post_time_last_time.timestamp() logging.info( '************last time: %s************' % get_post_time_last_time ) #using cookieJar & HTTPCookieProcessor to automatically handle cookies cj = http.cookiejar.CookieJar() opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) urllib.request.install_opener(opener) pda_url = 'http://www.hi-pda.com/' pda_request = HttpRequest( pda_url ) pda_request.send_request() pda_resp_content = pda_request.get_resp_content() formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/' formhash_request = HttpRequest( formhash_url, None, { 'Host' : 'www.hi-pda.com' } ) formhash_request.send_request() formhash_resp_content = formhash_request.get_resp_content() try: formhash_resp_content = formhash_resp_content.decode('gbk') except UnicodeDecodeError as e: logging.error( 'Decode formhash response content failed.' ) logging.exception( e ) # print( formhash_resp_content ) # <input type="hidden" name="formhash" value="2f68efff" /> re_formhash = re.compile( r'''<input type="hidden" name="formhash" value="(.+)" />''' ) formhash = re_formhash.search( formhash_resp_content )
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one forum post and store it if it is newer than the last crawl.

    The post page is requested, its ``<meta name="description">`` summary and
    its last-edit time (or, failing that, its publication time) are parsed
    out of the HTML, and a row is inserted into the ``posts`` table when that
    time is later than ``time_last_time``.

    Args:
        post_url: Post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: Post category; written to the ``post_type`` column.
        post_name: Post title; written to the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, comparable with
            ``datetime.timestamp()``; only strictly later posts are stored.

    Returns:
        The ``Post`` built for a newly stored post, or ``None`` when the page
        could not be parsed or the post is not newer than ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    # Request the post page by URL.
    post_request = HttpRequest(post_full_url, None, post_headers)
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    if not post_resp_content:
        # NOTE(review): get_resp_content() is defined elsewhere; presumably an
        # empty/None body means the request failed -- confirm.
        logging.warning('Request failed.')
        return None

    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError:
        logging.exception('Decode post response content failed.')
        # Fall back to a lossy decode: running the str patterns below against
        # undecoded bytes would raise TypeError.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # Parse the post summary out of the page's meta description.
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the last-edit time embedded in the summary ("于 <time> 编辑").
    # Non-greedy so a later " 编辑" in the summary text is not swallowed.
    result_update_time = re.search(r'''于 (.+?) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        # Otherwise fall back to the publication time ("发表于 <time>") in the
        # page body; the first match on the page is used.
        result_create_time = re.search(
            r'''<em id="[^"]+">发表于 (.+?)</em>''', post_resp_content)
        if result_create_time is None:
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    try:
        post_create_time_datetime = datetime.strptime(
            post_create_time, '%Y-%m-%d %H:%M')
    except ValueError:
        logging.warning('Parse post time[%s] failed.', post_create_time)
        return None
    # The parsed datetime is naive, so timestamp() interprets it as local time.
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts strictly newer than the previous crawl are stored.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.',
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with an explicit INSERT rather than via the Post
    # object; try/finally releases cursor and connection even on failure.
    conn = mysql.connector.connect(user=db_user, password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [post_id, post_type, post_name, 'hipda', post_content,
                 post_full_url, post_create_time_stamp,
                 post_create_time_stamp])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause after each stored post (presumably to throttle the crawl).
    time.sleep(1)
    return post
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one forum post and store it if it is newer than the last crawl.

    The post page is requested, its ``<meta name="description">`` summary and
    its last-edit time (or, failing that, its publication time) are parsed
    out of the HTML, and a row is inserted into the ``posts`` table when that
    time is later than ``time_last_time``.

    Args:
        post_url: Post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: Post category; written to the ``post_type`` column.
        post_name: Post title; written to the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, comparable with
            ``datetime.timestamp()``; only strictly later posts are stored.

    Returns:
        The ``Post`` built for a newly stored post, or ``None`` when the
        request timed out, the page could not be parsed, or the post is not
        newer than ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    post_request = HttpRequest(post_full_url, None, post_headers)
    try:
        post_request.send_request()
    except TimeoutError:
        # A timed-out request has no response worth parsing; give up on this
        # post instead of reading the body of a failed request.
        logging.warning(" Request url[%s] failed. ", post_full_url)
        return None

    post_resp_content = post_request.get_resp_content()
    if not post_resp_content:
        # NOTE(review): get_resp_content() is defined elsewhere; presumably an
        # empty/None body means the request failed -- confirm.
        logging.warning('Request failed.')
        return None

    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError:
        logging.exception('Decode post response content failed.')
        # Fall back to a lossy decode: running the str patterns below against
        # undecoded bytes would raise TypeError.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # The summary lives in the meta description, e.g.
    #   <meta name="description" content=" Hi!PDA 本帖最后由 <user> 于
    #   2015-12-1 22:59 编辑 <post text> ... - Board" />
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the last-edit time embedded in the summary ("于 <time> 编辑").
    # Non-greedy so a later " 编辑" in the summary text is not swallowed.
    result_update_time = re.search(r'''于 (.+?) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        # Otherwise fall back to the publication time ("发表于 <time>") in the
        # page body; the first match on the page is used.
        result_create_time = re.search(
            r'''<em id="[^"]+">发表于 (.+?)</em>''', post_resp_content)
        if result_create_time is None:
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    try:
        post_create_time_datetime = datetime.strptime(
            post_create_time, '%Y-%m-%d %H:%M')
    except ValueError:
        logging.warning('Parse post time[%s] failed.', post_create_time)
        return None
    # The parsed datetime is naive, so timestamp() interprets it as local time.
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts strictly newer than the previous crawl are stored.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.',
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with an explicit INSERT rather than via the Post
    # object; try/finally releases cursor and connection even on failure.
    conn = mysql.connector.connect(user=db_user, password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [post_id, post_type, post_name, 'hipda', post_content,
                 post_full_url, post_create_time_stamp,
                 post_create_time_stamp])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause after each stored post (presumably to throttle the crawl).
    time.sleep(1)
    return post
get_post_time_last_time, '%Y-%m-%d %H:%M:%S') get_post_time_last_time_stamp = get_post_time_last_time.timestamp() logging.info('************last time: %s************' % get_post_time_last_time) #using cookieJar & HTTPCookieProcessor to automatically handle cookies cj = http.cookiejar.CookieJar() opener = urllib.request.build_opener( urllib.request.HTTPCookieProcessor(cj)) urllib.request.install_opener(opener) pda_url = 'http://www.hi-pda.com/' pda_request = HttpRequest(pda_url) pda_request.send_request() pda_resp_content = pda_request.get_resp_content() formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/' formhash_request = HttpRequest(formhash_url, None, {'Host': 'www.hi-pda.com'}) formhash_request.send_request() formhash_resp_content = formhash_request.get_resp_content() try: formhash_resp_content = formhash_resp_content.decode('gbk') except UnicodeDecodeError as e: logging.error('Decode formhash response content failed.') logging.exception(e) # print( formhash_resp_content ) # <input type="hidden" name="formhash" value="2f68efff" /> re_formhash = re.compile(