def get_post_content_and_time( post_url, post_type, post_name, time_last_time ):
    """Fetch one forum post and store it when it is newer than the last crawl.

    The post page is requested, its summary is read from the
    ``<meta name="description">`` tag, and its time is taken from the
    "edited at" marker inside the summary or, failing that, from the
    "published at" ``<em>`` element of the page. When that time is later than
    ``time_last_time`` a row is inserted into the ``posts`` table.

    Args:
        post_url: URL of the post relative to ``http://www.hi-pda.com/forum/``.
        post_type: Value stored in the ``post_type`` column.
        post_name: Post title, stored in the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, compared against
            ``datetime.timestamp()`` of the parsed post time.

    Returns:
        The ``Post`` built for the inserted row, or ``None`` when the post
        time could not be determined or is not after ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer' : 'http://www.hi-pda.com/forum/',
        'Host'    : 'www.hi-pda.com'
    }

    logging.info( 'Get post[%s] by url[%s].', post_name, post_url )

    # Fetch the post page by URL.
    post_request = HttpRequest( post_full_url, None, post_headers )
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode( 'gbk' )
    except UnicodeDecodeError as e:
        logging.error( 'Decode post response content failed.' )
        logging.exception( e )
        # Keep going with a lossy decode: the str regexes below cannot be
        # applied to the raw bytes object.
        post_resp_content = post_resp_content.decode( 'gbk', errors = 'replace' )

    # Parse the post summary from the page.
    re_pattern_content = re.compile( r'''<meta name="description" content="(.*)" />''' )
    result_content = re_pattern_content.search( post_resp_content )

    post_content = None
    post_update_time = None

    if result_content is None:
        logging.warning( 'Request failed.' )
        logging.warning( 'Get post conetent failed.' )
    else:
        post_content = result_content.group( 1 )
        # Parse the last-edit time from the summary.
        re_pattern_update_time = re.compile( r'''于 (.*) 编辑''' )
        result_update_time = re_pattern_update_time.search( post_content )
        if result_update_time is not None:
            post_update_time = result_update_time.group( 1 )

    if post_update_time is None:
        # No edit marker: parse the publish time from the page instead.
        re_pattern_create_time = re.compile( r'''<em id=".+">发表于 (.+)</em>''' )
        result_create_time = re_pattern_create_time.search( post_resp_content )
        if result_create_time is None:
            # Without any time there is nothing to compare or store.
            logging.warning( 'Get post time failed.' )
            return None
        post_create_time = result_create_time.group( 1 )
    else:
        post_create_time = post_update_time

    try:
        post_create_time_datetime = datetime.strptime( post_create_time, '%Y-%m-%d %H:%M' )
    except ValueError:
        # The regexes are permissive, so the captured text may not be a time.
        logging.warning( 'Parse post time[%s] failed.', post_create_time )
        return None
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts newer than the previous crawl are targets of this crawl.
    if post_create_time_stamp <= time_last_time:
        logging.info( 'Post time[%s] is not after last time.', post_create_time_datetime )
        return None

    # Lazy %-formatting: post_content may be None here and must not break logging.
    logging.info( 'post_type:%s', post_type )
    logging.info( 'post_name:%s', post_name )
    logging.info( 'post_url:%s', post_full_url )
    logging.info( 'post_create_time:%s', post_create_time )
    logging.info( 'post_content:%s', post_content )

    post_id = next_id()
    post = Post( id = post_id, post_type = post_type, post_title = post_name, post_owner = 'hipda', post_content = post_content, post_link = post_full_url, post_time = post_create_time )

    conn = mysql.connector.connect( user = db_user, password = db_passwd, database = db_name )
    try:
        cursor = conn.cursor()
        try:
            cursor.execute( 'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)', [post_id, post_type, post_name, 'hipda', post_content, post_full_url, post_create_time_stamp, post_create_time_stamp] )
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()

    time.sleep( 1 )
    return post
            get_post_time_last_time_stamp = 0
        else:
            get_post_time_last_time = datetime.strptime( get_post_time_last_time, '%Y-%m-%d %H:%M:%S' )
            get_post_time_last_time_stamp = get_post_time_last_time.timestamp()

        logging.info( '************last time: %s************' % get_post_time_last_time )

        #using cookieJar & HTTPCookieProcessor to automatically handle cookies
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        urllib.request.install_opener(opener)

        pda_url = 'http://www.hi-pda.com/'
        pda_request = HttpRequest( pda_url )
        pda_request.send_request()
        pda_resp_content = pda_request.get_resp_content()

        formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/'
        formhash_request = HttpRequest( formhash_url, None, { 'Host' : 'www.hi-pda.com' } )
        formhash_request.send_request()
        formhash_resp_content = formhash_request.get_resp_content()
        try:
            formhash_resp_content = formhash_resp_content.decode('gbk')
        except UnicodeDecodeError as e:
            logging.error( 'Decode formhash response content failed.' )
            logging.exception( e )

        # print( formhash_resp_content )
        # <input type="hidden" name="formhash" value="2f68efff" />
        re_formhash = re.compile( r'''<input type="hidden" name="formhash" value="(.+)" />''' )
        formhash = re_formhash.search( formhash_resp_content )
Exemplo n.º 3
0
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one forum post and store it when it is newer than the last crawl.

    The post page is requested, its summary is read from the
    ``<meta name="description">`` tag, and its time is taken from the
    "edited at" marker inside the summary or, failing that, from the
    "published at" ``<em>`` element of the page. When that time is later than
    ``time_last_time`` a row is inserted into the ``posts`` table.

    Args:
        post_url: URL of the post relative to ``http://www.hi-pda.com/forum/``.
        post_type: Value stored in the ``post_type`` column.
        post_name: Post title, stored in the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, compared against
            ``datetime.timestamp()`` of the parsed post time.

    Returns:
        The ``Post`` built for the inserted row, or ``None`` when the post
        time could not be determined or is not after ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com'
    }

    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    # Fetch the post page by URL.
    post_request = HttpRequest(post_full_url, None, post_headers)
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError as e:
        logging.error('Decode post response content failed.')
        logging.exception(e)
        # Keep going with a lossy decode: the str regexes below cannot be
        # applied to the raw bytes object.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # Parse the post summary from the page.
    re_pattern_content = re.compile(
        r'''<meta name="description" content="(.*)" />''')
    result_content = re_pattern_content.search(post_resp_content)

    post_content = None
    post_update_time = None

    if result_content is None:
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
    else:
        post_content = result_content.group(1)
        # Parse the last-edit time from the summary.
        re_pattern_update_time = re.compile(r'''于 (.*) 编辑''')
        result_update_time = re_pattern_update_time.search(post_content)
        if result_update_time is not None:
            post_update_time = result_update_time.group(1)

    if post_update_time is None:
        # No edit marker: parse the publish time from the page instead.
        re_pattern_create_time = re.compile(r'''<em id=".+">发表于 (.+)</em>''')
        result_create_time = re_pattern_create_time.search(post_resp_content)
        if result_create_time is None:
            # Without any time there is nothing to compare or store.
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)
    else:
        post_create_time = post_update_time

    try:
        post_create_time_datetime = datetime.strptime(post_create_time,
                                                      '%Y-%m-%d %H:%M')
    except ValueError:
        # The regexes are permissive, so the captured text may not be a time.
        logging.warning('Parse post time[%s] failed.', post_create_time)
        return None
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts newer than the previous crawl are targets of this crawl.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.',
                     post_create_time_datetime)
        return None

    # Lazy %-formatting: post_content may be None here and must not break
    # logging.
    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    conn = mysql.connector.connect(user=db_user,
                                   password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [
                    post_id, post_type, post_name, 'hipda', post_content,
                    post_full_url, post_create_time_stamp,
                    post_create_time_stamp
                ])
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()

    time.sleep(1)
    return post
Exemplo n.º 4
0
def get_post_content_and_time( post_url, post_type, post_name, time_last_time ):
    """Fetch one forum post and store it when it is newer than the last crawl.

    The post page is requested, its summary is read from the
    ``<meta name="description">`` tag, and its time is taken from the
    "edited at" marker inside the summary or, failing that, from the
    "published at" ``<em>`` element of the page. When that time is later than
    ``time_last_time`` a row is inserted into the ``posts`` table.

    Args:
        post_url: URL of the post relative to ``http://www.hi-pda.com/forum/``.
        post_type: Value stored in the ``post_type`` column.
        post_name: Post title, stored in the ``post_title`` column.
        time_last_time: Timestamp of the previous crawl, compared against
            ``datetime.timestamp()`` of the parsed post time.

    Returns:
        The ``Post`` built for the inserted row, or ``None`` when the request
        timed out, the post time could not be determined, or the post is not
        after ``time_last_time``.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer' : 'http://www.hi-pda.com/forum/',
        'Host'    : 'www.hi-pda.com'
    }

    logging.info( 'Get post[%s] by url[%s].', post_name, post_url )

    post_request = HttpRequest( post_full_url, None, post_headers )
    try:
        post_request.send_request()
    except TimeoutError:
        logging.warning( " Request url[%s] failed. ", post_full_url )
        # No response was received, so there is nothing to parse.
        return None
    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode( 'gbk' )
    except UnicodeDecodeError as e:
        logging.error( 'Decode post response content failed.' )
        logging.exception( e )
        # Keep going with a lossy decode: the str regexes below cannot be
        # applied to the raw bytes object.
        post_resp_content = post_resp_content.decode( 'gbk', errors = 'replace' )

    # Example of the tag being parsed:
    # <meta name="description" content=" Hi!PDA  本帖最后由 <user> 于 2015-12-1 22:59 编辑 ... - Board" />
    re_pattern_content = re.compile( r'''<meta name="description" content="(.*)" />''' )
    result_content = re_pattern_content.search( post_resp_content )

    post_content = None
    post_update_time = None

    if result_content is None:
        logging.warning( 'Request failed.' )
        logging.warning( 'Get post conetent failed.' )
    else:
        post_content = result_content.group( 1 )
        # Look for the last-edit time inside the summary.
        re_pattern_update_time = re.compile( r'''于 (.*) 编辑''' )
        result_update_time = re_pattern_update_time.search( post_content )
        if result_update_time is not None:
            post_update_time = result_update_time.group( 1 )

    if post_update_time is None:
        # No edit marker: use the publish time from the page instead.
        re_pattern_create_time = re.compile( r'''<em id=".+">发表于 (.+)</em>''' )
        result_create_time = re_pattern_create_time.search( post_resp_content )
        if result_create_time is None:
            # Without any time there is nothing to compare or store.
            logging.warning( 'Get post time failed.' )
            return None
        post_create_time = result_create_time.group( 1 )
    else:
        post_create_time = post_update_time

    try:
        post_create_time_datetime = datetime.strptime( post_create_time, '%Y-%m-%d %H:%M' )
    except ValueError:
        # The regexes are permissive, so the captured text may not be a time.
        logging.warning( 'Parse post time[%s] failed.', post_create_time )
        return None
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts newer than the previous crawl are stored.
    if post_create_time_stamp <= time_last_time:
        logging.info( 'Post time[%s] is not after last time.', post_create_time_datetime )
        return None

    # Lazy %-formatting: post_content may be None here and must not break logging.
    logging.info( 'post_type:%s', post_type )
    logging.info( 'post_name:%s', post_name )
    logging.info( 'post_url:%s', post_full_url )
    logging.info( 'post_create_time:%s', post_create_time )
    logging.info( 'post_content:%s', post_content )

    post_id = next_id()
    post = Post( id = post_id, post_type = post_type, post_title = post_name, post_owner = 'hipda', post_content = post_content, post_link = post_full_url, post_time = post_create_time )

    conn = mysql.connector.connect( user = db_user, password = db_passwd, database = db_name )
    try:
        cursor = conn.cursor()
        try:
            cursor.execute( 'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)', [post_id, post_type, post_name, 'hipda', post_content, post_full_url, post_create_time_stamp, post_create_time_stamp] )
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()

    time.sleep( 1 )
    return post
Exemplo n.º 5
0
                get_post_time_last_time, '%Y-%m-%d %H:%M:%S')
            get_post_time_last_time_stamp = get_post_time_last_time.timestamp()

        logging.info('************last time: %s************' %
                     get_post_time_last_time)

        #using cookieJar & HTTPCookieProcessor to automatically handle cookies
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj))
        urllib.request.install_opener(opener)

        pda_url = 'http://www.hi-pda.com/'
        pda_request = HttpRequest(pda_url)
        pda_request.send_request()
        pda_resp_content = pda_request.get_resp_content()

        formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/'
        formhash_request = HttpRequest(formhash_url, None,
                                       {'Host': 'www.hi-pda.com'})
        formhash_request.send_request()
        formhash_resp_content = formhash_request.get_resp_content()
        try:
            formhash_resp_content = formhash_resp_content.decode('gbk')
        except UnicodeDecodeError as e:
            logging.error('Decode formhash response content failed.')
            logging.exception(e)

        # print( formhash_resp_content )
        # <input type="hidden" name="formhash" value="2f68efff" />
        re_formhash = re.compile(