Exemplo n.º 1
0
def parse(message):
    """Parse an XML push message into a flat dict of its non-empty fields.

    The tag names (ToUserName, MsgType, ...) look like the WeChat
    public-platform message format -- TODO confirm against the caller.

    Before scraping, every ``<![CDATA[...]]>`` wrapper has its outer angle
    brackets stripped (via ``cdatarepl``), because the template below matches
    the bare ``![CDATA[...]]`` form.

    Args:
        message: the raw XML text of the incoming message.

    Returns:
        A dict holding each captured field whose value is truthy, plus:
          * ``'message'``    -- the original, unmodified ``message`` text;
          * ``'createTime'`` -- converted to ``int``.
        Note that the Location_Y value is stored under the key
        ``'localtionY'`` (sic); the misspelling is kept so existing callers
        keep working.

    Raises:
        KeyError: if no non-empty ``createTime`` was captured.
        ValueError: if ``createTime`` is not an integer string.

    NOTE(review): ``<Scale>20</Scale>`` is a literal in the template, so the
    location group may only match when the scale is exactly 20 -- confirm
    against scrapemark's matching rules.
    """
    pattern=scrapemark.compile("""
     <xml>
     <ToUserName>![CDATA[{{ message.toUser }}]]</ToUserName>
     <FromUserName>![CDATA[{{ message.fromUser }}]]</FromUserName>
     <CreateTime>{{ message.createTime }}</CreateTime>
     <MsgType>![CDATA[{{ message.msgType }}]]</MsgType>
     {*
     <Content>![CDATA[{{ message.content }}]]</Content>
     *}
     {*
     <Location_X>{{ message.locationX }}</Location_X>
     <Location_Y>{{ message.localtionY }}</Location_Y>
     <Scale>20</Scale>
     <Label>![CDATA[{{ message.label }}]]</Label>
     *}
     {*
     <PicUrl>![CDATA[{{ message.picUrl }}]]</PicUrl>
     *}
     {*
     <Title>![CDATA[{{ message.title }}]]</Title>
     <Description>![CDATA[{{ message.description }}]]</Description>
     <Url>![CDATA[{{ message.url }}]]</Url>
     *}
     {*
     <Event>![CDATA[{{ message.event }}]]</Event>
     <EventKey>![CDATA[{{ message.eventkey }}]]</EventKey>
     *}
     {*
     <MsgId>{{ message.msgId }}</MsgId>
     *}
     </xml>
    """)
    # Strip the angle brackets around each CDATA section.
    #  * non-greedy ``.*?`` keeps every CDATA section separate even when
    #    several of them share one line (a greedy ``.*`` would swallow
    #    everything between the first ``<![CDATA[`` and the last ``]]>``);
    #  * ``(?s)`` lets a CDATA body span multiple lines (e.g. multi-line
    #    text content);
    #  * the raw string avoids the invalid ``\!`` string escape.
    unwrapped = re.sub(r'(?s)<(!\[CDATA\[.*?\]\])>', cdatarepl, message)
    scraped = pattern.scrape(html=unwrapped)['message']
    # Drop fields that were not captured (empty / None values).
    msg = {k: v for k, v in scraped.items() if v}
    msg['message']=message
    msg['createTime']=int(msg['createTime'])
    return msg
Exemplo n.º 2
0
            headers['User-Agent'] = user_agent
    if verbose:
        print 'fetching', url, '...'
    request = urllib2.Request(url, post, headers)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    res = opener.open(request).read()
    if verbose:
        print 'DONE fetching.'
    bs=BeautifulSoup(res).prettify()
    return bs

# Replace scrapemark's ``fetch_html`` attribute with fetch_beautified_html
# (presumably the function ending just above, which returns the fetched page
# run through BeautifulSoup's prettify()). Presumably this makes URL-based
# scrapes operate on the normalized markup -- confirm against scrapemark.
scrapemark.fetch_html=fetch_beautified_html

# Row template for the price table: each match contributes one entry to the
# ``fishes`` list with two fields -- ``image`` (the <img> src) and ``data``
# (the text of the following cell). The ``{* ... *}`` wrapper is scrapemark's
# repetition block (presumably "zero or more occurrences" -- see its docs).
pattern = scrapemark.compile("""
    {*
    <td valign="top" width="140"><img src="{{ [fishes].image }}" /></td><td>{{ [fishes].data }}</td>
    *}
    """)

def scrape(url='http://www.tcfishery.com/price/default.asp', page=7):
    """Yield one dict per fish row scraped from the price listing.

    Args:
        url: address of the price page to POST to. Defaults to the address
            that was previously hard-coded.
        page: value sent as the ``page`` POST field. Defaults to 7, the
            value that was previously hard-coded.

    Yields:
        Each entry of the ``'fishes'`` list produced by the module-level
        ``pattern`` (dicts with ``'image'`` and ``'data'`` keys).
    """
    # The request is only issued when the generator is first advanced.
    scraped = pattern.scrape(url=url, post={'page': page})
    for fish in scraped['fishes']:
        yield fish

if __name__ == "__main__":
    # Script entry point: split each scraped row's free-text cell into
    # structured fields and log the resulting record.
    for entry in scrape():
        tokens = entry['data'].split(' ')
        # Fixed positions within the space-separated cell text:
        # token 1 -> name, token 5 -> price (float), token 8 -> date.
        entry.update(
            name=tokens[1],
            price=float(tokens[5]),
            date=tokens[8],
        )
        # The raw cell text is no longer needed once it has been split up.
        del entry['data']
        logging.error(entry)
Exemplo n.º 3
0
from application import scrapemark
import logging
import re

# scrapemark template for an ``<xml>`` message. Every ``{{ message.X }}``
# placeholder stores its captured text under key ``X`` of the ``message``
# result.
#
# * CDATA sections appear here WITHOUT their outer angle brackets
#   (``![CDATA[...]]``); presumably the input is pre-processed with
#   ``cdatarepl`` (below) to strip them before scraping -- confirm with caller.
# * Each ``{* ... *}`` group is a scrapemark repetition block (presumably
#   "zero or more", which makes the content / location / picture parts
#   optional -- see the scrapemark docs).
# * The Location_Y value is captured under the key ``localtionY`` (sic);
#   consumers must use that exact spelling.
# * NOTE(review): ``<Scale>20</Scale>`` is a literal, not a placeholder, so the
#   location group may only match when the scale is exactly 20 -- confirm.
pattern = scrapemark.compile(
    """
     <xml>
     <ToUserName>![CDATA[{{ message.toUser }}]]</ToUserName>
     <FromUserName>![CDATA[{{ message.fromUser }}]]</FromUserName>
     <CreateTime>{{ message.createTime }}</CreateTime>
     <MsgType>![CDATA[{{ message.msgType }}]]</MsgType>
     {*
     <Content>![CDATA[{{ message.content }}]]</Content>
     *}
     {*
     <Location_X>{{ message.locationX }}</Location_X>
     <Location_Y>{{ message.localtionY }}</Location_Y>
     <Scale>20</Scale>
     <Label>![CDATA[{{ message.label }}]]</Label>*}
     {*
     <PicUrl>![CDATA[{{ message.picUrl }}]]</PicUrl>
     *}
     </xml>
    """
)


def cdatarepl(matchobj):
    """Replacement callback for ``re.sub``: keep only the first capture group.

    Args:
        matchobj: the regex match object handed over by ``re.sub``.

    Returns:
        The text of capture group 1, which replaces the whole match.
    """
    first_group = matchobj.group(1)
    return first_group