def _compile_stanza_pattern(author):
    """Return the compiled scrapemark pattern for posts written by *author*.

    The template text is substituted with ``string.Template`` (only the
    ``${author}`` placeholder) and then handed to ``scrapemark.compile``.
    """
    # Adjacent literals are concatenated by the parser; the resulting text is
    # exactly the original single-line template, just wrapped for readability.
    stanza_template = Template(
        u' {* <table id="firstAuthor"> <tr> <td> <a>${author}</a>'
        u' 发表日期:{{ [stanzas].datetime }} </td> </tr> </table>'
        u' <div id="pContentDiv"> <div class="post">'
        u' {{ [stanzas].content|html }} </div> </div> *}'
        u' {* <table> <tr> <td> <a>${author}</a>'
        u' 回复日期:{{ [stanzas].datetime }} </td> </tr> </table>'
        u' <div class="post"> {{ [stanzas].content|html }} </div> *} '
    )
    return scrapemark.compile(stanza_template.substitute(author=author))


def process(thread):
    """Scrape the author's stanzas from every page of *thread*.

    For each URL yielded by ``pages(thread)`` the page is scraped with a
    pattern built from ``thread['author']`` and the resulting ``'stanzas'``
    value is stored in ``thread['stanzas'][url]``.  Nothing is returned; the
    *thread* mapping is updated in place.

    Relies on names defined outside this block: ``pages``, ``scrapemark``,
    ``Template``, ``logging`` and the module-level ``encoding`` value.
    ``thread['stanzas']`` must already exist and support item assignment.

    The pattern depends only on the author, so it is compiled once (on the
    first page) instead of once per page.  As a consequence the author and
    the compiled pattern are logged once per call rather than once per page.
    This assumes ``thread['author']`` does not change while the pages are
    being iterated -- TODO confirm against ``pages``.

    NOTE(review): a second, zero-argument ``process`` definition is visible
    in the same source; if both live in one module the later definition
    shadows the earlier one -- confirm this is intended.
    """
    pattern = None
    for url in pages(thread):
        if pattern is None:
            # Compiled lazily inside the loop so that nothing is built or
            # logged when there are no pages, and so ``pages(thread)`` still
            # runs before ``thread['author']`` is read, as before.
            logging.info(thread['author'])
            pattern = _compile_stanza_pattern(thread['author'])
            logging.info(pattern)
        thread['stanzas'][url] = scrapemark.scrape(
            pattern, url=url, encoding=encoding)['stanzas']
        logging.info(thread['stanzas'][url])
def process(
        url='http://www.tianya.cn/publicforum/content/develop/1/905898.shtml',
        author=u'flp713'):
    """Scrape one forum page and return the stanzas posted by *author*.

    Parameters:
        url: address of the page to scrape.  Defaults to the thread page
            that used to be hard-coded here.
        author: user name substituted for ``${author}`` in the pattern.
            Defaults to the previously hard-coded ``u'flp713'``.

    Returns:
        The ``'stanzas'`` entry of the result of ``scrapemark.scrape``.

    Calling ``process()`` with no arguments behaves exactly as before.

    Relies on names defined outside this block: ``scrapemark``, ``Template``
    and the module-level ``encoding`` value.

    NOTE(review): another ``process(thread)`` definition is visible in the
    same source; if both live in one module the later definition shadows the
    earlier one -- confirm this is intended.
    """
    # Adjacent literals are concatenated by the parser; the resulting text is
    # exactly the original single-line template, just wrapped for readability.
    template = Template(
        u' {* <table id="firstAuthor"> <tr> <td> <a>${author}</a>'
        u' 发表日期:{{ [stanzas].datetime }} </td> </tr> </table>'
        u' <div id="pContentDiv"> <div class="post">'
        u' {{ [stanzas].content }} </div> </div> *}'
        u' {* <table> <tr> <td> <a>${author}</a>'
        u' 回复日期:{{ [stanzas].datetime }} </td> </tr> </table>'
        u' <div class="post"> {{ [stanzas].content }} </div> *} '
    )
    pattern = scrapemark.compile(template.substitute(author=author))
    return scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']