def parse(self): path = [] for i in self.path.split('/'): path.append(filename(i)) path[-1] = filename(path[-1], ".htm") self.path = "/".join(path) htm = self.content soup = BeautifulSoup(htm) main = soup.find('div', {'id': 'content'}) title = main.find("h2").string try: print title print "\n" except: pass return { "keyword": soup.find('meta', {'name': "Keywords"})['content'], "title": title, "content": ''.join([ unicode(i) for i in main.find("div", { "id": "text" }).contents ]), "url": self.url }
def parse(self):
    """Parse an article page into a metadata dict.

    Side effect: rewrites self.path so each segment is sanitized, the last
    segment is truncated to its first space-delimited word and saved as
    a .txt file name, and the whole path is decoded as UTF-8.

    Returns keys: url, cn_title, en_title, keyword, author, magezine,
    time, kind, brief.
    """
    # Sanitize the save path and force a "<first word>.txt" final segment.
    segments = [filename(part) for part in self.path.split('/')]
    segments[-1] = filename(segments[-1].split(' ', 2)[0] + ".txt")
    self.path = "/".join(segments).decode('utf-8')

    htm = self.content
    soup = BeautifulSoup(htm)

    cn_title = soup.find('h1').find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_TitleLabel"}).string
    en_title = soup.find('h2').find("span").string
    if not en_title:
        en_title = ""

    from zspy.html2txt import html2txt
    brief = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_AbstractLabel"}).string
    brief = html2txt(brief) if brief else ""

    author_span = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_AuthorDataList"})
    if author_span:
        author = '\t'.join([a.string for a in author_span.findAll('a')])
    else:
        author = ''

    keyword_span = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_KeywordDatalist"})
    keyword_links = keyword_span.findAll('a') if keyword_span else []
    keyword = '\t'.join([link.string for link in keyword_links])

    magezine = soup.find(
        "a", {"id": "ctl00_MasterContentPlaceHolder_PeriodicalLink"}).string
    time = soup.find(
        "a", {"id": "ctl00_MasterContentPlaceHolder_IssueLink"}).string
    kind = ' >>> '.join([
        soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_topnavigation"}).string,
        soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_subnavigation"}).string,
    ])

    return {
        "url": self.url,
        "cn_title": cn_title,
        "en_title": en_title,
        "keyword": keyword,
        "author": author,
        "magezine": magezine,
        "time": time,
        "kind": kind,
        # Strip stray highlight markup that leaks into the abstract text.
        "brief": brief.replace('" class="highLight">', ''),
    }
def _local_saver(downer,meta): if exist(meta.link):return else: print "saveing",meta.link meta['summary']=img_saver(downer,meta.link,meta['summary_detail']['value'],"../../img/") template_render(meta,template_name) c=template_render(meta,template_name) file_saver(join(dirname,filename(meta.title)+'.htm'),c) db_saver(meta,c)
def _local_saver(downer, meta): if exist(meta.link): return else: print "saveing", meta.link meta['summary'] = img_saver(downer, meta.link, meta['summary_detail']['value'], "../../img/") template_render(meta, template_name) c = template_render(meta, template_name) file_saver(join(dirname, filename(meta.title) + '.htm'), c) db_saver(meta, c)
def parse(self): path = [] for i in self.path.split('/'): path.append(filename(i)) path[-1] = filename(path[-1].split(' ', 2)[0] + ".htm") self.path = "/".join(path) htm = self.content soup = BeautifulSoup(htm) title = soup.find('div', {'id': 'artibodyTitle'}).find("h1").string content = ''.join([ unicode(i) for i in soup.find("div", { "id": "artibody" }).contents ]) try: print title print "\n" except: pass return {"url": self.url, "title": title, "content": content}
def parse(self): path = [] for i in self.path.split("/"): path.append(filename(i)) path[-1] = filename(path[-1], ".htm") self.path = "/".join(path) htm = self.content soup = BeautifulSoup(htm) main = soup.find("div", {"id": "content"}) title = main.find("h2").string try: print title print "\n" except: pass return { "keyword": soup.find("meta", {"name": "Keywords"})["content"], "title": title, "content": "".join([unicode(i) for i in main.find("div", {"id": "text"}).contents]), "url": self.url, }
def parse(self): path=[] for i in self.path.split('/'): path.append(filename(i)) path[-1]=filename(path[-1].split(' ',2)[0]+".htm") self.path="/".join(path) htm=self.content soup=BeautifulSoup(htm) title=soup.find('div',{'id':'artibodyTitle'}).find("h1").string content=''.join([unicode(i) for i in soup.find("div",{"id":"artibody"}).contents]) try: print title print "\n" except: pass return { "url":self.url, "title":title, "content":content }
def site(index_url, parser, once=True): from zspy.filesys import makedirs, filename site_name = filename(index_url.split("://", 2)[1]) task = load_db(site_name) from down import Downer downer = Downer(task) if once and index_url in downer.history: print "Continue %s" % index_url else: print "New Start %s" % index_url downer.add(parser(index_url, "%s/%s" % (config.FETCH_TO, site_name))) return downer
def site(index_url,parser,once=True): from zspy.filesys import makedirs,filename site_name=filename(index_url.split("://",2)[1]) task=load_db(site_name) from down import Downer downer=Downer(task) if once and index_url in downer.history: print "Continue %s"%index_url else: print "New Start %s"%index_url downer.add( parser( index_url, "%s/%s"%( config.FETCH_TO, site_name ) ) ) return downer
def parse(self):
    """Parse an article page into a metadata dict.

    Side effect: rewrites self.path so each segment is sanitized, the last
    segment is truncated to its first space-delimited word and saved as
    a .txt file name, and the whole path is decoded as UTF-8.

    Returns keys: url, cn_title, en_title, keyword, author, magezine,
    time, kind, brief.
    """
    # Sanitize the save path and force a "<first word>.txt" final segment.
    segments = [filename(part) for part in self.path.split('/')]
    segments[-1] = filename(segments[-1].split(' ', 2)[0] + ".txt")
    self.path = "/".join(segments).decode('utf-8')

    htm = self.content
    soup = BeautifulSoup(htm)

    cn_title = soup.find('h1').find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_TitleLabel"}).string
    en_title = soup.find('h2').find("span").string
    if not en_title:
        en_title = ""

    from zspy.html2txt import html2txt
    brief = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_AbstractLabel"}).string
    brief = html2txt(brief) if brief else ""

    author_span = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_AuthorDataList"})
    if author_span:
        author = '\t'.join([a.string for a in author_span.findAll('a')])
    else:
        author = ''

    keyword_span = soup.find(
        "span", {"id": "ctl00_MasterContentPlaceHolder_KeywordDatalist"})
    keyword_links = keyword_span.findAll('a') if keyword_span else []
    keyword = '\t'.join([link.string for link in keyword_links])

    magezine = soup.find(
        "a", {"id": "ctl00_MasterContentPlaceHolder_PeriodicalLink"}).string
    time = soup.find(
        "a", {"id": "ctl00_MasterContentPlaceHolder_IssueLink"}).string
    kind = ' >>> '.join([
        soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_topnavigation"}).string,
        soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_subnavigation"}).string,
    ])

    return {
        "url": self.url,
        "cn_title": cn_title,
        "en_title": en_title,
        "keyword": keyword,
        "author": author,
        "magezine": magezine,
        "time": time,
        "kind": kind,
        # Strip stray highlight markup that leaks into the abstract text.
        "brief": brief.replace('" class="highLight">', ''),
    }