def index_allfiles():
    for x in file_names:
        file_data = ""
        with open(x) as file:
            file_data = file.read()
            # the with-block closes the file; the original also called file.close() here
        try:
            soup = BeautifulSoup(file_data)
        except UnicodeEncodeError:
            print "Soup error: " + x
        except TypeError:
            soup = BeautifulSoup(file_data.decode('utf-8', 'ignore'))
        if soup.title is None:
            page_title = " "
        else:
            page_title = soup.title.string
        # drop script and style elements before extracting the visible text
        for script in soup(["script", "style"]):
            script.extract()
        data = soup.getText(separator=u' ')
        try:
            writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
        except UnicodeDecodeError:
            print "Error in " + x
        except UnicodeEncodeError:
            print "Error in " + x
    writer.commit()
def get_chapters(chapter_url, fic, web_site):
    content_tag = "div"
    content_class = {"class": "list"}
    chapter_tag = "li"
    chapter_class = {}
    # build index
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    # get content
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        try:
            item_str = str(item)
        except:
            item_str = unicode(item).encode('utf-8')
        item_tag = BeautifulSoup(item_str)
        try:
            url = item_tag.a['href']
            if 'http' not in url and url[0] != '/':
                url = chapter_url + url
            chapter_title = item_tag.getText()
        except:
            continue
        ans.append((url, chapter_title))
    if not ans:
        fic.delete()
        return
    print '获取小说 %s 章节完毕' % fic.fiction_title  # "finished fetching chapters for novel %s"
    save_chapter(ans, fic, web_site)
    print '保存章节完毕'  # "chapters saved"
def index_data():
    for x in file_array:
        file_data = ""
        with open(x) as file:
            file_data = file.read()
            # the with-block closes the file; the original also called file.close() here
        try:
            soup = BeautifulSoup(file_data)
        except UnicodeEncodeError:
            pass
        except TypeError:
            soup = BeautifulSoup(file_data.decode('utf-8', 'ignore'))
        if soup.title is None:
            page_title = " "
        else:
            page_title = soup.title.string
        # drop script and style elements before extracting the visible text
        for skip in soup(["script", "style"]):
            skip.extract()
        data = soup.getText(separator=u' ')
        try:
            writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
            stop_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
            stemming_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
            stemming_stop_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
        except UnicodeDecodeError:
            pass
        except UnicodeEncodeError:
            pass
    writer.commit()
    stop_writer.commit()
    stemming_writer.commit()
    stemming_stop_writer.commit()
def sanitize(self, data):
    soup = BeautifulSoup(data)
    for element in soup.findAll(['script', 'style']):
        element.extract()
    for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comments.extract()
    return html2text.html2text(soup.getText())
def view_feed_view(request):
    feed_requested = request.matchdict.get('feed')
    rss = DBSession.query(Flux).filter_by(id=feed_requested).first()
    rss_id = rss.id
    url = rss.text
    count = rss.count
    title = rss.title
    url = rss.text
    count = 10
    user = check_user_logged(request)
    if user is False:
        return dict(feed="not logged", project='PyNews')
    content = []
    try:
        feeds = feedparser.parse(url)
        title = feeds['feed']['title']
        feed_id = 0
        for items in feeds["items"]:
            text = items['summary_detail']['value']
            soup = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES)
            result = ""
            if count > 0:
                line = {
                    "head": items['title_detail']['value'],
                    "date": "sans date",
                    "feed": soup.getText(),
                    "link": items['link'],
                    "feed_id": count,
                }
                content.append(line)
                count -= 1
            feed_id += 1
        feed = {
            'url': url,
            'title': title,
            'count': count,
            'content': content,
            'id': rss_id,
        }
        return dict(feed=feed, title=feeds['feed']['title'], project='PyNews')
    except:
        feed = {
            'url': url,
            'title': "",
            'count': "",
            'content': content,
            'id': rss_id,
        }
        return dict(feed="error", title="Error unable to fetch url: " + url, project='PyNews')
def LyricWikia(artist, title):
    proxy = urllib.request.getproxies()
    url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(
        artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15, proxies=proxy)
    # We got some badly formatted JSON data... So we need to fix stuff :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)
    if returned["lyrics"] != "Not found":
        # set the url to the url we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15, proxies=proxy)
        soup = BeautifulSoup(r.text, "lxml")
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        # with old BeautifulSoup the following is needed..? For recent versions, this isn't needed/doesn't work
        try:
            soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)), "lxml")
        [elem.extract() for elem in soup.findAll('script')]
        return (soup.getText())
    else:
        return ("error")
def get_chapter_630(chapter_url, fic, web_site):
    content_tag = "div"
    content_class = {"class": "zjbox"}
    chapter_tag = "dd"
    chapter_class = {}
    # build index
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    # get content
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        if isinstance(item, unicode):
            item_str = item.encode('utf-8')
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    if not ans:
        fic.delete()
    save_chapter(ans, fic, web_site)
def LyricWikia(artist, title):
    url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(
        artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15)
    # We got some badly formatted JSON data... So we need to fix stuff :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)
    if returned["lyrics"] != "Not found":
        # set the url to the url we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15)
        soup = BeautifulSoup(r.text)
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        # with old BeautifulSoup the following is needed..? For recent versions, this isn't needed/doesn't work
        try:
            soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)))
        [elem.extract() for elem in soup.findAll('script')]
        return (soup.getText())
    else:
        return ()
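# The LyricWikia variants in this collection share the same call shape. The sketch below is a
# hypothetical usage example only (the artist/title values are made up, and the lyrics.wikia.com
# endpoint these functions target has since been retired), assuming the same Python 2 environment.
lyrics = LyricWikia("Adele", "Hello")
if lyrics and lyrics != "error":
    print lyrics
else:
    print "No lyrics found"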
def init_file(file): dic = { 'id': file[0], 'name': file[1], 'icon_link': file[2], 'icon_path': file[3], 'source': file[4], 'source_link': file[5], 'rating': file[6], 'version': file[7], 'developer': file[8], 'sdk_support': file[9], 'category': file[10], 'screen_support': file[11], 'apk_size': file[12], 'language': file[13], 'publish_date': file[14], 'downloads': file[15], 'description': file[16], 'images': file[17], 'images_path': file[18], 'qr_link': file[19], 'download_link': file[20], 'last_crawl': file[21], 'vol_id': file[22], 'package_name': file[23], 'version_code': file[24], 'sig': file[25], 'min_sdk_version': file[26], 'is_break': file[27], 'platform': file[28], 'file_type': file[29], 'package_hash': file[30], } try: if dic.get('description'): soup = BeautifulSoup(dic.get('description').decode('utf8')) dic['description'] = soup.getText('\n') else: dic['desctiption'] = '' if dic['source'] == 'itunes.apple.com': if '游戏' in dic['sig']: if '网游' in dic['name'] or 'online' in dic['name']: dic['category'] = '网络游戏' elif '飞机' in dic['name'] or '射击' in dic['name'] or '飞行' in dic[ 'name']: dic['category'] = '射击游戏' else: dic['category'] = dic['sig'] else: if '主题' in dic['name'] or '壁纸' in dic['name']: dic['category'] = '主题美化' elif dic['category'] == '社交': dic['category'] = random.choice(['社交一', '社交二']) dic['category'] = _adapt_cate_str(dic.get('category')) except Exception as e: print dic['source_link'] print e return dic
def slug(content):
    res = u''
    soup = BeautifulSoup(content)
    res = soup.getText()
    if len(res) > 100:
        res = res[:100]
        res += u'...'
    return res
def _adapt_desc_str(desc_str):
    if not desc_str:
        return None
    soup = BeautifulSoup(desc_str)
    desc = soup.getText('\n')
    desc = __strip(desc)
    desc = __removeDuplicated(desc)
    desc = __cutTail(desc)
    return desc
def init_file(file): dic = {'id': file[0], 'name': file[1], 'icon_link': file[2], 'icon_path': file[3], 'source': file[4], 'source_link': file[5], 'rating': file[6], 'version': file[7], 'developer': file[8], 'sdk_support': file[9], 'category': file[10], 'screen_support': file[11], 'apk_size': file[12], 'language': file[13], 'publish_date': file[14], 'downloads': file[15], 'description': file[16], 'images': file[17], 'images_path': file[18], 'qr_link': file[19], 'download_link': file[20], 'last_crawl': file[21], 'vol_id': file[22], 'package_name': file[23], 'version_code': file[24], 'sig': file[25], 'min_sdk_version': file[26], 'is_break': file[27], 'platform': file[28], 'file_type': file[29], 'package_hash': file[30], } try: if dic.get('description'): soup = BeautifulSoup(dic.get('description').decode('utf8')) dic['description'] = soup.getText('\n') else: dic['desctiption'] = '' if dic['source'] == 'itunes.apple.com': if '游戏' in dic['sig']: if '网游' in dic['name'] or u'网游' in dic['description'] or 'online' in dic['name'] or u'online' in dic['description']: dic['category'] = '网络游戏' elif '飞机' in dic['name'] or '射击' in dic['name'] or '飞行' in dic['name'] or u'飞机' in dic['description'] or u'射击' in dic['description']or u'飞行' in dic['description']: dic['category'] = '射击游戏' else: dic['category'] = dic['sig'] else: if '主题' in dic['name'] or '壁纸' in dic['name'] or u'主题' in dic['description'] or u'壁纸' in dic['description']: dic['category'] = '主题美化' elif dic['category'] == '社交': dic['category'] = random.choice(['社交一', '社交二']) dic['category'] = _adapt_cate_str(dic.get('category')) except Exception as e: print dic['source_link'] print e return dic
def process(self, item, spider, matcher): if item['url']: item['url'] = item['url'].lower() if item['price'] != 'NA': item['price'] = utils.cleanNumberArray(item['price'], 'float') if item['brand']: temp = item['brand'][0] temp = matcher.dualMatch(temp) item['brand'] = temp if not item['brand']: logging.info(item['url']) raise DropItem("**** **** **** Missing brand in %s . Dropping" % item) if item['description']: temp = item['description'] bad = BeautifulSoup(temp[0]) item['description'] = bad.getText() if item['volume']: #volume can be string or list depening on item count on particular page temp = item['volume'] if isinstance(temp, list): if len(temp) > 1: temp = utils.getElementVolume(temp) item['volume'] = temp else: item['volume'] = utils.extractVolume(item['volume']) if item['category']: tempCat = item['category'] item['category'] =utils.cleanChars(tempCat[0]) if item['image']: temp = item['image'] temp = temp[0] item['image'] = temp if item['sku']: temp = item['sku'] temp = temp[0] if item['comments']: comment_html = item['comments'] try: item['comments'] = self.get_comments(comment_html, item['url']) except: exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s ' % (item['url'], exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback))) temp = price_package(item) item['price'] = temp print ' big pacakge: %s' % temp return item
def getKeyword(self):
    soup = BeautifulSoup(self.getContent())
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    rmlist = ["script", "style", "img"]
    for tag in soup.findAll():
        if tag.name.lower() in rmlist:
            tag.extract()
    return "".join(soup.getText())
def get_short_content(self, obj):
    from BeautifulSoup import BeautifulSoup
    # soup = BeautifulSoup(obj.content)
    soup = BeautifulSoup(obj.content, convertEntities=BeautifulSoup.HTML_ENTITIES)
    [s.extract() for s in soup('script')]
    data = ''.join(soup.findAll(text=True))[:200]
    data = soup.getText()
    data = re.sub(r'(\n)+|(\s)+', ' ', data)
    data = data.strip()
    return data
def play_byu_live(self):
    soup = BeautifulSoup(make_request(self.apiurl + 'GetLiveStreamUrl?context=Android%24US%24Release'))
    urlCode = soup.getText().strip('"')
    reqUrl = 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/Iyamk6YZTw8DxrC60h0fQipg3BfL/' + urlCode + '?device=android_3plus_sdk-hook&domain=www.ooyala.com&supportedFormats=mp4%2Cm3u8%2Cwv_hls%2Cwv_wvm2Cwv_mp4'
    data = json.loads(make_request(reqUrl))
    for stream in data['authorization_data'][urlCode]['streams']:
        url = b64decode(stream['url']['data'])
        item = xbmcgui.ListItem(path=url)
        try:
            xbmcplugin.setResolvedUrl(int(sys.argv[1]), True, item)
        except:
            continue
def get_one_cloud_hourly_rate(bot_hdr, website):
    soup = BeautifulSoup(website.read())
    # using beautiful soup to extract the raw text from the web page, see if the cloud we're looking at is
    # IaaS. if not, exit. else, do parsing voodoo to extract the base plan price and the name of the cloud
    #
    raw_txt = soup.getText()
    if raw_txt.find("Infrastructure as a Service") == -1 or raw_txt.find("Plan Price") == -1:
        return -1, -1
    cloud_name = website.geturl().split("/")[-1]
    cloud_hourly_rate = raw_txt.split("Plan Price")[1].split(" ")[0]
    cloud_hourly_rate = float(cloud_hourly_rate.encode("ascii").strip("$"))
    return cloud_name, cloud_hourly_rate
def _sanitize(self, html):
    """
    Clean html by removing tags, comments etc
    Returns:
        str
    """
    blacklist = ['style', 'script', '[document]', 'head', 'title', 'meta']
    soup = BeautifulSoup(html)
    for s in soup(blacklist):
        s.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup.getText()
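# Many of these snippets repeat the same extract-then-getText idiom. The standalone sketch below is a
# minimal illustration of that pattern, assuming the legacy BeautifulSoup 3 API (BeautifulSoup, Comment)
# that the surrounding functions import; it is not taken from any one of them.
from BeautifulSoup import BeautifulSoup, Comment

def html_to_text(raw_html):
    # Parse, drop non-visible elements and HTML comments, then flatten to plain text.
    soup = BeautifulSoup(raw_html)
    for tag in soup.findAll(['script', 'style']):
        tag.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup.getText(separator=u' ')

print html_to_text('<p>Hello <script>alert(1)</script><b>world</b></p>')  # visible text only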
def getReleases(page):
    latestPart = BeautifulSoup(page).findAll('div', {'class': 'latest'})
    episodes = BeautifulSoup(str(latestPart)).findAll('div', {'class': 'episode'})
    list = []
    for episode in episodes:
        resolutionsblock = BeautifulSoup(str(episode)).findAll('div', {'class': 'resolutions-block'})
        for res in resolutionsblock:
            resolutionblock = BeautifulSoup(str(res)).findAll('div', {'class': 'linkful resolution-block'})
            allA = BeautifulSoup(str(BeautifulSoup(str(resolutionblock[len(resolutionblock) - 1])).findAll('a')))
            for a in allA:
                dLinks = BeautifulSoup(str(a))
                if (dLinks.getText() == 'Magnet'):
                    list.append(dLinks.find('a').get('href'))
    return list
def getLineItems(self, html):
    """Detects HTML list markups and returns a list of plaintext lines"""
    soup = BeautifulSoup(html)
    text = soup.getText("\n")  # will need to be updated for bs4
    if soup.findAll("ol"):
        self.markup = "ol"
    elif soup.findAll("ul"):
        self.markup = "ul"
    else:
        self.markup = "div"
    # remove empty lines:
    lines = re.sub(r"^( )+$", "", text, flags=re.MULTILINE).splitlines()
    items = [line for line in lines if line.strip() != '']
    return items, None
def getStems(html, withUnicode=False, asString=False):
    stemmer = utils.getStems
    if withUnicode:
        stemmer = utils.getUnicodeStems
    parsed = BeautifulSoup(html)
    text = parsed.getText(" ")
    goodStems = []
    skipWords = ["is", "an", "the", "and", "but", "a", "i"]
    for stem in stemmer(text.lower().split(), skipWords=skipWords):
        if len(stem) == 1:
            continue
        goodStems.append(stem)
    result = [x for x in goodStems if x]
    if asString:
        result = ",".join(result)
    return result
def url2words(url):
    try:
        html = urllib2.urlopen(url).read()
    except HTTPError:
        html = ""
    # plain_text = nltk.clean_html(html).replace('\n','')  # nltk.clean_html is not implemented; use BeautifulSoup instead
    soup = BeautifulSoup(html)
    # kill script and style tags
    for script in soup(["script", "style"]):
        script.extract()
    plain_text = soup.getText()  # plain_text doesn't come out quite right...
    # print plain_text
    words = extract_words(plain_text)
    return words
def process(self, item, spider, matcher): if item['url']: item['url'] = item['url'].lower() if item['price']!= 'NA': temp = item['price'] clean = cleanNumberArray(temp, 'float') item['price'] = clean if item['description']: item['description'] = item['description'][0] soup = BeautifulSoup(item['description']) out = soup.getText() item['description'] = out if item['name']: #temp = item['brand'][0] #temp = cleanChars(temp) brand = matcher.dualMatch(item['name']) item['brand'] = brand if not item['brand']: raise DropItem("******* Missing BRAND in %s . Dropping" % item['name']) #if item['description']: #temp = item['description'] #temp = temp[0] #temp = cleanHtmlTags(temp) #item['description'] = temp if item['category']: tempCat = item['category'] item['category'] = tempCat[0] item['category'] = '' if item['image']: temp = item['image'] temp = temp[0] item['image'] = temp if item['volume']: temp = item['name'] item['volume'] = multiStateVolume(temp) if item['sku']: temp = item['sku'] temp = temp[0] item['sku'] = '' return item
def __douban(self):
    name = '豆瓣·事情'  # "Douban Things"
    req = urllib2.Request(url=self.url, headers=self.__headers)
    res = urllib2.urlopen(req).read()
    subject = re.search(r'<title>([\w\W]*)</title>', res).groups()[0]
    subject = subject.strip()
    res = res.replace('<img src="', '[img]').replace('" alt=', '[/img]<span id=')
    res = res.replace('<br>', '\n')
    soup = BeautifulSoup(res)
    soup = soup.find('div', {'class': 'note', 'id': 'link-report'})
    content = soup.getText()
    content = content.replace('[/img]', '[/img]\n').replace('[img]', '\n[img]')
    h = HTMLParser.HTMLParser()
    content = h.unescape(content).encode('utf8')
    content = self.__replaceImgs(content)
    message = self.__message(content, 'http://thing.douban.com/', name)
    return subject + ' | ' + name, message
def start_refresh():
    while True:
        apks = get_apks()
        if not apks:
            return
        report_list = []
        for apk in apks:
            try:
                if not apk[1]:
                    desc = ''
                else:
                    soup = BeautifulSoup(apk[1])
                    desc = soup.getText('\n')
                report_list.append((apk[0], desc))
            except Exception as e:
                print apk  # the original printed the undefined name `file` here
                print e
        report_status(report_list)
def getLineItems(self, html):
    """Detects HTML list markups and returns a list of plaintext lines"""
    if ANKI20:
        # do not supply parser to avoid AttributeError
        soup = BeautifulSoup(html)
    else:
        soup = BeautifulSoup(html, "html.parser")
    text = soup.getText("\n")  # will need to be updated for bs4
    if soup.findAll("ol"):
        self.markup = "ol"
    elif soup.findAll("ul"):
        self.markup = "ul"
    else:
        self.markup = "div"
    # remove empty lines:
    lines = re.sub(r"^( )+$", "", text, flags=re.MULTILINE).splitlines()
    items = [line for line in lines if line.strip() != ""]
    return items, None
def main():
    if 2 > len(sys.argv):
        print 'python {0:s} <ted_url>'.format(__file__)
        return
    url = sys.argv[1]
    print url
    soup = BeautifulSoup(urlopen(url).read())
    attr_map = soup.find(id="share_and_save").attrMap
    langs = ['en', 'zh-cn']
    for lang in langs:
        subtitle_url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/html' % (attr_map['data-id'], lang)
        beautiful_soup = BeautifulSoup(urlopen(subtitle_url).read())
        subtitle = beautiful_soup.getText("\n").encode('utf-8')
        with open("%s-%s.txt" % ((attr_map['data-title']), lang), 'w') as f:
            f.write(subtitle)
def has_badword(content, extra=u''):
    """
    function: check whether the HTML contains any of the flagged keywords.
    params:
        content - the content to check
        extra - additional content to check
    return: True or False
    """
    keywords = [u'土巴兔', u'小兔', u'酷家乐', u'小酷', u'小乐', u'乐乐', u'kujiale', 'to8to',
                u'优居客', u'小优', u'youjuke', u'x团', u'X团']
    if (type(content) == str) or (type(content) == unicode):
        content = BeautifulSoup(content)
    html_text = content.getText()
    text = u'%s,%s' % (html_text, extra)
    for kw in keywords:
        if text.find(kw) > -1:
            return True
    return False
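# A quick, hypothetical usage sketch for has_badword; the HTML inputs below are made up and assume the
# same Python 2 / BeautifulSoup environment as the function itself.
print has_badword(u'<p>欢迎访问 <b>to8to</b> 装修平台</p>')   # True: "to8to" is in the keyword list
print has_badword(u'<p>hello</p>', extra=u'酷家乐')           # True: keyword appears in the extra text
print has_badword(u'<p>hello</p>')                            # False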
def __init__(self, hedef):
    from BeautifulSoup import BeautifulSoup
    import urllib
    text = ""
    url = hedef
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    if soup is not None:
        for script in soup(["script", "style"]):
            script.extract()  # rip it out
        text = soup.getText()
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
    Derlem.__init__(self, text.encode("utf8").splitlines(True))
def clean(document, document_type):
    if document_type == 0:
        soup = BeautifulSoup(document)
        document = soup.getText()
    document = document.replace(' ', ' ')
    document = document.replace('£', ' ')
    document = document.replace('þ', ' x ')
    document = document.replace('ý', ' x ')
    document = document.replace('x', ' x ')
    document = document.replace('☒', ' x ')
    document = document.replace(' ', ' ')
    document = document.replace(';', ' ')
    document = document.replace('\t', ' ')
    document = document.replace('\r\n', ' ')
    document = document.replace('\n', ' ')
    document = document.replace('\r', ' ')
    document = re.sub(' +', ' ', document)
    # document = document.lower()
    return document
def prepare_for_markdown(string):
    soup = BeautifulSoup(string)
    for tag in soup.findAll():
        if tag.name == 'code':
            if tag.string is not None:
                tag.string = '```' + tag.string + '```'
        elif tag.name == 'a':
            if tag['href'] is not None:
                tag.replaceWith(tag['href'])
        elif tag.name == 'strong':
            if tag.string is not None:
                tag.string = '*' + tag.string + '*'
    htmlParser = HTMLParser()
    soup_text = soup.getText('\n')
    soup_text = soup_text.replace('*', '\*')
    soup_text = soup_text.replace('_', '\_')
    plain_text = htmlParser.unescape(soup_text)
    plain_text = plain_text.replace('\n\n', '\n')
    return plain_text
def process(self, item, spider, matcher):
    if item['brand']:
        temp = item['brand'][0]
        temp = matcher.dualMatch(temp)
        item['brand'] = temp
    if not item['brand']:
        logging.info(item['url'])
        raise DropItem("**** **** **** Missing brand in %s . Dropping" % item)
    if item['price'] != 'NA':
        temp = item['price']
        item['price'] = utils.cleanNumberArray(temp, 'float')
    if item['description']:
        temp = item['description']
        soup = BeautifulSoup(temp[0])
        temp = soup.getText()
        if re.search(r'<xml>', temp) is None:
            item['description'] = temp
        else:
            print 'too long text going for else'
            a = soup.p.findChild('span')
            if a:
                a = a.getText()
                item['description'] = a
            else:
                print ' no description extracted'
    if item['volume']:
        item['volume'] = utils.extractVolume(item['name'])
    if item['comments']:
        comment_html = item['comments']
        try:
            item['comments'] = self.get_comments(comment_html, item['url'])
        except:
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s '
                         % (item['url'], exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback)))
    return item
def get_chapter_aoye(chapter_url, fic, web_site):
    """Fetch all chapter information for a given novel on xs8."""
    content_tag = "div"
    content_class = {"id": "detaillist"}
    chapter_tag = "li"
    chapter_class = {}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
def save(self, domain_override=None,
         subject_template_name='registration/password_reset_subject.txt',
         email_template_name='registration/password_reset_email.html',
         use_https=False, token_generator=default_token_generator,
         from_email=None, request=None, html_email_template_name=None,
         extra_email_context={}, user=None):
    """
    Generates a one-use only link for resetting password and sends to the user.
    """
    from django.core.mail import send_mail
    email = self.cleaned_data["email"]
    if not domain_override:
        current_site = get_current_site(request)
        site_name = current_site.name
        domain = current_site.domain
    else:
        site_name = domain = domain_override
    c = {
        'email': user.email,
        'domain': domain,
        'site_name': site_name,
        'uid': urlsafe_base64_encode(force_bytes(user.pk)),
        'user': user,
        'token': token_generator.make_token(user),
        'protocol': 'https' if use_https else 'http',
        'extra': extra_email_context,
    }
    subject = loader.render_to_string(subject_template_name, c)
    # Email subject *must not* contain newlines
    subject = ''.join(subject.splitlines())
    html_email = loader.render_to_string(email_template_name, c)
    soup = BeautifulSoup(html_email)
    email = soup.getText()
    send_mail(subject, email, from_email, [user.email], html_message=html_email, fail_silently=False)
def get_chapter_longtengzw(chapter_url, fic, web_site):
    """Fetch all chapter information for a given novel on the Longteng Chinese Net (longtengzw)."""
    content_tag = "div"
    content_class = {"class": "readerListShow"}
    chapter_tag = "td"
    chapter_class = {"class": "ccss"}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            url = chapter_url + url
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
def LyricWikia(artist, title):
    url = 'http://lyrics.wikia.com/api.php?artist={artist}&song={title}&fmt=json'.format(artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15)
    # We got some badly formatted JSON data... So we need to fix stuff :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)
    if returned["lyrics"] != "Not found":
        # set the url to the url we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15)
        # curl.setopt(curl.URL, str(curl_return["url"]))
        # curl.perform()
        soup = BeautifulSoup(r.text)
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)))
        [elem.extract() for elem in soup.findAll('script')]
        return soup.getText()
    else:
        return
def main():
    # read dataset file
    f = 'dataset.csv'
    df = file_read(f)
    # check web sites for updates
    updateSites = []
    reg = []

    def _toMD5(text):
        return hashlib.md5(text).hexdigest()

    for i, row in df.iterrows():
        try:
            url = row[0]
            hash_ago = row[1]
        except:
            hash_ago = 0
        print url
        # read html
        x = AccessPage(url)
        soup = BeautifulSoup(x.html)
        soup = soup.body
        html = soup.getText().encode('ascii', errors='backslashreplace')
        hash_now = _toMD5(html)
        if hash_now != hash_ago and hash_ago != 0:
            updateSites.append(url)
        reg.append([url, hash_now])
    df = pd.DataFrame(reg)
    # record file
    file_write(f, df)
    # view pages
    for i in updateSites:
        print i
        commands.getoutput('open ' + i)
def importCity(cityname, url, package): if cityname == 'hamburg': # Only take 'open data' if package['type'] != 'dataset' or 'forward-reference' in package[ 'title']: return {} #There is a version of CKAN that can output private datasets! but DKAN is using this field for different purposes if package['private'] and cityname not in dkanCities: return {} resources = [] formats = set() files = [] # Key for the file link in the resource urlkeys = ['url'] formatkey = 'format' if ('resources' in package): resources = package['resources'] for file in resources: for urlkey in urlkeys: if (file[urlkey] not in [None, '']): if '://' not in file[urlkey]: files.append(url + file[urlkey]) else: files.append(file[urlkey]) break if formatkey in file and file[formatkey] not in [None, '']: format = file[formatkey] formats.add(format.upper()) row = {} row[u'Stadt'] = cityname row[u'Dateibezeichnung'] = package['title'] if 'name' in package: row[u'URL PARENT'] = url + '/dataset/' + package['name'] elif 'url' in package: row[u'URL PARENT'] = package['url'] else: row[u'URL PARENT'] = '' if cityname in v3cities: licensekey = 'license_id' vstellekey = 'author' catskey = 'groups' catssubkey = 'title' if cityname == 'berlin': catssubkey = 'name' elif cityname == 'muenchen': licensekey = 'license_id' vstellekey = 'maintainer' catskey = 'groups' catssubkey = 'title' elif cityname in dkanCities: licensekey = 'license_title' vstellekey = 'maintainer' catskey = 'tags' catssubkey = 'name' # Generate URL for the catalog page if 'notes' in package and package['notes'] != None: row[u'Beschreibung'] = package['notes'] if cityname == 'koeln': soup = BeautifulSoup(row[u'Beschreibung']) row[u'Beschreibung'] = soup.getText('\n') else: row[u'Beschreibung'] = '' row[u'Zeitlicher Bezug'] = '' if licensekey in package and package[licensekey] != None: row[u'Lizenz'] = package[licensekey] # if not already short, try to convert if metautils.isopen(row[u'Lizenz']) is 'Unbekannt': row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz']) else: row[u'Lizenz'] = 'nicht bekannt' if vstellekey in package and package[vstellekey] != None: row[u'Veröffentlichende Stelle'] = package[vstellekey] else: row[u'Veröffentlichende Stelle'] = '' if 'extras' in package: print 'WARNING: No author/maintainer/publisher, checking extras' for extra in package['extras']: if extra['key'] == 'contacts': print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra[ 'value'] cat_groups = metautils.setofvaluesasarray(package[catskey], catssubkey) if cityname != 'berlin': odm_cats = metautils.matchCategories(cat_groups) else: for group in cat_groups: odm_cats = berlin_to_odm(group) row[u'categories'] = odm_cats row[u'Format'] = formats row[u'files'] = files row['metadata'] = package row[u'original_metadata'] = { u'metadata_created': package['metadata_created'], u'metadata_modified': package['metadata_modified'] } return row
def cleanup(self, msg):
    soup = BeautifulSoup(msg)
    return soup.getText().encode('utf-8').strip()
print "Crawling Pages, please wait..." with tqdm(total=retrieveLimit) as progress: for page in urlList: if docIDCounter > retrieveLimit: break #quits crawling if retrieval limit is reached try: #---------- Page Crawler (gets words and links from each page --------- soup = "" browse.open(page) if page.endswith(".txt"): soup = browse.response().read() else: soup = BeautifulSoup(browse.response().read( )) #if can't parse, assumed to be binary file or 404 soup = soup.getText() hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest() if hashTest not in duplicateDetect: duplicateDetect.append(hashTest) wordsInPage = soup.split() if not page.endswith(".txt"): for link in browse.links(): tempURL = urlparse.urljoin(link.base_url, link.url) #BELOW: gets rid of duplicate urls resulting from index.html/index.htm if tempURL.endswith("index.html"): tempURL = tempURL.replace("index.html", "") elif tempURL.endswith("index.htm"): tempURL = tempURL.replace("index.htm", "") if tempURL not in urlList:
def get_text(self, html):
    s = BeautifulSoup(html)
    return s.getText()
def buildpattern(html, debug): doc = {} docwords = {} structure = [] fulltext = [] title = '' attributes = {} x = [] y = [] # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"] html = re.sub(r'<\/script>', "</script>\n", html) html = re.sub(r'<meta ', "\n<meta ", html) html = re.sub(r'<\/title>', "</title>\n", html) html = re.sub(r'<\/div>', "</div>\n", html) html = re.sub(r'<\/p>', "</p>\n", html) html = re.sub(r'<\/li>', "</li>\n", html) html = re.sub(r'<\/style>', "</style>\n", html) html = re.sub(r'<\/dd>', "</dd>\n", html) htmlstrings = html.splitlines() if htmlstrings: lineID = 0 for line in htmlstrings: lenstr = len(line) words = len(line.split()) comas = len(line.split(",")) dots = len(line.split(".")) equal = len(line.split("=")) soup = BeautifulSoup(line) if words: htmltags = [] visiblecontent = soup.getText() for child in soup.recursiveChildGenerator(): name = getattr(child, "name", None) if name is not None: htmltags.append(name) elif not child.isspace(): # leaf node, don't print spaces donothing = 1 matrix = {} visiblewords = len(visiblecontent.split()) matrix['words'] = str(words) matrix['visiblewords'] = 0 matrix['comas'] = comas matrix['dots'] = dots matrix['equal'] = equal matrix['html'] = line matrix['tags'] = str(visiblecontent) code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal) matrix['code'] = code if visiblewords > 10: matrix['visiblewords'] = str(visiblewords) doc[lineID] = matrix lineID = lineID + 1 if debug: sorted(doc, key=int) #for lineID in doc: for lineID,item in doc.items(): #lineID = 1003 if lineID: code = item['code'] line = str(item['html']) words = item['words'] words = item['visiblewords'] tags = item['tags'] x.append(lineID) y.append(int(words)) #print 'W' + str(words) + ' ' + line + ' ' + code if words: print str(lineID) + ',' + code + ',' + line + '\t' + tags return (x,y,doc)
def init_file(file): dic = { 'id': file[0], 'name': file[1], 'icon_link': file[2], 'icon_path': file[3], 'source': file[4], 'source_link': file[5], 'rating': file[6], 'version': file[7], 'developer': file[8], 'sdk_support': file[9], 'category': file[10], 'screen_support': file[11], 'apk_size': file[12], 'language': file[13], 'publish_date': file[14], 'downloads': file[15], 'description': file[16], 'images': file[17], 'images_path': file[18], 'qr_link': file[19], 'download_link': file[20], 'last_crawl': file[21], 'vol_id': file[22], 'package_name': file[23], 'version_code': file[24], 'sig': file[25], 'min_sdk_version': file[26], 'is_break': file[27], 'platform': file[28], 'file_type': file[29], 'package_hash': file[30], } try: if dic['source'] == 'nduoa.com': dic['downloads'] = dic['downloads'].replace(u',', '') dic['apk_size'] = get_apk_size(dic.get('apk_size')) elif dic['source'] == 'hiapk.com': dic['apk_size'] = get_apk_size(dic.get('apk_size')) elif dic['source'] == 'goapk.com': if dic.get('downloads') and u'\u5927\u5c0f\uff1a' in dic[ 'downloads'].decode('utf8'): dic['apk_size'] = dic['downloads'].decode('utf8') dic['downloads'] = 0 if dic.get('category') and u'\u5927\u5c0f\uff1a' in dic[ 'category'].decode('utf8'): dic['apk_size'] = dic['category'].decode('utf8') dic['category'] = '' if dic.get('category' ) and u'\u7c7b\u522b' in dic['category'].decode('utf8'): dic['category'] = dic['category'].split(':')[1] dic['version'] = get_version(dic['version']) dic['apk_size'] = get_apk_size(dic.get('apk_size')) elif dic['source'] == 'appchina.com': dic['apk_size'] = get_apk_size(dic.get('apk_size')) elif dic['source'] == 'mumayi.com': if dic.get('apk_size' ) and u'\u672a\u77e5' in dic['apk_size'].decode('utf8'): dic['apk_size'] = 0 dic['apk_size'] = get_apk_size(dic.get('apk_size')) elif dic['source'] == 'as.baidu.com': dic['developer'] = None if dic.get('description'): soup = BeautifulSoup(dic.get('description').decode('utf8')) dic['description'] = soup.getText('\n') else: dic['desctiption'] = '' dic['rating'] = get_raing(dic.get('rating')) dic['category'] = _adapt_cate_str(dic.get('category')) except Exception as e: print dic['source_link'] print e return dic
def unrenderhtml(html):
    soup = BeautifulSoup(html)
    return soup.getText('\n')
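# A small, hypothetical usage example for unrenderhtml; getText('\n') joins each node's text with newlines.
print unrenderhtml('<ul><li>first</li><li>second</li></ul>')
# first
# second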
def buildpattern(html, debug): doc = {} docwords = {} structure = [] fulltext = [] title = '' attributes = {} x = [] y = [] # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"] html = re.sub(r'<script', "\n<script", html) html = re.sub(r'<style', "\n<style", html) html = re.sub(r'<\/script>', "\n</script>\n", html) html = re.sub(r'<meta ', "\n<meta ", html) html = re.sub(r'<\/title>', "</title>\n", html) html = re.sub(r'<\/div>', "</div>\n", html) html = re.sub(r'<\/p>', "</p>\n", html) html = re.sub(r'<\/li>', "</li>\n", html) html = re.sub(r'<\/style>', "\n</style>\n", html) html = re.sub(r'<\/dd>', "</dd>\n", html) htmlstrings = html.splitlines() if htmlstrings: lineID = 0 active = 1 for line in htmlstrings: lenstr = len(line) words = len(line.split()) comas = len(line.split(",")) dots = len(line.split(".")) equal = len(line.split("=")) soup = BeautifulSoup(line) if words: htmltags = [] visiblecontent = soup.getText() for child in soup.recursiveChildGenerator(): name = getattr(child, "name", None) if name is not None: htmltags.append(name) elif not child.isspace(): # leaf node, don't print spaces donothing = 1 matrix = {} visiblewords = len(visiblecontent.split()) openignore = re.match(r'<style|<script', line) closeignore = re.match(r'<\/style|<\/script', line) urlstatus = re.findall(r'<a', line) timeflag = re.findall('([0-9]+:[0-9]+)', line) if openignore: active = 0 matrix['words'] = str(words) matrix['visiblewords'] = 0 matrix['comas'] = comas matrix['dots'] = dots matrix['equal'] = equal matrix['html'] = line matrix['status'] = 'active' if timeflag: matrix['timeflag'] = str(timeflag) else: matrix['timeflag'] = '' matrix['tags'] = str(visiblecontent) if urlstatus: matrix['urlstatus'] = 1 else: matrix['urlstatus'] = 0 code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal) + ',U' + str(matrix['urlstatus']) + 'T' + matrix['timeflag'] matrix['code'] = code if visiblewords > 0: matrix['visiblewords'] = str(visiblewords) if active == 0: matrix['visiblewords'] = 0 matrix['status'] = 'ignored' if visiblewords <= 1: matrix['status'] = 'ignored' doc[lineID] = matrix if closeignore: active = 1 lineID = lineID + 1 if debug: sorted(doc, key=int) #for lineID in doc: for lineID,item in doc.items(): line = str(item['html']) openignore = re.match(r'<style|<script', line) closeignore = re.match(r'<\/style|<\/script', line) #lineID = 1003 if lineID: code = item['code'] words = item['words'] words = item['visiblewords'] tags = item['tags'] status = item['status'] x.append(lineID) y.append(int(words)) if status == 'active': outstr = str(lineID) + ',' + code + ',' + line + '\t' + tags #print outstr + '\n' f.write(outstr + '\n') # python will convert \n to os.linesep return (x,y,doc)
# Use print to debug
# print filename
# writes results in .csv file
# toread contains the exact file path
towrite = filename + "\\" + file + ".csv"
toread = targetpath + "\\" + file + "\\html" + "\\" + file + ".htm"
# Here we use toread to read the htm file in the given path
# I am using BeautifulSoup to read htm files because we can work with the html tags, which makes it quite easy to write a parser
with codecs.open(toread, encoding='utf-8', errors='replace') as f:
    t = f.read().encode('utf-8')
# x can access data in specific tags because we have used BeautifulSoup
x = BeautifulSoup(t)
# Before writing a parser it is necessary to study a few of the documents you want to parse.
# Then you will see the common points in each document, and by using those common points you won't lose any data.
# Here "Page" is treated as a checkpoint, so we look for it in the text.
checkpoint = x.getText().find("Page")
# If it is found, take the if branch
if checkpoint != -1:
    # Split on the term "Page"; split returns a list and index 1 holds everything after the first occurrence
    x = x.getText().split("Page", 1)[1]
    # endpoint is simply the last index of the text
    endpoint = len(x) - 1
    # the startpoint is where '----------' is found
    startpoint = x.find('----------')
else:
    # If there is no term "Page" in the document being parsed, directly find '----------' and the end of the text
    x = x.getText().encode('utf-8')
    endpoint = len(x) - 1
    startpoint = x.find('----------')
print "Found " + str(len(links)) + " links!" final_links = [] print "Searching links for matches..." for i in range(len(links)): print str(i) link = links[i] if (not re.search("\.html$", link) and not re.search("/[^\.]*$", link, re.IGNORECASE)): continue # Try three times to get the text; after three failures, move on f = None for i in range(10): try: f = urllib2.urlopen(link, timeout=1) except urllib2.URLError: pass except socket.timeout: pass if f == None: continue soup = BeautifulSoup(f) current_text = soup.getText() if (re.search(regex, current_text)): final_links.append(link) for final_link in final_links: print final_link
def cleantags(html):
    soup = BeautifulSoup(html)
    return soup.getText(separator=u' ')  # (soup.text)
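# For comparison with unrenderhtml above, cleantags joins node text with single spaces. A hypothetical example:
print cleantags('<p>Hello<b>world</b></p>')   # u'Hello world'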
def process(self, item,spider,matcher): if item['url']: item['url'] = item['url'].lower() if item['sku']: item['sku'] = utils.cleanSkuArray(item['sku'], 'string') if item['price'] != 'NA': temp = item['price'] if len(temp) > 1: volarray = [] parray = [] for item in temp: if re.search(r'true', item): item = item.replace("'disponivel': true,","") dic = ast.literal_eval(item) price = dic['preco_promo'] volume = dic['descricao'] parray.append(price) volarray.append(volume) else: item = item.replace("'disponivel': false,","") dic = ast.literal_eval(item) price = dic['preco_promo'] volume = dic['descricao'] parray.append(price) volarray.append(volume) item['price'] = utils.cleanNumberArray(parray, 'float') item['volume'] = volarray print 'BELEZA MULTI PASS' print item['price'] print item['volume'] else: item['price'] =utils.cleanNumberArray(item['price'], 'float') if item['description']: temp = item['description'] soup = BeautifulSoup(temp[0]) text = soup.getText() item['description'] = text if item['brand']: tempBrand = item['brand'] tempBrand = tempBrand[0] tempBrand = utils.extractBrand(tempBrand) tempBrand = utils.cleanChars(tempBrand) item['brand'] = tempBrand if item['volume']: #first check if volume array exists(if not getelement returns empty and see if the name contains volume information) print 'PIPELINE INPUT volume is %s' % item['volume'] temp = item['volume'] if isinstance(temp, list): length = len(temp) print "multi value volume %s" % temp item['volume'] = utils.getElementVolume(temp) else: print 'NON multi volume field %s' % item['volume'] if item['category']: tempCat = item['category'] item['category'] = tempCat[0] if item['image']: temp = item['image'] temp = temp[0] item['image'] = temp if item['comments']: comment_html = item['comments'] try: item['comments'] = self.get_comments(comment_html, item['url']) except: exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s ' % (item['url'], exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback))) return item
def get_description(self, obj):
    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(obj.content)
    [s.extract() for s in soup('script')]
    return soup.getText()[:200]