#! /usr/bin/python2
# -*- encoding: utf-8 -*-

# On Unix, a CGI script must be marked executable (chmod +x) and must name
# its interpreter on the first line, e.g. "#! /usr/bin/python2".
# Neither step is needed on Windows.
# python 2.4.3
# Beautiful Soup (2.1.1)

import urllib
from BeautifulSoup import BeautifulSoup
#from bs4 import BeautifulSoup

# Download the page. The response is closed explicitly so the underlying
# socket is released even when read() raises. try/finally is used instead of
# a "with" block to stay compatible with the Python version noted above.
response = urllib.urlopen('http://www.naver.com')
try:
    html_source = response.read()
finally:
    response.close()

soup = BeautifulSoup(html_source, fromEncoding="utf-8")

# Print the page text once for every <a> element found. The text is the same
# on every iteration, so it is extracted a single time rather than once per
# link; nothing is extracted or printed when the page has no links.
# NOTE(review): printing the whole page for each link looks unintended -
# confirm whether the per-link href output below was the real goal.
anchors = soup.findAll('a')
if anchors:
    page_text = soup.get_text()
    for link in anchors:
        # print(link.get('href'))
        print(page_text)
################## Pre-Processing Of Text ############################### # data = soup.findAll(text=True) # [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])] # visible_text = soup.getText() # kill all script and style elements for script in soup(["script", "style","title","head","[document]"]): script.extract() # rip it out # get text text = soup.get_text() # break into lines and remove leading and trailing space on each lines = (line.strip() for line in text.splitlines()) # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # drop blank lines text = '\n'.join(chunk for chunk in chunks if chunk) visible_text=(text.encode('utf-8')) FewText=visible_text[2500:3000] for words in SearchWords :