def careermatch(username, password):
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)')]
    browser.set_handle_robots(False)
    browser.open("http://training.careermatch-uk.com/eco_login.php")
    browser.select_form(nr=0)
    browser.form.set_all_readonly(False)
    browser["txtUserId"] = username
    browser["txtPassword"] = password
    browser.submit()
    browser.open("http://training.careermatch-uk.com/eco_learn.php?filter=enrolled")
    html1 = browser.response().read()
    for course in re.findall(r'<dt>(.*?)</dt>', html1):
        if "href" in course:
            soup1 = BeautifulSoup(course, "html.parser")
            tag1 = soup1.find("a")
            url1 = "http://training.careermatch-uk.com/" + tag1["href"]
            name1 = soup1.getText()
            print name1
            browser.open(url1)
            html2 = browser.response().read()
            for topic in re.findall(r'<dt>(.*?)</dt>', html2):
                if "href" in topic:
                    soup2 = BeautifulSoup(topic, "html.parser")
                    tag2 = soup2.find("a")
                    url2 = "http://training.careermatch-uk.com/" + tag2["href"]
                    name2 = soup2.getText()
                    print "\t" + name2
                    browser.open(url2)
                    html3 = browser.response().read()
                    pattern = re.compile(r'\s+')
                    html3 = re.sub(pattern, ' ', html3)
                    for mp4 in re.findall(r'{ file: "(.*?)" , label: "SD"}', html3):
                        url3 = "http" + mp4.split('http')[-1]
                        name3 = url3.split('/')[-1]
                        print "\t\t" + name3
                        if not os.path.isfile(name3):
                            time.sleep(1)
                            u = urllib2.urlopen(url3)
                            f = open(name3, 'wb')
                            meta = u.info()
                            file_size = int(meta.getheaders("Content-Length")[0])
                            print "\t\t\tDownloading: %s Size: %s MB" % (name3, file_size / (1024 * 1024))
                            print "\t\t\t",
                            file_size_dl = 0
                            block_sz = 8192
                            status = ""
                            time.sleep(1)
                            while True:
                                buffer = u.read(block_sz)
                                if not buffer:
                                    break
                                file_size_dl += len(buffer)
                                f.write(buffer)
                                if r"[%3.0f%%]" % (file_size_dl * 50. / file_size) != status:
                                    status = r"[%3.0f%%]" % (file_size_dl * 50. / file_size)
                                    print ".",
                            f.close()
                            print "\n",
def grabing(data, ipPort):
    soup = BeautifulSoup(data)
    a_list = soup.find_all("a", attrs={"itemprop": "url"})
    a_list.pop(0)
    content_list = []
    f = open('D:\\novel.txt', 'w')
    for atag in a_list:
        url = re.search(r'http://read.qidian.com/BookReader/4fknnsotQvLZ6ZDT--NUMw2,?.*\.aspx', str(atag))
        txt = re.search(r'<span itemprop="headline">.*</span>', str(atag))
        soup = BeautifulSoup(txt.group())
        file_name = soup.getText()
        print(soup.getText())
        f.write(soup.getText() + "\n")
        d = agentGo(url.group(), ipPort)
        soup = BeautifulSoup(d)
        div_content = soup.find("div", id="chaptercontent")
        txt_url = re.search(r'http://files.qidian.com/Author6/?.*\.txt', str(div_content), re.DOTALL)
        # print(txt_url.group())
        urllib.request.urlretrieve(txt_url.group(), "F:\\novel\\" + file_name + ".txt")  # save each chapter to a txt file
        # oper = urllib.request.urlopen(getReq(txt_url.group()))
        # data = oper.read()
        # content = str(data)
        # content_list.append(content)
        # print(content)
        # f.write(content + "\n")
    f.close()
def get_content(soup):
    enpcontent = re.findall('<!--enpcontent-->(.*)<!--enpcontent-->', str(soup), re.DOTALL)
    if enpcontent:
        enpcontent = enpcontent[0]
        contentsoup = BS(enpcontent)
        text = contentsoup.getText().encode('utf-8')
        return text
def stackoverflowESLoader(folder):
    """Loads into ES all Stackoverflow data files located in a folder and subfolders

    Inputs:
        folder: directory where the data is located. Must contain a subfolder for each
            stackexchange community, with a Posts.xml file with the community posts
    """
    tagdetector = re.compile("<[^>]+>")
    # Iterate over each subfolder (community)
    for communityfolder in glob.glob(folder + '/*.com'):
        # Get community name
        communityname = communityfolder.split('/')[-1]
        # Process the post file for such community
        root = xml.etree.ElementTree.parse(communityfolder + "/Posts.xml").getroot()
        # Get all posts
        for row in root.iter('row'):
            id = row.attrib["Id"]
            body = {}
            # Get text without the HTML tags
            html = row.attrib["Body"]
            soup = BeautifulSoup(html, "lxml")
            body["text"] = soup.getText()
            # Get post tags
            if "Tags" in row.attrib:
                tagsraw = row.attrib["Tags"]
                body["tags"] = tagdetector.findall(tagsraw)
            # Add to Elasticsearch
            es.index(index=communityname, doc_type="post", id=id, body=body, request_timeout=30)
def feed_page(feed_link=None, feed_index=None):
    import time
    import feedparser
    from bs4 import BeautifulSoup
    from unidecode import unidecode
    from string import lower

    if not feed_index:
        try:
            if not feed_link:
                # No feed link was provided, so show the first one
                feed_index = 0
            else:
                # Try to extract the feed link number
                feed_index = [x[1] for x in app.config.get('FEEDS')].index(feed_link)
        except ValueError as ve:
            abort(404)
        except Exception as e:
            print(e)
            import traceback
            print(traceback.format_exc())
            feed_index = 0

    try:
        feedDesc, feedLink, feedURL = app.config.get('FEEDS')[feed_index]
    except:
        # feedDesc, feedLink, feedURL = ('Test Feed', 'test_feed', 'http://feedparser.org/docs/examples/atom10.xml')
        abort(404)

    rss = feedparser.parse(feedURL)
    maxEntries = 12
    rssFormatted = []
    for post in rss.entries[:maxEntries]:
        date_p = post.published_parsed
        date = time.strftime("%d.%m.%Y", date_p)
        catstr = unidecode(lower(post.category))
        for fr, to in [(u"tutkimus", u"research"), (u"palkitut", u"honored"),
                       (u"opiskelu", u"studies"), (u"yhteistyo", u"cooperation"),
                       (u"muut", u"other")]:
            catstr = catstr.replace(fr, to)
        category = catstr.split(',')
        # We use BeautifulSoup to strip html from the description
        descSoup = BeautifulSoup(post.description)
        desc = descSoup.getText()
        postDict = {'date': date,
                    'category': category,
                    'category_unformatted': post.category.replace(u",", u", "),
                    'title': post.title,
                    'description': desc,
                    'link': post.link}
        rssFormatted.append(postDict)

    try:
        return render_template('index.html', entries=rssFormatted)
    except TemplateNotFound:
        abort(404)
def clean_up(aLink):
    html = urllib2.urlopen(aLink).read()
    soup = BeautifulSoup(html, "html5lib")
    texts = soup.findAll(text=True)
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText().encode('utf-8').strip()
    l = visible_text.split('\n')
    # Cleaning up the crawled data
    l = filter(lambda element: ('â' not in element) and ('€' not in element)
               and (element != '') and ('\xc2\xa0' not in element), l)
    for i in range(len(l)):
        if 'for discount' in l[i]:
            deals_start = i + 1
    clean_list = []
    for i in range(deals_start, len(l)):
        if 'COMMENT' in l[i]:
            l[i] = '************break***************'
        elif (l[i].isupper()):
            continue
        elif 'miss a single chance' in l[i]:
            break
        clean_list.append(l[i])
    return clean_list
def _thread(self, args):
    category = args
    # browser asks question
    self._text_to_speech("What do you want to load, buddy?")
    # user gives answer
    answer = self.speech_to_text.convert()
    if not answer:
        return
    # get url from search engine
    url = search_engine(category, answer)
    if not url:
        return
    # browser tells user that content is being retrieved
    self._text_to_speech("Cool. I will get you stuff now...")
    # get web content
    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'lxml')
    # get text from web content
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    text = soup.getText()
    # speak each line of text
    try:
        for line in text.split('\n'):
            if self.is_stop:
                return
            if len(line) >= self.MIN_LINE_LENGTH:
                self._text_to_speech(line)
    except:
        print "Browser: error converting text to speech"
def get_cleantext(html):
    soup = BeautifulSoup(html)
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    cleantext = soup.getText()
    cleantext = cleantext.encode('utf8', 'ignore')
    cleantext = " ".join(cleantext.split())
    return cleantext
def getRssInfo():
    url = "http://www.huxiu.com/rss/0.xml"
    d = feedparser.parse(url)
    # print d.feed.title
    # print d.feed.link
    # print d.feed.description
    # print d.etag
    # print d.modifed
    infoList = []
    for entry in d.entries:
        info = {}
        info["url"] = entry.link
        info["newsid"] = getMd5(info["url"])
        info["title"] = entry.title
        info["description"] = entry.description
        info["ctime"] = (long)(time.mktime(entry.published_parsed))
        info["author"] = entry.source.title
        info["source"] = ctable
        info["keywords"] = ""
        # print entry
        # print info['url']
        # print info['newsid']
        # print info['title'], info['ctime']
        # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['ctime'])), info['author']
        # print entry.published_parsed
        # print info['description']
        soup = BeautifulSoup(entry.description, "html.parser", from_encoding="utf-8")
        img = soup.find("img")
        info["thumb"] = img.get("src") if img else ""
        info["summary"] = soup.getText()
        # print info['thumb']
        # print info['summary']
        infoList.append(info)
    return infoList
def getClasses():
    def getIndex(index):
        return visible_text.index(reqList[index][0] + " " + reqList[index][1])

    soup = BeautifulSoup(open('degree.htm'), 'html.parser')
    categories = soup.find_all("font")
    requirements = dict()
    reqList = list()
    # Looking for things like 1) CSE REQ or maybe > 2) Statistics - Reqd
    valid = re.compile(r".*([1-9]+\))\s([^-]+).*$")
    for req in categories:
        if req.string is not None:
            regexResult = valid.match(req.string)
            if regexResult is not None:
                reqList.append(regexResult.groups())
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    index = 0
    takenCourses = re.compile(r"((?:FA|SP|WI)(?:15|16|17))\s([A-Z]{3,4}\s+[0-9]{1,3}[A-Z]*)")
    missingReq = re.compile(r"\nNeeds:\s+([1-9]+)\sCourses\n")
    for index in range(1, len(reqList)):
        # Going to have the requirements be (takenCourses, missingCourses)
        requirements[reqList[index - 1][1]] = [[], 0]
        stringSegment = visible_text[getIndex(index - 1):getIndex(index)]
        regexResult = takenCourses.findall(stringSegment)
        for result in regexResult:
            # Add to the taken courses
            requirements[reqList[index - 1][1]][0].append(result[0] + " " + result[1])
        regexResult = missingReq.search(stringSegment)
        if regexResult is not None:
            requirements[reqList[index - 1][1]][1] = int(regexResult.groups()[0])
    return requirements
def parse_content(self, url):
    yield from asyncio.sleep(3.0)
    logging.info('Extracting content for: %s', url)
    # extract page content
    try:
        response = urllib.request.urlopen(url)
        content = response.read()
    except URLError:
        print('Error')
        return
    yield from asyncio.sleep(5.0)
    logging.info('Start to parse content for: %s', url)
    soup = BeautifulSoup(content, 'html.parser')
    # parse and store content of pages
    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()
    logging.info('Storing Content in for: %s', url)
    asyncio.Task(Page(url, soup.getText()).save())
    logging.info('Updated queue with new links: %s', url)
    asyncio.Task(self._extract_links(soup))
    logging.info('Finish to parse content for: %s', url)
def pull_headlines(tweet):
    ent = tweet.get('entities')
    urls = ent.get('urls')
    t = ""
    if urls:
        for u in urls:
            try:
                url = u.get('expanded_url')
                r = requests.get(url)
                headlines = BeautifulSoup(r.content).find('title')
                if not headlines:
                    headlines = BeautifulSoup(r.content).find('h1')
                # remove domain
                domain = '{uri.netloc}'.format(uri=urlparse(url)) + NEWS_DOMAINS
                hwords = [h for h in headlines.getText().split() if h.lower() not in domain]
                t = "%s %s" % (t, ' '.join(hwords))
            except:
                continue
    # also pull quoted tweets
    if tweet.get('is_quote_status'):
        try:
            quote = tweet.get('quoted_status').get('text')
        except:
            quote = ''
        t += quote
    return t
def getBillText():
    bill_id_list = getBillIdList()
    txt_json = getBill(bill_id_list)
    decoded_txt = ''
    txt_json = txt_json['bill']
    # 'text' is the first object in the json file, increment to the 'doc'
    txt_json = txt_json['texts']
    txt_json = txt_json[0]
    doc_id = txt_json['doc_id']
    searchId = requests.get('https://api.legiscan.com/?key=2d28553a502d7fed3b68863b2f592f19&op=getBillText&id=' + str(doc_id))
    resultsId = searchId.json()
    resultsId = resultsId['text']
    resultsId = resultsId['doc']
    decodedResults = base64.b64decode(resultsId.encode('ascii'))
    bsObj2 = BeautifulSoup(decodedResults)
    bsObj2.style.decompose()
    htmlText = bsObj2.getText()
    # the bill text is MIME:txt/html and base64 encoded. So decode it
    # decoded_txt = base64.b64decode(txt_json.encode('ascii'))
    # the decoded text is an ugly html string. Use BS to parse and clean it
    # This only works when MIME is html, need to account for PDF****
    # bsObj = BeautifulSoup(decoded_txt)
    # use BS to get the text from the bsObj
    # prettyText = bsObj.getText()
    return htmlText


'''
I've managed to parse the bill ids from the json file and can now use th
'''
def get_surname_info(last_name):
    url = ("https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=intitle:%22" +
           urllib.quote(last_name.encode('utf8')) + "%22AND%22surname%22&srlimit=2&format=json&utf8=")
    response = urllib.urlopen(url)
    data = json.loads(response.read())
    nomatches = True
    if len(data['query']['search']) > 0:
        for d in data['query']['search']:
            # print d
            if "is a" in d['snippet'] and "surname" in d['snippet']:
                nomatches = False
                # print d['snippet']
                soup = BeautifulSoup(d['snippet'].encode("utf-8"), "html.parser")
                text = soup.getText()
                sentences = text.split(".")
                finals = ''
                for s in sentences:
                    # print s
                    if last_name in s or finals != '':
                        finals = finals + s
                    if "is a " in s or "is an " in s:
                        if len(s) > 35:
                            if last_name in s:
                                return s
                            else:
                                return finals
                        break
    if nomatches:
        return "NO ORIGIN DATA"
def web_crawling(url):
    """
    Main task is to extract the page content by url, parse the html,
    get all links and add them to the redis queue.
    """
    logging.info('Extracting content for: %s', url)
    # extract page content
    try:
        page = urlopen(url)
        content = page.read()
    except (HTTPError, URLError):
        return
    logging.info('Start to parse content for: %s', url)
    soup = BeautifulSoup(content, 'html.parser')
    # parse and store content of pages
    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()
    page = Page(url, soup.getText())
    page.save()
    logging.info('Stored Content in for: %s', url)
    # find all links and add to queue
    links = soup.findAll('a', attrs={'href': re.compile('^http://')})
    for link in links:
        href = link.get('href')
        q.put(href)
        logging.info('Added %s to Url Queue for processing', url)
    logging.info('Finish to parse content for: %s', url)
def get_debate(url):
    r = requests.get(url)
    tmp = BeautifulSoup(r.text, 'html.parser')
    debate = []
    check = []
    trump_flag = False
    for t in tmp.select('div p'):
        line = t.text
        soup = BeautifulSoup(line)
        line = soup.getText()
        line = line.encode('utf-8', 'ignore')
        line = deal_with_unicode(line)
        if line.startswith('TRUMP'):
            trump_flag = True
            line = ' '.join(line.split()[1:])
            line = line.strip()
            if not line.startswith("...") and len(line.split()) > 4:
                debate.append(line)
        elif line.split()[0].isupper() and line.split()[0] != "(APPLAUSE)":
            trump_flag = False
        elif trump_flag:
            line = line.strip()
            if len(line.split()) > 4:
                debate.append(line)
            else:
                check.append(line)
    return debate
def parse(self, html):
    soup = BeautifulSoup(html, "html.parser")
    # clean-up invalid HTML tags
    prettyHtmlDoc = soup.prettify()
    # get title, body, etc. text
    allText = soup.getText()
    return allText
def getRssInfo():
    url = 'http://36kr.com/feed'
    d = feedparser.parse(url)
    # print d.feed.title
    # print d.feed.link
    # print d.feed.description
    # print d.etag
    # print d.modifed
    infoList = []
    for entry in d.entries:
        info = {}
        info['url'] = entry.link
        info['newsid'] = getMd5(info['url'])
        info['title'] = entry.title
        info['description'] = entry.description
        info['ctime'] = (long)(time.mktime(entry.published_parsed))
        info['author'] = entry.author
        info['source'] = ctable
        info['keywords'] = ''
        soup = BeautifulSoup(entry.description, "html.parser", from_encoding='utf-8')
        img = soup.find('img')
        info['thumb'] = img.get('src') if img else ''
        info['summary'] = soup.getText()
        # print entry
        # print info['newsid'], info['url']
        # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['ctime'])), info['title']
        # print info['author'], info['thumb']
        # print info['description']
        # print info['summary']
        infoList.append(info)
    return infoList
def get_text(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    text = soup.getText()
    return text
def privmsg(self, user, channel, msg):
    if msg == '_unmute':
        if channel in self.config['muted']:
            self.config['muted'].remove(channel)
            self.manager.wittyconf.update_plugin_config(self.plugin_name, self.config)
            logging.info('Removed channel %s from the mute list' % channel)
    elif msg == '_mute':
        if channel not in self.config['muted']:
            self.config['muted'].append(channel)
            self.manager.wittyconf.update_plugin_config(self.plugin_name, self.config)
            logging.info('Added channel %s to the mute list' % channel)

    if self.block or channel in self.config['muted']:
        return

    if msg.startswith('_feed'):
        self.block = True
        url = msg[5:].strip()
        self.manager.app.say(channel, 'Eating %s...' % url)
        try:
            html = urlopen(url)
        except URLError, e:
            self.manager.app.say(channel, e)
            return
        soup = BeautifulSoup(html)
        text = soup.getText()
        f = open(self.data_path, 'a')
        f.write(text.encode('ascii', 'ignore'))
        f.close()
        logging.info('Reloading...')
        self.post_init()
        self.manager.app.say(channel, 'Done. So much wisdom!')
        self.block = False
def getArticleFromXML(self, root):
    tags = []
    # Fetch id, title and categories
    id = root.find('{http://www.w3.org/2005/Atom}id').text
    title = unicode(root.find('{http://www.w3.org/2005/Atom}title').text)
    # Check if record needs to be eliminated from zotero OR
    # resource title needs to be stripped
    if ':' in title:
        if self.isOmissible(title):
            log.debug('Omitting record with title- %s' % title)
            return None
        else:
            title = self.stripRsrc(title)
    categories = root.findall('{http://www.w3.org/2005/Atom}category')
    tags = self.produceTag(tags, categories, title)
    # Fetch HTML content and URL
    content = ''
    url = ''
    if root.find('{http://www.w3.org/2005/Atom}content').text != None:
        soup = BeautifulSoup(root.find('{http://www.w3.org/2005/Atom}content').text)
        content = soup.getText()
        if soup.find('a') != None:
            url = (soup.find('a')).get('href')
            httpObj = httplib2.Http()
            try:
                log.debug('Trying URL:%s' % url)
                resp, content = httpObj.request(url, 'HEAD')
                if resp['status'] == '404':
                    tags.append({'tag': '404:' + time.strftime("%H-%M-%S")})
                    log.info('Code 404: URL %s broken.' % url)
                elif resp['status'] == '301':
                    tags.append({'tag': '301:' + time.strftime("%H-%M-%S"), 'tag': 'old-url:' + url})
                    log.info('Code 301: URL %s redirecting to %s.' % (url, resp['location']))
                    url = resp['location']
            except socket_error:
                tags.append({'tag': '111:Connection refused'})
                log.info('Connection refused: URL %s' % url)
            except httplib2.RedirectLimit as redir_lim:
                tags.append({'tag': '301:' + time.strftime("%H-%M-%S")})
                log.info('Redirect limit reached:' + url)
                log.info(str(redir_lim))
            except Exception as httpex:
                tags.append({'tag': str(httpex.__class__.__name__)})
                log.info(str(httpex) + ':' + url)
    blogUrl = ''
    tmpList = root.findall("{http://www.w3.org/2005/Atom}link[@rel='alternate']")
    if len(tmpList) > 0:
        blogUrl = tmpList[0].attrib['href']
    # Eliminate the first category value (http://../kind#post) that's taken as a tag
    if 'kind#post' in tags[0]['tag']:
        tags.pop(0)
    issn = self.getISSNFromXML(root)
    if issn != None:
        return Article(id, title, tags, content, url, blogUrl, issn, 'journalArticle')
    else:
        return Article(id, title, tags, content, url, blogUrl, issn, 'webpage')
def clean_html_and_save_to_file(file_directory, file_name, save_directory):
    try:
        # File location
        file_path = file_directory + file_name
        # Open file and turn into beautiful soup object
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            soup_object = BeautifulSoup(file, "html.parser")
        # Extract only text from document
        text_section = soup_object.getText()
        # Convert all text to string
        str_text_section = str(text_section)
        # Trim file based on following; "ALL SCRIPTS" => start and "Back to IMSDb" => end of script
        start = str_text_section.find("ALL SCRIPTS")
        finish = str_text_section.find("Back to IMSDb")
        cleaned_text = str_text_section[start:finish]
        # Save text to file
        new_file_name = file_name.replace(".html", ".txt")  # change file extension
        save_path = save_directory + new_file_name
        new_file = open(save_path, "w")
        new_file.write(cleaned_text)
    except FileNotFoundError:
        print("File not found")
def crawl_website(website, zin, resultaten, domains, index, beschrijving, titel):
    try:
        url = urlparse(website)
        domain = url.netloc
        while domain in domains:
            time.sleep(1)
        domains[index] = domain
        html = get(website)
        domains[index] = ""
        soup = BeautifulSoup(html, "html.parser")
        if config["CRAWLER"]["CHECK_ONLY_VISIBLE_TEXT"].lower() == "true":
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title', 'meta'])]
        text = soup.getText()
        overeenkomstWaarde, plagiaatZin, text = algoritme.vergelijk_tekst(text, zin)  # this is the text comparison algorithm
        if overeenkomstWaarde > 3 and website not in veelGevonden:
            # if the plagiarism score is high, more may have been copied, so scan the url for every sentence
            # veelGevonden.append(website)
            veelGevonden.append((website, beschrijving, titel))
        resultaten[index] = {
            "waarde": overeenkomstWaarde,
            "plagiaatZin": plagiaatZin,
            "text": text,
            "url": website,
        }  # this effectively acts as the return value
    except Exception as e:
        print(e)
        resultaten[index] = {
            "waarde": 0,
            "plagiaatZin": "",
            "text": zin,
            "url": website,
        }
    finally:
        domains[index] = ""
def index_document(id, filename):
    # we get the file from disk and initialize an html parsing library with it
    soup = BeautifulSoup(open(filename))
    # remove all the text inside every <code>, <style> and <script> tag
    # [x.extract() for x in soup.findAll('script')]
    map(lambda x: x.extract(), soup.findAll("code"))    # delete all
    map(lambda x: x.extract(), soup.findAll("style"))   # delete all
    map(lambda x: x.extract(), soup.findAll("script"))  # delete all
    # removes all the special punctuation
    text = remove_puctuation(soup.getText())
    # get title of document
    title = soup.title.contents[0]
    # get snippet of text (20 words from the middle of the text)
    snip = text.split()[160:190]
    # save the title and the snippet into the document_table (reference)
    document_table[id].title = title
    document_table[id].snip = " ".join(snip)
    # text = soup.getText()
    # tokenize the page into words separated by spaces
    terms = [i.lower() for i in text.split() if i not in stop]
    for term in terms:
        try:
            invindex[term][str(id)] = invindex[term][str(id)] + 1
        except:
            invindex[term][str(id)] = 1
def get_hocr_zones(processing_folder, png_filename, engine="tesseract"):
    image_filename = processing_folder + "/" + png_filename
    logging.info(image_filename)
    image = PillowImage.open(image_filename)
    if engine == "tesseract":
        engine_filename = engine + ".hocr"
    else:
        engine_filename = engine + ".hocr.html"
    hocr_filename = "{0}/{1}/{2}".format(processing_folder, engine, engine_filename)
    soup = BeautifulSoup(open(hocr_filename))
    logging.info("opened " + hocr_filename)
    logging.info(soup.getText())
    regions = []
    for zone, region in read_hocr_tesseract(soup, image):
        regions.append(region)
        # TODO page number folder
        zone.save("{0}/{1}/{2}.bin.png".format(processing_folder, engine, region["id"]))
        with io.open("{0}/{1}/{2}.txt".format(processing_folder, engine, region["id"]),
                     "w", encoding="utf8") as fh:
            fh.write(region["text"])
    with io.open("{0}/{1}/master.json".format(processing_folder, engine),
                 "w", encoding="utf8") as fh:
        fh.write(u"var regions = \n")
        fh.write(json.dumps(regions, ensure_ascii=False))
    logging.info("Done")
class BSTokenizer:
    """ """

    def __init__(self, url):
        """
        Receives a URL, reads it, creates a BeautifulSoup object and removes
        the sections we do not want.

        Arguments:
            url (str): The string representation of an url to be analyzed
        """
        self.raw_content = urllib2.urlopen(url).read()
        self.soup = BeautifulSoup(self.raw_content, "html.parser")
        [s.extract() for s in self.soup(['style', 'script', '[document]', 'head', 'title', 'meta'])]

    def get_most_common_words(self, amount=100):
        """Gets all the text, splits it, and counts words using a utility from
        the collections standard library.

        Arguments:
            amount (int): The amount of elements that should be returned

        Returns:
            List: A list of (word, count) pairs, e.g. [("hello", 10), ("bye", 4), ...]
        """
        text = self.soup.getText()
        frequencies = Counter(text.split())
        return frequencies.most_common(amount)
def summarize():
    text = ""
    if shmoopurl:
        page = requests.get(shmoopurl + "summary.html")
        soup = BeautifulSoup(page.content, 'lxml')
        descript = BeautifulSoup(str(soup.find(class_='content-learning-guide')), 'lxml')
        title = descript.find('h1')
        par = descript.find_all('p')
        text += "<h1>" + title.getText() + "</h1>"
        i = 0
        while i < len(par) - 4:
            text += par[i].getText()
            i += 1
        while "\n" in text:
            text = text.replace('\n', '<br>')
    elif sparkurl:
        page = requests.get(sparkurl + "summary.html")
        soup = BeautifulSoup(page.content, 'lxml')
        char = BeautifulSoup(str(soup.find(class_='studyGuideText')), 'lxml')
        ads = char.find_all(class_='floatingad')
        strads = []
        text = char.getText()
        for ad in ads:
            strads.append(BeautifulSoup(str(ad), 'lxml').getText())
        for ad in strads:
            text = text.replace(ad, '')
        while '\n' in text:
            text = text.replace('\n', '<br>')
    return text
def getContent(self):
    # Scrape the site for the article content
    html = urllib.urlopen(self.article).read()
    soup = BeautifulSoup(html)
    [s.extract() for s in soup(["style", "script", "[document]", "head", "title"])]
    self.visible_text = soup.getText()
    return
def getRssInfo(url):
    d = feedparser.parse(url)
    # print d.feed.title
    # print d.feed.link
    # print d.feed.description
    # print d.etag
    # print d.modifed
    infoList = []
    for entry in d.entries:
        info = {}
        # print entry
        info['url'] = entry.link
        info['newsid'] = getMd5(info['url'])
        info['title'] = entry.title
        info['ctime'] = (long)(time.mktime(entry.published_parsed))
        info['author'] = entry.author
        # print timeFormat.getTimeStamp(info['ctime']), info['title']
        info['source'] = ctable
        tags = entry.tags if 'tags' in entry else None
        info['keywords'] = ','.join(tag.term for tag in tags) if tags else ''
        info['description'] = entry.summary  # entry.content[0].value
        soup = BeautifulSoup(info['description'], "html.parser", from_encoding='utf-8')
        img = soup.find('img')
        info['thumb'] = img.get('src') if img else ''
        info['summary'] = soup.getText()  # ' '.join(p.getText().strip() for p in soup.find_all('p'))
        # print info['newsid'], info['url']
        # print info['author']
        # print info['keywords'], info['thumb']
        # print info['summary']
        # print info['description']
        infoList.append(info)
    return infoList
def cleanHtml(i):
    i = str(i)
    bs = BeautifulSoup(i)
    i = bs.getText()
    return i
def snippet_builder(doc_ID):
    if doc_ID in key_docs:
        a = search_list_of_dict(doc_ID, sample_content_chunk)
        snippet = {}
        soup = BeautifulSoup(a['html'])
        s = soup.getText()
        desc = s[1:300]
        snippet['title'] = a['title']
        snippet['href'] = a['link']
        snippet['desc'] = desc + '...'
        return snippet
def tokenize(rawHtml):
    soup = BeautifulSoup(rawHtml, 'html.parser')
    rawDocument = soup.getText().encode('utf-8').lower()
    tokens = nltk.word_tokenize(rawDocument)
    for punctuation in string.punctuation:
        tokens = filter(lambda a: a != punctuation, tokens)
    # remove `` manually
    tokens = filter(lambda a: a != "``", tokens)
    # remove '' manually
    tokens = filter(lambda a: a != "''", tokens)
    return tokens
def format_item(item):
    url = "https://intra.epitech.eu"
    soup = BeautifulSoup(item["title"], "html.parser")
    links = []
    output = u"EPITECH\n-------\n{message}\n-----\nLiens:\n{links}"
    for link in soup.find_all("a"):
        links.append("[{index}]: {url}{link}".format(index=len(links) + 1, url=url, link=link.get("href")))
        link.replace_with("{string}[{index}]".format(string=link.string, index=len(links)))
    message = soup.getText()
    return output.format(message=message, links="\n".join(links))
def download_page(url):
    cookies = pickle.load(open("d:/cookies.pkl", "rb"))
    print(cookies)
    cookie_jar = RequestsCookieJar()
    for c in cookies:
        cookie_jar.set(c['name'], c['value'], domain="jd.com")
    page = requests.get(url, cookies=cookie_jar)
    soup = BeautifulSoup(page.text, 'html.parser', from_encoding='utf-8')
    print(soup.getText())
    print(page)
    print('scrape succeeded')
def make_lexifications():
    host_name = name.get()
    port_no = port.get() + 3602
    port_string = str(port_no)
    output = "http://" + host_name + ":" + port_string + "/cgi-bin/cg?cb-start"
    print(output)
    number_of_terms = create_cyc_lex_no_var.get()
    path = create_cyc_lex_var.get()
    titlish = create_cyc_lex_name_var.get()
    progress_bar["maximum"] = 3
    if path == "":
        msg.showwarning("Utility Warning", "No path specified.")
        return ""
    if titlish == "":
        msg.showwarning("Utility Warning", "Provide a filename.")
        return ""
    if number_of_terms == "":
        msg.showwarning("Utility Warning", "Indicate number of terms.")
        return ""
    try:
        urllib.request.urlopen(output)
        lights_on()
        print("OPEN")
    except urllib.error.HTTPError:
        msg.showwarning("Utility Warning", "Check connection.")
        return ""
    except urllib.error.URLError:
        msg.showwarning("Utility Warning", "Check connection.")
        return ""
    progress_bar["value"] = 1
    progress_bar.update()
    subl_query = "(get-lexification-sets-for-n-concepts " + str(number_of_terms) + ")"
    uri_query = urllib.parse.quote(subl_query)
    header = "http://" + host_name + ":" + port_string + "/cgi-bin/cb-eval-subl?expression="
    request = header + uri_query
    print(request)
    progress_bar["value"] = 2
    progress_bar.update()
    content_return = urllib.request.urlopen(request).read()
    soup_return = BeautifulSoup(content_return, "html.parser")
    json_return = json.loads(soup_return.getText())
    returns = json_return["results"]
    returns_string = str(returns)
    returns_string = returns_string[3:-3]
    returns_string = returns_string.replace("\\'", "\'")
    returns_dict = eval(returns_string)
    progress_bar["value"] = 3
    progress_bar.update()
    file_name = path + "/" + titlish + '.pickle'
    with open(file_name, 'wb') as handle:
        pickle.dump(returns_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sleep(0.75)
    progress_bar["value"] = 0
def get_article_text(article_url):
    if type(article_url) != str and type(article_url) != unicode:
        raise TypeError('URL must be a string')
    html = requests.get(article_url).content
    soup = BeautifulSoup(html, 'html.parser')
    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()
    return soup.getText().strip().encode("utf-8")
def get_file_list(course_url, cookies):
    r = requests.get(course_url, cookies=cookies)
    soup = BeautifulSoup(r.text, 'lxml')
    soup_list = soup.find_all('div', {'class': 'activityinstance'})
    files = {}
    for soup in soup_list:
        soup = soup.find('a')
        link = soup['href']
        if 'resource' in link:
            files[link.split('=')[-1]] = soup.getText()
    return files
def parse_details(self, response):
    data = response.meta['data']
    try:
        d = BeautifulSoup("\n".join(
            response.xpath("//p[@style = 'text-align: justify;']").extract()))
        description = d.getText().replace("\u2019", "'")
    except:
        description = "Description indisp."
    try:
        ttb = BeautifulSoup(
            response.xpath("//div[@class='biz-hours']").extract_first()
            .replace("</h4>", " : ").replace(" </li>", ". "))
        timetable = ttb.getText()
    except:
        timetable = "Horaires indisp."
    try:
        p = BeautifulSoup(
            response.xpath("//div[@class='fee-kind']").extract_first()
            .replace("</h4>", "\n").replace('class ="fee-conditions">', '> (')
            .replace("</ul>", ")").replace("</li>", ". "))
        price = normalize("NFKD", p.getText())
    except:
        price = "Tarifs indisp."
    data['url'] = response.url
    data['timetable'] = timetable
    data['reviews'] = ""
    data['rank'] = 0
    data['summary'] = description
    data['price'] = price
    data['source'] = "3-expoInTheCity"
    yield data
def fetch_artical(url):
    d = {}
    resp = requests.get(url)
    if resp.status_code != 200:
        return None
    content = BeautifulSoup(resp.text, 'html.parser')
    # get the board, article title and author
    if content.select_one('#topbar > a.board > span') != None:
        content.select_one('#topbar > a.board > span').decompose()
    board = ''
    if content.select_one('#topbar > a.board') != None:
        board = content.select_one('#topbar > a.board').getText()
    title = ''
    if content.select_one('#main-content > div:nth-child(3) > span.article-meta-value') != None:
        title = content.select_one('#main-content > div:nth-child(3) > span.article-meta-value').getText()
    author = ''
    if content.select_one('#main-content > div:nth-child(1) > span.article-meta-value') != None:
        author = content.select_one('#main-content > div:nth-child(1) > span.article-meta-value').getText()
    # remove pushes and other unimportant information
    content = content.select_one('#main-content')
    if content == None:
        print(url)
    for metaline in content.find_all('div', class_='article-metaline'):
        metaline.decompose()
    if content.select_one('#main-content > div.article-metaline-right') != None:
        content.select_one('#main-content > div.article-metaline-right').decompose()
    pushes = content.find_all('div', class_='push')
    for p in pushes:
        p.decompose()
    # get the article body
    artical = content.getText()
    json_data = {
        'board': board,
        'title': title,
        'author': author,
        'artical': artical
    }
    return json_data
def get_chapter_text(self, chapter):
    """Gets the chapter text from the specified chapter.
    Work.load_chapters() must be called first."""
    if chapter > 0 and chapter <= self.chapters and self.chapters > 1:
        if len(self.chapter_ids) == self.chapters:
            chapter_html = self.request(
                "https://archiveofourown.org/works/%i/chapters/%s?view_adult=true"
                % (self.workid, self.chapter_ids[chapter - 1]))
            div = chapter_html.find("div", {'role': 'article'})
            return str(BeautifulSoup.getText(div))
        else:
            raise utils.UnloadedError("Work.load_chapters() must be called first")
    elif chapter == 1:
        div = self.soup.find("div", {'role': 'article'})
        return str(BeautifulSoup.getText(div))
    else:
        raise utils.UnloadedError("Work.load_chapters() must be called first")
def index_document(document):
    # Create BeautifulSoup object from html text, and ignore/remove the non-ASCII
    soup = BeautifulSoup(document.encode("ascii", errors='ignore'), 'html.parser')
    # Remove non-visible tags
    # [Reference: https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text]
    [tag.extract() for tag in soup(['style', 'script', '[document]', 'head', 'title'])]
    # Get visible text from html document
    visible_text = soup.getText()
    return index_string(visible_text)
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.getText()
    res = ''
    for line in text.split('\n'):
        # strip leading space characters
        while line is not None and len(line) > 2 and line[0] == ' ':
            line = line[1:]
        if line is not None and len(line) > 1 and line[0] != '\n':
            # print(line)
            res = res + line + '\n'
    return res
def parse_json():
    parser = EmailReplyParser(language='en')
    with open('english.json', 'rb') as fl:
        messages = json.load(fl)
    parsed = []
    for text in messages:
        soup = BeautifulSoup(text, 'lxml')
        text = soup.getText('\n')
        text = parser.parse_reply(text)
        parsed.append(text)
    import code
    code.interact(local=locals())
def verify():
    parser = EmailReplyParser(language='fi')
    texts = json.load(open('test/emails/emails.json'))
    texts = list(filter(lambda d: type(d) == str, texts))
    parsed = []
    for text in texts:
        print('-' * 100)
        soup = BeautifulSoup(text, 'lxml')
        text = soup.getText('\n')
        text = parser.parse_reply(text)
        parsed.append(text)
        print(text)
def get_areaCode(self, number):
    try:
        url = 'http://www.allareacodes.com/' + str(number)
        site = self.mechRead(url)
        nameSplit1 = site.split('<td>Major City:</td>')[1]
        nameSplit2 = nameSplit1.split('</td>')[0]
        soup = BeautifulSoup(nameSplit2)
        area = soup.getText()
        return area.encode("utf-8")
    except Exception:
        return 'None'
def transformContent(self, dbName, collectionName):
    db = self.mongoClient.getConnection(dataBaseName=dbName)
    collection = db[collectionName]
    for i in collection.find():
        content = i['content']
        soup = BeautifulSoup(content, 'lxml')
        content = soup.getText()
        i['content'] = self.cuttingMachine.deleSpecialChar(content)
        i['keywords'] = self.cuttingMachine.doCutting(i['content'])
        i['status'] = 0  # processing status, 0: unprocessed, 1: queued, 2: processing, 3: done
        i['mtime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        db.article_text.insert(i)
def get_Job(self, url):
    site = self.mechRead(url)
    try:
        nameSplit1 = site.split('<div class="_42ef"><div><div class="_50f3">')[1]
        nameSplit2 = nameSplit1.split('<span class="_50f8">')[0]
        soup = BeautifulSoup(nameSplit2)
        job = soup.getText()
        return job.encode("utf-8")
    except Exception:
        return 'None'
def buildHashMap(zipped_folder):
    '''
    This function will take in a zipped folder of html pages and iterate through them,
    extracting all words from the pre-processed html text. It will output a dictionary
    of words occurring in the collection of pages. The dictionary keys are words that
    contain a list of occurrences and locations of where the terms are found in the documents.
    '''
    with ZipFile(zipped_folder, 'r') as zip:
        # Display the directory of files in the zip folder
        zip.printdir()
        word_list = {}
        # Loop through each file in the zip container
        for name in zip.namelist():
            # Read the html file
            data = zip.read(name)
            # Parse the data with BeautifulSoup
            soup = BeautifulSoup(data, 'html.parser')
            # Extract the text from the html file and generate a list of "words" separated by spaces
            page_text = soup.getText().split()
            # Loop through words in extracted text and preprocess to exclude prohibited words
            doc_words = {}
            for id, word in enumerate(page_text):
                # Check if word contains only letters
                if word.isalpha():
                    # set word to lowercase
                    word = word.lower()
                    # Map frequencies and occurrences for each word in the corpus
                    if word in doc_words.keys():
                        doc_words[word]['freq'] += 1
                        doc_words[word]['locations'].append(id)
                    else:
                        doc_words[word] = {
                            'doc_id': name,
                            'freq': 1,
                            'tf_idf': 0,
                            'locations': [id]
                        }
            # Add document words to corpus vocabulary
            for word in doc_words.keys():
                if word in word_list.keys():
                    word_list[word].append(doc_words[word])
                else:
                    word_list[word] = [doc_words[word]]
            doc_words = {}
    return word_list
def process_set(paths, sent_dict):
    neg_but_pos = 0
    pos_but_neg = 0
    reviews = []
    for path in paths:
        fileset = [f for f in listdir(path)]
        for f in fileset:
            with open(path + "/" + f, 'r', encoding="utf8") as content_file:
                content = content_file.read()
            soup = BeautifulSoup(content.lower(), 'html.parser')
            tokens = word_tokenize(soup.getText())
            tagged_tokens = pos_tag(tokens)
            review_str = ""
            not_flag = False
            word_count = 0
            pos_score = 0
            neg_score = 0
            for (x, y) in tagged_tokens:
                if y.startswith("JJ") or y.startswith("RB") or y.startswith("NN") or y.startswith("V"):
                    review_str += x + " "
                    scores = sent_dict.get(x, [0, 0])
                    pos = scores[0]
                    neg = scores[1]
                    if not_flag:
                        pos_score += neg
                        neg_score += pos
                    else:
                        pos_score += pos
                        neg_score += neg
                elif x == "no" or x == "not":
                    not_flag = True
                else:
                    not_flag = False
            if pos_score > neg_score:
                if "neg" in path:
                    pos_but_neg += 1
            elif neg_score > pos_score:
                if "pos" in path:
                    neg_but_pos += 1
            reviews.append(("processed_" + path + "/" + f.split('.')[0] + ".json",
                            review_str, pos_score, neg_score))
    tlog("Positive reviews with negative score: " + str(neg_but_pos))
    tlog("Negative reviews with positive score: " + str(pos_but_neg))
    return reviews
def search_google(query):
    '''Search google and determine if wikipedia is in it'''
    search_object = google.search(query)
    # Determine if a wikipedia url is in the first few searches
    urls = []
    for i in range(0, 4):
        url = search_object.__next__()
        urls.append(url)
        if "wikipedia.org/wiki" in url:
            wikipedia_search = wikipedia.search(query)[0]
            url = wikipedia.page(wikipedia_search).url
            response = wikipedia.summary(wikipedia_search) + " ({0})".format(url)
            return response
    # If there were no wikipedia pages
    first_url = urls[0]
    try:
        article = Article(first_url)
        article.download()
        article.parse()
        article.nlp()
        article_summary = article.summary
        article_title = article.title
        return "{0}\n{1} - ({2})".format(article_summary, article_title, first_url)
    except Exception as article_exception:
        try:
            log.debug("Got error {0}, {1} while using newspaper, switching to bs4".format(
                article_exception.message, article_exception.args))
            html = requests.get(first_url).text
            # Parse the html using bs4
            soup = BeautifulSoup(html, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
            text = soup.getText()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            soup_text = '\n'.join(chunk for chunk in chunks if " " in chunk)
            response = format(soup_text) + " ({0})".format(first_url)
            return response
        except Exception as search_exception:
            log.info("Error {0},{1} occurred while searching query {2}".format(
                search_exception.message, search_exception.args, query))
            return "Error encountered on query {0}".format(query)
def get_page(resp):
    """
    Argument: the object containing information about the GET request.
    The function will get the text of the file and write it to the page_text file line by line.
    """
    txt = resp.text
    soup = BeautifulSoup(txt, "html.parser")
    text_page = [soup.getText()]
    page_txt = open("page_text", 'w')
    for line in text_page:
        line = line.encode('utf-8').strip()
        page_txt.write(line + "\n")
    page_txt.close()
def get_City(self, url):
    site = self.mechRead(url)
    try:
        nameSplit1 = site.split('hovercard="/ajax/hovercard/page.php?id=112222822122196">')[2]
        nameSplit2 = nameSplit1.split('</a></div>')[0]
        soup = BeautifulSoup(nameSplit2)
        city = soup.getText()
        city = city.replace(',', '')
        return city
    except Exception:
        return 'None'
def get_content(url):  # first case: url is a link on the web
    try:
        resp = urllib.request.urlopen(url)
        html = resp.read()
        html = (html.replace('<br>', '\n')).replace('<br/>', '\n')
        # urllib.request.urlopen(url).read()
        bs = BeautifulSoup(html, "html.parser")
        return bs.get_text(), bs
    except:
        # second case: url is a local file path
        try:
            htmlfile = open(url, 'r', encoding='utf-8')
            html = htmlfile.read()
            html = (html.replace('<br>', '\n')).replace('<br/>', '\n')
            bs = BeautifulSoup(html, "html.parser")
            return bs.getText(), bs
        except:
            # third case: a fake html link, so it still needs to be downloaded first
            import requests
            r = requests.get(url)
            with open('laji.html', "wb") as f:  # add the file extension here
                f.write(r.content)
            url = 'laji.txt'
            htmlfile = open('laji.html', 'r', encoding='utf-8')
            html = htmlfile.read()
            html = (html.replace('<br>', '\n')).replace('<br/>', '\n')
            bs = BeautifulSoup(html, "html.parser")
            return bs.getText(), bs
    return bs.get_text(), bs
def get_strings(html, select=None):
    bs = BeautifulSoup(html, 'lxml')
    if select is not None:
        bs = bs.select(select)[0]
    for s in bs(['script', 'style']):
        s.extract()
    txt = bs.getText(separator=' ')
    res = []
    for line in txt.split('\n'):
        pret = prettify(line)
        if pret != '':
            res.append(pret)
    return res
def clean_html(html):
    if "<" in html and ">" in html:
        try:
            soup = BeautifulSoup(html, features="html.parser")
            plist = soup.find('plist')
            if plist:
                plist.decompose()  # remove plists because ugh
            text = soup.getText()
        except:
            text = remove_tags(html)
        return '. '.join(text.split("\r\n\r\n\r\n"))
    else:
        return '. '.join(html.split("\r\n\r\n\r\n"))
def getalltext(soup: BeautifulSoup) -> object:
    """
    Returns all text from an HTML soup
    :param soup: Beautifulsoup document
    :return: Text without HTML tags
    """
    # if isinstance(soup, NavigableString):
    #     return str(soup)
    # txt = ''.join(soup.find_all(text=True))
    # body = soup.find('body').getText()
    # title = soup.find('title').getText()
    # return re.sub(r'[\n\t\r\,]', ' ', body + title)
    return re.sub(r'[\n\t\r\,]', ' ', soup.getText())
def searchWikia(query):
    engine = "http://lyrics.wikia.com/wiki/Special:Search?search="
    searchlink = engine + query
    soup = BeautifulSoup(urllib.request.urlopen(searchlink), "lxml")
    soup = soup.find("a", "result-link")
    if soup != None:
        desc = soup.getText()
        link = soup["href"]
    else:
        desc = None
        link = None
    result = SearchResult(desc, link)
    return result
def get_text(page_source):
    """
    :param page_source:
    :return: the Text content in the page source
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    texts = soup.findAll(text=True)
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    return visible_text
def extract_content(self, soup: BeautifulSoup) -> str:
    txt = str(soup)
    start_tag = '<div class="story_text">'
    end_tag = '<p class="autor">'
    start = txt.find(start_tag)
    end = txt.find(end_tag)
    content = txt[start:end]
    content = BeautifulSoup(str(content), features="lxml")  # strip all remaining <> tags
    return content.getText()
def process(self, html: BeautifulSoup, url, status_code):
    if status_code == 200:
        canonical = html.find("link", {"rel": "canonical"})
        if canonical is None:
            sep = chr(1)
            texts = [t.strip(" \n\t") for t in html.getText(separator=sep).split(sep)]
            data = PageContent(url, [t for t in texts if len(t.split()) > self.min_length])
            self.content.append(data)