def start(self):
    while True:
        # Get links from the subreddit we are interested in
        for submission in self.subreddit.get_hot(limit=10):
            # Check for the cases where we will skip a submission:
            if "imgur.com/" not in submission.url:
                continue  # skip non-imgur submissions
            if 'http://imgur.com/a/' in submission.url:
                # This is an album submission.
                albumId = submission.url[len('http://imgur.com/a/'):]
                htmlSource = requests.get(submission.url).text
                soup = BeautifulSoup(htmlSource)
                try:
                    matches = soup.select('.album-view-image-link a')
                    for match in matches:
                        imageUrl = match['href']
                        if '?' in imageUrl:
                            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
                        else:
                            imageFile = imageUrl[imageUrl.rfind('/') + 1:]
                        self.checkHigherRes('http:' + match['href'])
                except Exception as e:
                    print e.message
            elif 'http://i.imgur.com/' in submission.url:
                # The URL is a direct link to the image.
                # Using a regex here instead of BeautifulSoup because we are parsing a URL, not HTML.
                mo = self.imgurUrlPattern.search(submission.url)
                imgurFilename = mo.group(2)
                if '?' in imgurFilename:
                    # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
                    imgurFilename = imgurFilename[:imgurFilename.find('?')]
                self.checkHigherRes(submission.url)
            elif 'http://imgur.com/' in submission.url and 'http://i.imgur.com/' not in submission.url:
                # This is an Imgur page with a single image.
                htmlSource = requests.get(submission.url).text  # download the image's page
                soup = BeautifulSoup(htmlSource)
                try:
                    imageUrl = soup.select('.image div img')[0]['src']
                    if imageUrl.startswith('//'):
                        # If no scheme is supplied in the URL, prepend 'http:' to it.
                        imageUrl = 'http:' + imageUrl
                    imageId = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('.')]
                    if '?' in imageUrl:
                        imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
                    else:
                        imageFile = imageUrl[imageUrl.rfind('/') + 1:]
                    self.checkHigherRes(imageUrl)
                except Exception as e:
                    print e.message
        time.sleep(1800)
def get_elections():
    r = requests.get('http://elections.sos.state.tx.us/index.htm')
    soup = BeautifulSoup(r.text)
    return [{'election_code': o['value'], 'title': o.text}
            for o in soup.select('option')]
def getSeasonPage(seasonLink):
    sleep(0.5)
    request_season_page = requests.get(seasonLink, headers=headers)
    soup = BeautifulSoup(request_season_page.text, 'html.parser')
    seasonEpisodes = soup.select('[class*="EpisodeListItem__title"]')
    minEntryIndex = 1
    maxEntryIndex = len(seasonEpisodes)
    for index, seasonEpisode in enumerate(seasonEpisodes):
        print(seasonEpisode.text)
        print('Index : %d' % (index + 1))
    prompt = ('Please select a number from %d to %d for the desired episode: '
              % (minEntryIndex, maxEntryIndex))
    select_number = input(prompt)
    # Re-prompt until the input is a number within range.
    while not select_number.strip().isdigit() or not minEntryIndex <= int(select_number) <= maxEntryIndex:
        select_number = input(prompt)
    episodeURI = seasonEpisodes[int(select_number) - 1].find('a')
    getEpisodeSong(episodeURI['href'])
def fetchGoal(link):
    page = fetchHtmlContent(link)
    soup = BeautifulSoup(page, 'lxml')
    description = ''

    # Main goal
    tags = soup.select('h1.goal-title')
    for tag in tags:
        description += tag.text + '\n\n'

    # Long description
    tags = soup.select('div.single-goal-long-description-text')
    for tag in tags:
        description += tag.text
    description += '\n'

    # Sub-goals
    tags = soup.select('div.container div.row div.col-8 h4')
    for tag in tags:
        description += tag.text + '\n'
    description += '\n'

    # Sub-goal descriptions
    tags = soup.select('div.container div.row div.col-8 div.target-description')
    for tag in tags:
        description += tag.text
    description += '\n'

    # Quick tips
    tags = soup.select('div.single-tip-inner h4')
    for tag in tags:
        description += tag.text + '\n'

    # Additional tips
    tags = soup.select('div.single-additional-tip-content h4')
    for tag in tags:
        description += tag.text + '\n'
    description += '\n'

    # Quick tip descriptions
    tags = soup.select('div.single-tip-inner div.tip-description p')
    for tag in tags:
        description += tag.text + '\n'

    # Additional tip descriptions (only every other <p> holds the text)
    tags = soup.select('div.single-additional-tip-content p')
    odd = False
    for tag in tags:
        if odd:
            description += tag.text + '\n'
        odd = not odd

    return description
def getContent(html):
    try:
        soup = BeautifulSoup(html, "html.parser")
        result = soup.select('div.articulum > p')
        content = result[0].text.encode("utf-8")
        if len(content) != 0:
            # TODO: process the extracted article text
            return content
        # Re-parsing the same markup would recurse forever, so give up here.
        return None
    except Exception:
        print("soup exception")
def crawler():
    counter = 1
    for url_ref in config.FULL_URLS:
        resp = requests.get(url_ref)
        if resp.status_code == 200:
            _, name = get_name(url_ref)
            # Ensure the folder exists
            folder_path = create_folder([config.LYRICS_FOLDER, name])
            # Get all links
            parsed_html = BeautifulSoup(resp.content, features='html.parser')
            lyrics_links = parsed_html.select('.listalbum-item a')
            LOG.info(f"Number of {name.upper()} songs: {len(lyrics_links)}")
            lyric_paths = [extract_link(link) for link in lyrics_links]
            for lyric_path in lyric_paths:
                try:
                    writer, song_name = get_name(lyric_path)
                    if name != writer:
                        alt_folder = create_folder([config.LYRICS_FOLDER, writer])
                        lyrics_file = alt_folder.joinpath(song_name + '.txt')
                    else:
                        writer = name
                        lyrics_file = folder_path.joinpath(song_name + '.txt')
                    file_found = lyrics_file.is_file()
                    if not file_found:
                        # url = config.BASE_URL + lyric_path
                        text = get_lyrics(lyric_path).strip()
                        LOG.info("Downloading (" + str(counter).zfill(3) +
                                 f") [{writer}]: {song_name}")
                        counter += 1
                        with open(lyrics_file, "w") as f:
                            f.write(text)
                        time.sleep(config.CRAWLER_WAIT +
                                   config.CRAWLER_WAIT * random.random())
                except IndexError:
                    LOG.error(
                        f"Access denied while scraping: {lyric_path}\n"
                        f"Try increasing the waiting time.\n"
                        f"Stopping the scraping for now. Try opening the page in your browser to unblock access."
                    )
                    return
                except Exception as err:
                    print(f"ERROR: {lyric_path}: {err}")
        else:
            LOG.warning(f"Unable to load: {url_ref}")
def getUserInfo(shared_url, **headers):
    html_doc = getHtml(shared_url, **headers)
    result = {}
    if html_doc:
        html_doc = html_doc.replace('&#', 'hzsd')
        soup = BeautifulSoup(html_doc, 'html.parser')
        header_url = soup.select("[class~=avatar]")[0]['src']
        nickname = soup.select("[class~=nickname]")[0].string
        uid = soup.select("[class~=shortid]")[0].get_text()
        uid = uid.split(" ")
        id = woff2tff(uid)
        sign = soup.select("[class~=signature]")[0].string
        dataInfo = soup.select("[class~=follow-info]")[0]
        dataInfo = splitByChinese(dataInfo.get_text())
        dataInfo = [d for d in dataInfo if len(d) > 0]
        focus = woff2tff(dataInfo[0].split(' '))
        fans = woff2tff(dataInfo[1].split(' '))
        liked = woff2tff(dataInfo[2].split(' '))
        works = soup.select("[class='user-tab active tab get-list']")[0].get_text()
        works = woff2tff(works.split(' '))
        result['avatar'] = header_url
        result['nickname'] = nickname
        result['id'] = id
        result['sign'] = sign
        result['focus'] = focus
        result['fans'] = fans
        result['liked'] = liked
        result['works'] = works
    return result
def login(browser):
    browser.set_handle_robots(False)
    # Set cookie container
    cj = cookielib.CookieJar()
    browser.set_cookiejar(cj)
    # Allow refresh of the content
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Set the fake user-agent and the rest of the headers to emulate a real browser
    browser.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Encoding', 'gzip,deflate,sdch'),
        ('Accept-Language', 'en-US,en;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ]
    logger = logging.getLogger("mechanize")
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.DEBUG)
    # Open the Coursera sign-in URL; the final URL for the lectures is embedded in the query string.
    response = browser.open('https://accounts.coursera.org/signin?course_id=973439&r=https%3A%2F%2Fclass.coursera.org%2Fprogfun-005%2Flecture&user_action=class&topic_name=Functional%20Programming%20Principles%20in%20Scala')
    soup = BeautifulSoup(response.get_data())
    # Locate the relevant parts of the login form ("my-username" / "my-password" would go into
    # the email and password fields before clicking the sign-in button).
    email_field = soup.select("#signin-email")
    password_field = soup.select("#signin-password")
    signin_button = soup.select(".coursera-signin-button")
def populateGoalNames(dbname, goal_links):
    goals = []
    for ix, goal_link in enumerate(goal_links):
        print(goal_link)
        # print(fetchGoal(goal_link))
        page = fetchHtmlContent(goal_link)
        soup = BeautifulSoup(page, 'lxml')
        tags = soup.select('h1.goal-title')
        goals.append(tags[0].text)
        sql_insert = 'INSERT INTO Goals VALUES ("{}", "{}", "")'.format(ix + 1, tags[0].text)
        print(sql_insert)
        writeToSQliteDB(dbname, sql_insert)
def Categories():
    BASE = 'http://thepiratebay.org/browse/'
    addDir('Search', '', icon, 4, 0)
    result = cache.cacheFunction(cache_cats)
    soup = BeautifulSoup(result[0], convertEntities=BeautifulSoup.HTML_ENTITIES)
    cats = soup.select('optgroup')
    for i in cats:
        main_name = i['label']
        main_id = i.option['value'][0] + '00'
        if sort == "1":
            BASE = BASE.replace('browse', 'top')
        addDir(main_name, BASE + main_id, icon, 1, 0)
        for sub in i('option'):
            sub_name = sub.string
            sub_id = sub['value']
            if sort == "1":
                BASE = BASE.replace('browse', 'top')
            addDir(main_name + ' - ' + sub_name, BASE + sub_id, icon, 1, 0)
def getIngredientNames(index):
    # get = request.GET
    # index = int(get.get('index'))
    #
    # from recipes.views import *
    # getIngredientNames(8279)
    urlBase = 'http://cooking.nytimes.com/recipes/'
    while index < 2000000:
        url = urlBase + str(index)
        print index
        index += 1
        try:
            req = urllib2.Request(url.encode("utf8"), headers={
                'accept': '*/*',
                'User-Agent': "Magic Browser"
            })
            html = urllib2.urlopen(req, timeout=10)
        except:
            continue
        soup = BeautifulSoup(html, "html5lib")
        ingredients = soup.select('.ingredient-name span')
        for i in ingredients:
            i = i.text.lower()
            if 'nutritional information' not in i:
                if ' and ' in i:
                    i = i.split(' and ')
                elif ' or ' in i:
                    i = i.split(' or ')
                elif ', ' in i:
                    i = i.split(', ')
                else:
                    i = [i]
                for part in i:
                    if 'our' in part:
                        Ingredient.objects.get_or_create(name=part)
                    else:
                        if part != singularize(part):
                            print part, singularize(part)
                        Ingredient.objects.get_or_create(name=singularize(part))
    print 'DONE'
def getSeason(content):
    sleep(0.5)
    request_content_page = requests.get(tunefind_search_uri + content['uri'], headers=headers)
    soup = BeautifulSoup(request_content_page.text, 'html.parser')
    allSeasons = soup.select('[class*="MainList__item"]')
    minEntryIndex = 1
    maxEntryIndex = len(allSeasons)
    if minEntryIndex == maxEntryIndex:
        seasonLink = tunefind_search_uri + allSeasons[0].find('a')['href']
        getSeasonPage(seasonLink)
        return
    for index, season in enumerate(allSeasons):
        season_link = season.find('a')
        if season_link is None:
            continue
        print('Title: %s' % season_link.text)
        print('Index: %d' % (index + 1))
    prompt = 'Please select a number from %d to %d: ' % (minEntryIndex, maxEntryIndex)
    select_number = input(prompt)
    # Re-prompt until the input is a number within range.
    while not select_number.strip().isdigit() or not minEntryIndex <= int(select_number) <= maxEntryIndex:
        select_number = input(prompt)
    selectedSeason = allSeasons[int(select_number) - 1]
    seasonLink = str(tunefind_search_uri + selectedSeason.find('a')['href'])
    getSeasonPage(seasonLink)
def get_page_details(self, page_url, thread_id):
    self.check_html_getter()
    html = self.html_getter.get_html(page_url)
    soup = BeautifulSoup(html, "lxml")
    items = soup.select(".gl-item")
    for item in items:
        # select_one() returns a single tag (or None); attributes are read like dict entries.
        item_id = item.select_one(".gl-i-wrap.j-sku-item")["data-sku"]
        item_name = item.select_one(".p-name em").get_text()
        price_tag = item.select_one(".J_price.js_ys")
        if price_tag is None or not price_tag.get_text():
            continue
        item_price = price_tag.get_text()
        item_url = item.select_one(".p-name a")["href"]
        img_url = item.select_one(".p-img img").get("src")
        if img_url is None:
            img_url = item.select_one(".p-img img").get("data-lazy-img")
        yield {
            'item_id': item_id,
            'item_name': item_name,
            'item_price': item_price,
            'item_url': item_url,
            'img_url': img_url,
        }
def getGroups(url, f):
    log.write('-defGroups-\n')
    groups = []
    try:
        html = requests.get(url)
        content = BeautifulSoup(html.text, "lxml")
        log.write('->Got main page content\n')
        # Select all the links that lead to a group page
        for link in content.select('a[href*="index.cfm?CatID="]'):
            # Select the groups
            if link.strong:
                try:
                    group = Group()
                    group.setName(link.text.replace("on Twitter", "").replace("players", "").encode('utf-8'))
                    group.setLink(link['href'])
                    getSubGroups(url, group, f)
                    groups.append(group)
                except Exception, e:
                    log.write('error: ' + str(e) + '\n')
                # Rest for 10 seconds to avoid overloading the server
                time.sleep(10)
    except Exception, e:
        log.write('error: ' + str(e) + '\n')
from soupselect import select
from bs4 import BeautifulSoup
import re
import sys
import urllib2
import tweepy, time
import random

reload(sys)
sys.setdefaultencoding('utf-8')

url = "http://press.unian.ua/announcement/"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), "lxml")

# Read the date, title, link and publication time from the site
title_data = soup.select('div.other_news span.title')
time_data = soup.select('div.other_news span.time')
date_link = soup.select('div.other_news span.date')
link_data = soup.select('div.other_news ul li a')

# full_dict is a dictionary whose keys are links to the articles
full_dict = {"http://press.unian.ua" + a['href']: [] for a in link_data}

# A list of dictionaries with the structure: key - link to the pseudo-sociologist's source,
# value - a list of all possible spellings of the surname and first name
fake_sociologists = [{
    'url': 'http://goo.gl/6i4K4g',
    'keywords': ["Радчук", "апро"]
}, {
    'url': "",
    'keywords':
def scraping_documenti():
    # Mechanize browser
    br = mechanize.Browser()
    # Empty list of documents
    lista_docs = []

    """
    Articles from Statistica, volume 74, issue 1
    - http://rivista-statistica.unibo.it/issue/view/467
    """
    # Find the article URLs
    resp = br.open("http://rivista-statistica.unibo.it/issue/view/467")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('div', attrs={'class': 'tocTitle'})  # list of title divs
    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)

    """
    Articles from volume 20, issue 11/12
    - http://www.dlib.org/dlib/november14/11contents.html
    """
    url_base = "http://www.dlib.org/dlib/november14/11contents.html"
    resp = br.open("http://www.dlib.org/dlib/november14/11contents.html")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('p', attrs={'class': 'contents'})  # list of contents paragraphs
    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            url = urlparse.urljoin(url_base, url)
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)

    """
    All articles from a chosen D-Lib issue
    - http://www.dlib.org/dlib/july15/07contents.html
    """
    url_base = "http://www.dlib.org/dlib/july15/07contents.html"
    resp = br.open(url_base)
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('p', attrs={'class': 'contents'})  # list of contents paragraphs
    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            url = urlparse.urljoin(url_base, url)
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)

    """
    All articles from this issue
    http://almatourism.unibo.it/issue/view/512
    """
    url_base = "http://almatourism.unibo.it/issue/view/512"
    resp = br.open("http://almatourism.unibo.it/issue/view/512")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    results = soup.select("div.tocTitle a")
    for res in results:
        url = res["href"]
        url = urlparse.urljoin(url_base, url)
        data = {}
        data["url"] = url
        data["title"] = res.text
        lista_docs.append(data)

    """
    All articles from this issue
    http://antropologiaeteatro.unibo.it/issue/view/513
    """
    url_base = "http://antropologiaeteatro.unibo.it/issue/view/513"
    resp = br.open(url_base)
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('div', attrs={'class': 'tocTitle'})  # list of title divs
    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)

    # print json.dumps(lista_docs)
    return lista_docs
def get_page_nums(self, main_url):
    html = get_html(main_url)
    soup = BeautifulSoup(html, "lxml")
    # select() returns a list, so take the first matching element
    page = soup.select(".p-skip em b")[0]
    print "page number:" + page.get_text()
    return int(page.get_text())
def get_counties():
    r = requests.get('http://elections.sos.state.tx.us/elchist175_countyselect.htm')
    soup = BeautifulSoup(r.text)
    return [{'id': o['value'], 'name': o.text} for o in soup.select('option')]
while not search_param.strip():
    search_param = input('Please enter the name of the TV show, movie or artist: ')
search_param = urllib.parse.quote(search_param).lower()
search_results = requests.get('%s/search/site?q=%s' % (tunefind_search_uri, search_param),
                              headers=headers)
if 'no results found' in search_results.text.lower():
    print('No results found for your query')
    sys.exit(1)
soup = BeautifulSoup(search_results.text, 'html.parser')
results_tables = soup.select('.pageSearchWrapper + .container')
if not results_tables:
    print('Cannot find any search results for some reason.')
    sys.exit(1)
results_table = results_tables[0]
# results_columns = results_table.find_all(class_='col-md-')
results_columns = results_table.select('[class*="col-md-"]')
if not results_columns:
    print('Cannot find any search results for some reason.')
    sys.exit(1)
for results_column in results_columns:
    if not results_column.find('a'):
        continue
    results_items = results_column.find_all('li')
def getQuestions(self, page_num):
    page = self.getPageByNum(page_num)
    soup = BeautifulSoup(page)
    questions = soup.select("div.question_list ul li")
    for question in questions:
        info = self.getQuestionInfo(question)
        if info:
            # Build the question's URL
            url = "http://iask.sina.com.cn/" + info[1]
            ans = self.page_spider.getAnswer(url)
            print self.getCurrentTime(), "crawling page", page_num, "- found question", info[2], "with", info[3], "answers"
            # Build the question dict and insert the question
            ques_dict = {
                "text": info[2],
                "questioner": info[0],
                "date": info[4],
                "ans_num": info[3],
                "url": url
            }
            insert_id = self.mysql.insertData('iask_question', ques_dict)
            good_ans = ans[0]
            print self.getCurrentTime(), "saved to database, question ID is", insert_id
            # If there is a best answer, insert it
            if good_ans:
                print self.getCurrentTime(), "question", insert_id, "has a best answer", good_ans[0]
                # Build the best-answer dict
                good_ans_dict = {
                    "text": good_ans[0],
                    "answerer": good_ans[1],
                    "date": good_ans[2],
                    "is_good": str(good_ans[3]),
                    "question_id": str(insert_id)
                }
                # Insert the best answer
                if self.mysql.insertData("iask_answers", good_ans_dict):
                    print self.getCurrentTime(), "best answer saved"
                else:
                    print self.getCurrentTime(), "failed to save best answer"
            # Get the other answers
            other_anses = ans[1]
            # Iterate over every other answer
            for other_ans in other_anses:
                # If the answer exists
                if other_ans:
                    print self.getCurrentTime(), "question", insert_id, "has another answer", other_ans[0]
                    # Build the dict for this answer
                    other_ans_dict = {
                        "text": other_ans[0],
                        "answerer": other_ans[1],
                        "date": other_ans[2],
                        "is_good": str(other_ans[3]),
                        "question_id": str(insert_id)
                    }
                    # Insert this answer
                    if self.mysql.insertData("iask_answers", other_ans_dict):
                        print self.getCurrentTime(), "other answer saved"
                    else:
                        print self.getCurrentTime(), "failed to save other answer"
def _downloadFiles(self):
    message("Creating URL", INFORMATION)
    parser = self._getSelectedParser()
    url = parser.getDownloadUrl(self._selected, self._domain)

    message("Downloading page containing the download link", INFORMATION)
    response = urllib2.urlopen(url)
    html = response.read()

    # Remove CDATA
    e = etree.XML(html)
    html = etree.tostring(e)

    message("Parsing page", INFORMATION)
    soup = Soup(html)
    fileUrl = select(soup, 'fileurl')
    if len(fileUrl) == 0:
        fileUrl = soup.select('fileUrl')
    edv = select(soup, 'edv')
    if len(fileUrl) == 0:
        message("The page does not have a download link.", ERROR)
        return False
    if len(edv) == 0:
        message("The page does not have an 'edv' key", ERROR)
        return False

    # name = self.fileName[self.selected-1]
    name = parser.getFileName(self._selected)
    fileKeyName = "%s.key" % name
    edv = edv[0]
    if html.find("<edv/>") != -1:
        message("The .key file is not necessary.", INFORMATION)
    else:
        message("Creating .key file", INFORMATION)
        f = open(str(fileKeyName), 'w')
        f.write(str(edv.string))
        f.close()
        message("Created", DONE)

    fileUrl = fileUrl[0]
    downloadUrl = fileUrl.string
    # The URL may contain HTML entities
    if downloadUrl.find("&") >= 0:
        downloadUrl = HTMLParser.HTMLParser().unescape(downloadUrl)

    listUrl = downloadUrl.split('.')
    items = len(listUrl)
    iterationPosition = items - 2
    if items < 5 and len(listUrl[iterationPosition]) == 0:
        message("URL is wrong. Actual URL is: %s" % fileUrl, ERROR)
        return False

    i = 1
    while True:
        part = str(i)
        if i < 10:
            part = "0%i" % i
        listUrl[iterationPosition] = part
        fileName = "%s.part%s" % (name, part)
        url = ".".join(listUrl)
        if self._downloadFile(url, fileName) == False:
            break
        i += 1

    if i == 1:
        self._glueNeeded = False
        if self._downloadFile(downloadUrl, name) == False:
            message("Something is wrong, probably the URL", ERROR)
            return False

    message("Downloaded.", INFORMATION)
    return True
# Read the html page from the url
try:
    htmlpage = requests.get(url).content
except Exception, e:
    log("Error while opening:\t" + url)
    log(str(e))
    return
try:
    soup = BeautifulSoup(htmlpage)
except Exception, e:
    log("Error while parsing:\t" + url)
    log(str(e))
    return

# Extract image links from the html
urls = soup.select('.album-view-image-link a')
processed = []
for url in urls:
    url = url['href']
    # Normalize the image URL before adding it to the results
    url = format_url(url)
    processed.append(url)
return processed


def format_url(url):
    """
    Formats url strings by prepending "http:" if needed and removing ?s
    :param url: "//imgur.com/XYZ123?1"
def get_countylist(election_code):
    r = requests.get('http://elections.sos.state.tx.us/elchist%s_countyselect.htm' % election_code)
    soup = BeautifulSoup(r.text)
    return [{'id': o['value'], 'name': o.text} for o in soup.select('option')]
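A minimal, hypothetical usage sketch chaining the two Texas SOS helpers defined above (get_elections and get_countylist); it assumes both functions and their imports (requests, BeautifulSoup) are in scope and that the pages still render plain option lists.

# Hypothetical driver, not part of the original snippets: list each election and
# how many counties report results for it, using the helpers defined above.
if __name__ == '__main__':
    for election in get_elections():
        counties = get_countylist(election['election_code'])
        print('%s: %d counties' % (election['title'], len(counties)))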
        print(crime_id.attrs['href'])
        crime_id_unique = crime_id.attrs['href']
        crime_id_url = crime_base_url + crime_id_unique
        crime_links.append(crime_id_url)
    except:
        print("Official report hasn't been filed for {}".format(crime_id))

print(crime_links)

if len(crime_links) > 0:
    with open('crimes.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['incidentId', 'reported', 'occured', 'building', 'location',
                         'locationCode', 'incidentCode', 'person_status', 'person_name',
                         'person_RG', 'person_affiliation', 'person_crime', 'person_age',
                         'synopsis'])
        for crime in crime_links:
            html = urllib.request.urlopen(crime)
            soup = BeautifulSoup(html, 'lxml')
            incident_id = soup.select('#ctl00_ContentPlaceHolder1__IncidentNumber')[0].text
            print(incident_id)
            reported = soup.select('#ctl00_ContentPlaceHolder1_DateReported')[0].text
            print(reported)
            occured = soup.select('#ctl00_ContentPlaceHolder1_OccurredDate')[0].text
            building = soup.select('#ctl00_ContentPlaceHolder1_Building')[0].text
            location = soup.select('#ctl00_ContentPlaceHolder1_Location')[0].text
            location_code = soup.select('#ctl00_ContentPlaceHolder1_LocationCode')[0].text
            incident_code = soup.select('#ctl00_ContentPlaceHolder1_IncidentCode')[0].text
def get_lyrics(url):
    resp = requests.get(url)
    parsed_html = BeautifulSoup(resp.content, features="html.parser")
    text = parsed_html.select('.col-xs-12.col-lg-8.text-center')[0].text
    return text
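A short, hypothetical call to get_lyrics() from the snippet above; the URL is a placeholder (not taken from the original code) and the target page is assumed to keep its lyrics inside the same '.col-xs-12.col-lg-8.text-center' container.

# Hypothetical usage of get_lyrics(); the URL below is a placeholder.
song_url = 'https://example.com/lyrics/some-song.html'  # placeholder, assumed page layout
print(get_lyrics(song_url).strip())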
import requests
import pymysql
from bs4 import BeautifulSoup

a = []
conn = pymysql.connect(host='52.78.104.59', user='******', password='******',
                       db='', charset='utf8')
curs = conn.cursor()

req = requests.get("https://www.naver.com/")  # connection
html = req.text  # fetch the page source
soup = BeautifulSoup(html, 'html.parser')
sillsigan = soup.select('div.ah_roll.PM_CL_realtimeKeyword_rolling_base > div > ul > li')

b = []
for sill in sillsigan:
    b.append(sill.text)  # append the text inside each tag to the list b

k = 1
list_sillsigan = []
for i in b:
    # Keep only the keyword part of each string and add it to list_sillsigan
    if k > 9:
        list_sillsigan.append(i[5:-2])
    else:
        list_sillsigan.append(i[4:-2])
    k += 1

for s, keyword in enumerate(list_sillsigan):
    a.append(keyword)
    curs.execute(
from urlparse import urljoin
from bs4 import BeautifulSoup
import requests

BASE_URL = "http://genius.com"
artist_url = "http://genius.com/artists/Andre-3000/"

# response = requests.get(artist_url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'})
response = requests.get(artist_url)
# response.encoding = "utf-8"

text = response.text
soup = BeautifulSoup(text)
# print soup
print soup.select("song_title")

# for song_link in soup.select("ul.song_list > li > a"):
#     link = urljoin(BASE_URL, song_link['href'])
#     response = requests.get(link)
#     text = response.text
#     soup = BeautifulSoup(text)
#     lyrics = soup.find('div', class_='lyrics').text.strip()
#     tokenize `lyrics` with nltk
pages = []
books = []
genres = []
ebook_genres = []
MAX_PAGES = 37

for npage in range(1, MAX_PAGES + 1):
    print("Scanning page {}".format(npage))
    req = requests.get(BASE_URL + "ebooks?page=" + str(npage))
    dir_html = req.text
    parsed_dir_html = BeautifulSoup(dir_html)
    for link in parsed_dir_html.select("main > ol li > a[href]"):
        pages.append(link['href'])

total = len(pages)
current = 1
current_genre = 1
for page in pages:
    try:
        page_html = requests.get(BASE_URL + page)
        page_html = BeautifulSoup(page_html.text)
        title = page_html.select_one(".ebook hgroup > h1").text
        author = page_html.select_one(".ebook hgroup > h2").text
        description = page_html.select_one("#description p").text
        dl_link = page_html.select_one('#download .epub')['href']
        tags = [
            tag.text for tag in page_html.select("#reading-ease .tags li a")