def scrape_pacific_plants_html(link):
    if re.search('http', link) is None:
        link = 'http://www.hear.org/pier/' + link
    r = requests.get(link)
    html = BeautifulSoup(r.text, 'lxml')
    # check Australian table format
    question_index = 0
    answer_index = -1
    aus = False
    try:
        table = html.find_all('table')[1]
    except:
        table = html.find_all('table')[0]
        aus = True
        question_index = 2
    rows = table.find_all('tr')
    header = rows[0]
    rows = rows[1:-1]
    # check for score column
    if parse_html(str(header.find_all('td')[-1])) == 'Score':
        answer_index -= 1
    feature_dict = dict()
    for row in rows:
        data = row.find_all('td')
        # end of Australian table
        if aus and parse_html(str(data[3])) == 'Outcome:':
            break
        # check if feature row
        try:
            feature_id = int(round(float(parse_html(str(data[question_index]))) * 100))
        except:
            continue
        answer = parse_html(str(data[answer_index])).lower()
        if feature_id in EDGE_CASES:
            try:
                feature_dict[feature_id] = int(answer)
            except:
                feature_dict[feature_id] = 'NA'
        else:
            if answer in VAL_LOOKUP:
                feature_dict[feature_id] = VAL_LOOKUP[answer]
            else:
                feature_dict[feature_id] = 'NA'
    return feature_dict

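# Hedged usage sketch: the relative path below is hypothetical, and parse_html,
# EDGE_CASES and VAL_LOOKUP are assumed to be defined elsewhere in this module.
# features = scrape_pacific_plants_html('wra/pacific/some_species_wra.htm')
# features.get(201)  # -> a coded answer from VAL_LOOKUP, an int, or 'NA'
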
def searchGenders(browser: mc.Browser, gender: str, page=1) -> None:
    url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(page)
    res = browser.open(url)
    html = res.read()
    html = bs4(html, "html.parser")
    # find the pagination block and take the highest numbered page
    pageCount = html.select("div[max_num_pages]")
    pageCount = pageCount[0].select(":not(:last-child)")
    maxPage = int(pageCount[len(pageCount) - 1].get_text())
    with open("links.html", "a") as linkdata:
        for i in range(1, maxPage + 1):
            print("===> getting page {}\n".format(i))
            url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(i)
            res = browser.open(url)
            html = res.read()
            html = bs4(html, "html.parser")
            booklinks = html.find_all('a', {'class': 'bookTitle'})
            for link in booklinks:
                linkdata.write("<a href='" + str(url) + str(link['href']) + "' ></a>\n")
    print("Ready!")

def parse_me(rsp):
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('li', attrs={'class': 'result-row'})
    results = []
    apt_titles = [apt.find('a', attrs={'class': 'hdrlnk'}).text for apt in apts]
    #apt_sizes = [re.sub(' +', ' ', apt.findAll(attrs={'class': 'housing'})[0].text.replace("\n", "").strip()) for apt in apts]
    apt_prices = find_prices(apts)
    apt_times = [apt.find('time')['datetime'] for apt in apts]
    apt_links = [apt.find('a', attrs={'class': 'hdrlnk'})['href'] for apt in apts]
    apt_loc = [apt.find('a', attrs={'class': 'hdrlnk'}).text for apt in apts]
    apt_img = [apt.find('img') for apt in apts]

    apt_titles = sanitize(apt_titles)
    #apt_sizes = sanitize(apt_sizes)
    apt_times = sanitize(apt_times)
    apt_loc = sanitize(apt_loc)

    # create a dataframe to hold all info
    data = np.array([apt_titles, apt_img, apt_prices, apt_times, apt_links, apt_loc])
    col_names = ["Title", "Image", "Price", "Posted On", "link", "Location"]
    pd.set_option('max_colwidth', 2000)
    dataframe = pd.DataFrame(data.T, columns=col_names)
    dataframe.set_index('Title')
    results.append(dataframe)
    results = pd.concat(results, axis=0)
    return results

def listen():
    link_list = []
    link_list_send = []
    send_list = []
    url_cl_base = "http://sfbay.craigslist.org"
    while True:
        rsp = requests.get(url_base, params=params)
        html = bs4(rsp.text, 'html.parser')
        result = parse_me(rsp)
        result.head()
        result.to_html("index.html")
        stylesheet = "style.css"
        js = 'cl.js'
        with open("./index.html", "a") as file:
            file.write("<link rel='stylesheet' type='text/css' href='" + stylesheet + "' >")
            file.write("<script src='https://code.jquery.com/jquery-3.3.1.min.js' integrity='sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=' crossorigin='anonymous'></script>")
            file.write("<script src='" + js + "'></script>")
        apts = html.find_all('li', attrs={'class': 'result-row'})
        for apt in apts:
            title = apt.find_all('a', attrs={'class': 'hdrlnk'})[0]
            name = ''.join([i for i in title.text])
            link = title.attrs['href']
            cur_price = apt.find('span', {'class': 'result-price'}).text
            cur_price = sanitize(cur_price)
            cur_price = ''.join(cur_price)
            loc = apt.find('span', attrs={'class': 'result-hood'})
            if loc is None:
                loc = ""
            else:
                loc = loc.text
            if link not in link_list and link not in link_list_send:
                #print "found new listing"
                with open('./home/critter/out.log', 'w') as f:
                    print('found new listing', url_cl_base, link, file=f)
                link_list_send.append(link)
                send_list.append(name + '\n -$$ ' + cur_price + ' $$ - \nLOCATION:: ' + loc + "\n" + url_cl_base + link)
        if len(link_list_send) > 0:
            #g = creds.Creds()
            #print("sending mail")
            f = open('./home/critter/out.log', 'w')
            #print >> f, 'sending mail to:: ' + my_email
            f.close()
            msg = "\n\n".join(send_list)
            # m = email.message.Message()
            # m.set_payload(msg)
            #g.gmailSend(m, my_email)
            link_list += link_list_send
            link_list_send = []
            send_list = []
        sleep_time = np.random.randint(150, 155)
        time.sleep(sleep_time)

def search(bookname):
    # search for audiobooks
    # param page: int, e.g. 1
    # param searchword: str; 56听书 requires the search term to be submitted GBK-encoded, e.g. %D5%C2%D3%E3
    path = '/search.asp?page=1&searchword=' + urllib.request.quote(bookname.encode('gb2312'))
    html = getHtmlContent(path)
    searchList = html.find_all(class_='list-ov-tw')
    pageCount = html.find(class_='cate-pages').find_all('li')[1].text.split('/')
    nowPageCount = pageCount[0]   # current page
    allPageCount = pageCount[1]   # total number of pages
    bookList = []                 # search results
    for searchItem in searchList:
        bookUrl = searchItem.find(class_='bt').find('a').attrs['href']
        bookImg = searchItem.find('img').attrs['original']
        bookName = searchItem.find(class_='bt').text
        bookAuthor = searchItem.find_all(class_='zz')[0].text + ' ' + searchItem.find_all(class_='zz')[1].text
        bookContent = searchItem.find(class_='nr').text
        book = Book(bookUrl, bookImg, bookName, bookAuthor, bookContent, nowPageCount, allPageCount)
        bookList.append(book)
    return bookList

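# Hedged, self-contained sketch of the GBK/GB2312 quoting described in the comments
# above (the query string here is just a placeholder): quote() percent-encodes the
# raw GB2312 bytes, which is the form the site expects in `searchword`.
# import urllib.request
# searchword = urllib.request.quote('示例'.encode('gb2312'))
# path = '/search.asp?page=1&searchword=' + searchword
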
def get_flights(html):
    flights = html.find_all('li', 'flight')
    result = []
    for f in flights:
        info = str(f.find('div', 'bottom'))
        info = info.replace('<div class="bottom">', '')
        info = info.replace("</div>", '')
        result.append(info)
    return result

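# Hedged note: the replace() calls above strip the wrapping <div> with string surgery;
# bs4's Tag.decode_contents() returns the inner markup directly, so a sketch of the
# same intent would be:
# info = f.find('div', 'bottom').decode_contents()
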
def grab_all_reviews(url, card_id):
    resp = requests.get(url, PARAMETERS)
    code = BeautifulSoup(resp.text.encode('utf-8', 'ignore'), 'html.parser')
    pagination_element = code.find('td', {'class': 'pagination-right'})
    last_page = None
    if pagination_element is None:
        return
    last_page = int(code.find('td', {'class': 'pagination-right'}).find_all('b')[1].text)
    current_page = 0
    while current_page < last_page:
        current_page = current_page + 1
        PARAMETERS['pg'] = current_page
        response = requests.get(url, PARAMETERS)
        html = BeautifulSoup(response.text, 'html5lib')
        reviews = html.find_all('div', {'class': 'review'})
        titles = html.find_all('h5', {'class': 'review-title'})
        review_count = 0
        for review in reviews:
            review_count = review_count + 1
            rating = review.find('span').get('ck-stars').split()[0]
            review_str = str(review)
            start = review_str.find('review-title')
            end = review_str.find('</h5')
            title = ''
            if start != -1 and end != -1:
                title_list = review_str[start:end].split('\n')
                title = title_list[1].strip()
            text = review.find('div', {'class': 'readmoreInner'})
            paragraphs = text.find_all('p')
            review_text = '\n\n'.join([paragraph.text for paragraph in paragraphs])
            print(rating, '\n', title, '\n', review_text, '\n')
            REVIEWS.append(('', title, review_text, rating, card_id))
        print(len(REVIEWS))
        sleep(3)

def trueAltalsUrl(altalsId):
    url = webUrl + '/a/?id=' + str(altalsId)
    try:
        r = http.request('Get', url, headers=headers)
    except IOError:
        print('', end="\r")
    try:
        soup = BeautifulSoup(r.data.decode("utf8"), "lxml")
    except:
        return
    else:
        html = soup.find('div', class_='tuji')
    if not html:
        return
    altalsName = html.find('h1').text.replace('\\', ' ').replace('\\', ' ').\
        replace('/', ' ').replace(':', ' ').replace('*', ' ').replace('?', ' ').\
        replace('<', ' ').replace('>', ' ').replace('|', ' ').replace('"', ' ')
    p = html.find_all('p')
    orgid = str(p[0].a['href']).replace('/x/?id=', '')
    altalsCount = str(p[2].text.replace('图片数量:', '').replace('P', ''))
    models = []
    classs = []
    grilIds = ''
    classIds = ''
    for m in p[1].find_all('a'):
        h = m['href'].replace('/t/?id=', '')
        models.append(h)
    for c in p[3].find_all('a'):
        h = c['href'].replace('/s/?id=', '')
        classs.append(h)
    for m in models:
        grilIds += m + ','
    for c in classs:
        classIds += c + ','
    grilIds = grilIds.rstrip(',')
    classIds = classIds.rstrip(',')
    if not grilIds:
        grilIds = '-1'
    try:
        saveAltals(altalsId, orgid, classIds, grilIds, altalsName, altalsCount)
        try:
            imgHtml = soup.find('div', id='kbox').find_all('img')
            imgs = []
            for k in imgHtml:
                imgUrl = k['data-src']
                imgs.append(imgUrl)
            return imgs
        except IOError:
            return False
    except IOError:
        return False

def get_bussinessanalysis_EM(code):
    """
    Core themes (核心题材) for a stock from Eastmoney.
    --------------------------
    code: the stock code, a 6-digit string
    """
    if code[0] in ['6', '9']:
        code = 'sh' + code
    if code[0] in ['0', '2', '3']:
        code = 'sz' + code
    url = 'http://f10.eastmoney.com/f10_v2/BusinessAnalysis.aspx?code={0}'.format(code)
    #url = 'http://emweb.securities.eastmoney.com/CoreConception/Index?type=web&code={0}#'.format(code)
    r = requests.get(url, headers=hds())
    text = r.text
    html = BeautifulSoup(r.text, 'lxml')
    try:
        mainbs = html.find('div', attrs={"class": "section first"})
        mainbs = mainbs.text
    except:
        mainbs = None
    try:
        article = html.find('div', attrs={"class": "article"})
        article = article.text
    except:
        article = None
    try:
        ts = html.find_all('table')
        df = pd.DataFrame()
        for t in ts:
            tbl = str(t)
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="8">按产品分类</td>', '')
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="3">按地区分类</td>', '')
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="5">按行业分类</td>', '')
            dd = pd.read_html(tbl)[0]
            df = df.append(dd)
    except:
        df = None
    return mainbs, df, article

def get_sectors():
    # set up browser
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(options=chrome_options)
    # browser = webdriver.Chrome()

    # open web page
    browser.implicitly_wait(10)  # wait for web page to load
    url = 'https://www.thestar.com.my/business/marketwatch/'
    browser.get(url)
    r = browser.page_source
    html = BeautifulSoup(r, 'html.parser')
    # print(html)
    browser.close()

    # sector elements
    htmlPart = html.find(class_=re.compile("stocks"))
    linkPart = [
        x.get_attribute_list('id')
        for x in htmlPart.find_all('a', {"id": True})
    ]
    for i in range(len(linkPart)):
        sector_elements.extend(linkPart[i])
    # print(linkPart)
    # print(sector_elements)
    # print(len(sector_elements))

    # sector_list
    sector = html.find_all('strong')
    for i in sector:
        sector_list.append(i.text.strip(':'))
        # print(i.text)
    # print(sector_list)

    # sector_name_list
    sector_n = html.select('div.text a')
    for i in sector_n:
        sector_name_list.append(i.text)
    # print(sector_name_list)
    return

def extract_next_links(rawDatas):
    outputLinks = list()
    '''
    rawDatas is a list of objs -> [raw_content_obj1, raw_content_obj2, ....]

    Each obj is of type UrlResponse declared at L28-42 datamodel/search/datamodel.py

    the return of this function should be a list of urls in their absolute form
    Validation of link via is_valid function is done later (see line 42).
    It is not required to remove duplicates that have already been downloaded.
    The frontier takes care of that.

    Suggested library: lxml
    '''
    #global subdomain_track
    #global outlink_track
    for resp in rawDatas:
        url = urlparse(resp.url)
        '''
        if url.netloc not in subdomain_track:
            subdomain_track[url.netloc] = 1
        else:
            subdomain_track[url.netloc] += 1
        if not url.query:
            resp.bad_url = true
        '''
        html = BeautifulSoup(resp.content, "lxml")
        html_links = html.find_all('a')
        for link in html_links:
            if link.get('href') is None:
                continue
            outputLinks.append(link.get('href'))
        '''
        if len(html_links) > len(outlink_track):
            outlink_track = (resp.url, len(html_links))
        '''
    return outputLinks

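# Hedged sketch: the docstring above asks for links "in their absolute form", while the
# loop appends the raw href values. One way to resolve them against the page they came
# from (assuming urllib.parse is available; the helper name is illustrative):
from urllib.parse import urljoin

def _to_absolute(base_url, hrefs):
    # resolve each possibly-relative href against the URL of the page it was found on
    return [urljoin(base_url, h) for h in hrefs if h is not None]

# _to_absolute('http://example.com/a/', ['/b.html', 'c.html'])
# -> ['http://example.com/b.html', 'http://example.com/a/c.html']
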
def get_price(self, html, html_class=''):
    # collect the text of every <span> carrying the given CSS class
    span_tags = html.find_all('span', attrs={'class': html_class})
    normal_price = []
    for span_tag in span_tags:
        normal_price.append(str(span_tag.contents[0]).strip())
    return normal_price

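# Hedged usage sketch (the markup and the `scraper` instance are illustrative only):
# soup = BeautifulSoup('<span class="price">$10</span>', 'html.parser')
# scraper.get_price(soup, html_class='price')  # -> ['$10']
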
def getGame():
    time.sleep(7)

    # Re-get the source so that we can look for any "Continue" buttons
    html = BeautifulSoup(browser.page_source, 'lxml')
    try:
        spans = html.find_all('span')
        # Check for a "Continue" button for the 18+ games. If we find it, click it
        for span in spans:
            if span.get_text().upper() == 'CONTINUE':
                # Get the xpath
                xpath = xpath_soup(span)
                # Use the xpath to grab the browser element (so we can click it)
                geez_dad_im_not_a_kid_anymore = browser.find_element_by_xpath(xpath)
                geez_dad_im_not_a_kid_anymore.click()
                break
        time.sleep(7)
    except:
        print()

    # Get the source again, just in case we came from clicking the potential "Continue" button
    html = BeautifulSoup(browser.page_source, 'lxml')

    # Get all the button tags so we can see whether we need to grab the game or leave
    buttons = html.find_all('button')
    for button in buttons:
        if button.get_text().upper() == 'OWNED':
            break
        if button.get_text().upper() == 'GET':
            # Get the xpath of this button
            xpath = xpath_soup(button)
            # Use the xpath to grab the browser element (so we can click it)
            browser_element = browser.find_element_by_xpath(xpath)
            browser_element.click()
            time.sleep(4)

            # Re-get the source (again)
            html = BeautifulSoup(browser.page_source, 'lxml')

            # Get all the spans so we can get the "Place Order" button
            spans = html.find_all('span')
            for span in spans:
                if span.get_text().upper() == 'PLACE ORDER':
                    # Get the xpath
                    xpath = xpath_soup(span)
                    # Use the xpath to grab the browser element (so we can click it)
                    purchase_button_element = browser.find_element_by_xpath(xpath)
                    # Create object and add it to the list
                    purchase_button_element.click()
                    break

            # If the EULA prompt shows up, click it
            try:
                browser.find_element_by_xpath('''//span[contains(text(),'I Agree')]''')
            except:
                print()
            else:
                EU_Refund_and_Right_of_Withdrawal_Information = browser.find_element_by_xpath('''//span[contains(text(),'I Agree')]''')
                EU_Refund_and_Right_of_Withdrawal_Information.click()
            time.sleep(2)

            # We only want the first one (usually the game), so leave
            break

html = BeautifulSoup(browser.page_source, 'lxml')

# Check for, and close, the cookies banner
try:
    browser.find_element_by_xpath('''/html/body/div/div/div[4]/header/div/button/span''')
except:
    print()
else:
    cookies = browser.find_element_by_xpath('''/html/body/div/div/div[4]/header/div/button/span''')
    cookies.click()
    time.sleep(2)

# Get all the span tags to make sure we get every available game
spans = html.find_all('span')

# Create a list for all the game dictionaries
games = []
for span in spans:
    if span.get_text().upper() == 'FREE NOW':
        # Get the xpath
        xpath = xpath_soup(span)
        # Use the xpath to grab the browser element (so we can click it)
        browser_element = browser.find_element_by_xpath(xpath)
        # Create object and add it to the list
        games.append({'xpath': xpath, 'element': browser_element})

def poi_page_link(source):
    html = BeautifulSoup(source, 'lxml')
    table = html.find_all(class_='table table-bordered table-hover')
    links = table[0].find_all('a')
    links = [x['href'] for x in links]
    return links

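# Hedged usage sketch (the URL is a placeholder): fetch a listing page and pull the
# hrefs out of its first bordered table.
# source = requests.get('https://example.com/poi-list').text
# links = poi_page_link(source)
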
def get_americanlife_info(epno, throwException=True, extraStuff=None, verify=True):
    """
    Returns a tuple of title, year given the episode number for This American Life.
    """
    # first see if this episode of this american life exists...
    if extraStuff is None:
        resp = requests.get(
            'http://www.thisamericanlife.org/radio-archives/episode/%d' % epno,
            verify=verify)
    else:
        resp = requests.get(
            'http://www.thisamericanlife.org/radio-archives/episode/%d/%s' % (epno, extraStuff),
            verify=verify)
    if resp.status_code != 200:
        raise ValueError(
            "Error, could not find This American Life episode %d, because could not open webpage." % epno)
    enc = resp.headers['content-type'].split(';')[-1].split('=')[-1].strip().upper()
    if enc not in ('UTF-8', ):
        # decode explicitly from the declared charset (Python 3 form of the original unicode() call)
        html = BeautifulSoup(resp.content.decode(enc), 'lxml')
    else:
        html = BeautifulSoup(resp.text, 'lxml')
    elem_info_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'top-inner' in elem.attrs['class']
            and 'clearfix' in elem.attrs['class'],
            html.find_all('div')))
    if len(elem_info_list) != 1:
        if throwException:
            raise ValueError(" ".join([
                "Error, cannot find date and title for This American Life episode #%d," % epno,
                "because could not get proper elem from HTML source."
            ]))
        else:
            return None
    elem_info = max(elem_info_list)
    date_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'date' in elem.attrs['class'],
            elem_info.find_all('div')))
    if len(date_list) != 1:
        if throwException:
            raise ValueError(
                "Error, cannot find date and title for This American Life episode #%d." % epno)
        else:
            return None
    date_s = max(date_list).text.strip()
    date_act = datetime.datetime.strptime(date_s, '%b %d, %Y').date()
    year = date_act.year
    title_elem_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'node-title' in elem.attrs['class'],
            elem_info.find_all('h1')))
    if len(title_elem_list) != 1:
        raise ValueError(
            "Error, cannot find date and title for This American Life episode #%d." % epno)
    title = max(title_elem_list).text.strip()
    title = titlecase.titlecase(':'.join(title.split(':')[1:]).strip())
    return title, year

def parse_object(object):
    repeat = True
    for city_data in cities:
        if (repeat == False):
            break
        url = object
        city = city_data[0]
        city_id = city_data[1]
        city_name = city_data[2]
        alternative_name = color = price_old = name = ram = rom = material = price_current = name_shop = address_shop = None
        url = url + "/shopdirections?cityId=" + city
        r = request(url)
        html = get_html(r)
        log.info("Parsing %s", url)
        try:
            count_shops = int(
                html.xpath('.//li[@class="c-tabs__menu-item active"]/a/span/text()')[0])
            print(count_shops)
        except:
            count_shops = None
        if ((count_shops != None) and (count_shops > 0)):
            span_showcase = html.xpath('.//span[@class="fl-embedded-wrapper"]')
            if (span_showcase != []):
                print("Here")
                continue
            div_old = html.xpath('.//div[@class="c-pdp-price__old"]')
            if (div_old != []):
                price_old = div_old[0].text
                price_old = ''.join(filter(str.isdecimal, price_old))
                price_old = float(price_old)
                print("Old price:", price_old)
            price_current = html.xpath(
                './/div[@class="c-pdp-price__current sel-product-tile-price"]/text()')[0]
            price_current = ''.join(filter(str.isdecimal, price_current))
            price_current = float(price_current)
            print("Current price:", price_current)
            code = html.xpath('.//p[@class="c-product-code"]')[0].text
            print("Product code:", code)
            brand = html.xpath("//ul[@class='c-breadcrumbs__list']")[0].xpath("li")[-1].xpath("a/span/text()")[0]
            print("Brand", brand)

            # Parse the item's specifications
            url_new = url + "/specification?ssb_block=descriptionTabContentBlock&cityId=" + city
            r = request(url_new)
            html = html_bs4(r)
            div_characteristic = html.find_all('div', {
                "class": "product-details-tables-holder sel-characteristics-table"
            })[0]
            characteristic_h3 = div_characteristic.find_all("h3")
            characteristic_table = div_characteristic.find_all("table")
            count_parameters = 5
            count_current = 0
            color_repeat = False
            for i, val in enumerate(characteristic_h3):
                if (count_current < count_parameters):
                    # if (count_current < count_parameters):
                    #     if (val.text == "Корпус"):
                    #         tr = characteristic_table[i].tbody.find_all("td")
                    #         for i, value in enumerate(tr):
                    #             if (value.span.get_text().strip() == "Материал корпуса"):
                    #                 material = tr[i + 1].span.get_text().strip()
                    #                 if (material.find("/") != -1):
                    #                     material = material.replace("/", ",")
                    #                 material = material.title()
                    #                 count_current += 1
                    #                 print(material)
                    if (count_current < count_parameters):
                        if (val.text == "Серия модели"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Серия"):
                                    name = brand + " " + tr[i + 1].span.get_text().strip()
                                    count_current += 1
                    if (count_current < count_parameters):
                        if (val.text == "Цвет, размеры и вес"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Цвет"):
                                    color = tr[i + 1].span.get_text().strip()
                                    if (color.find("/") != -1):
                                        color_repeat = True
                                    else:
                                        color = color.capitalize()
                                        if (color.find("Золотистый") != -1):
                                            color = "Золотой"
                                        if (color.find("Серый") != -1):
                                            color = "Серый"
                                    count_current += 1
                                    print("Color:", color)
                    if (count_current < count_parameters):
                        if (val.text == "Модель"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Модель"):
                                    alternative_name = tr[i + 1].span.get_text().strip()
                                    alternative_name = alternative_name.upper()
                                    count_current += 1
                                    print("Alternative name:", alternative_name)
                    if (count_current < count_parameters):
                        if (val.text == "Память"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Оперативная память (RAM)"):
                                    ram = tr[i + 1].span.get_text().strip()
                                    ram = ram.upper()
                                    index_ram = ram.find(" ")
                                    ram = float(ram[:index_ram]) * 1024
                                    count_current += 1
                                    print("RAM:", ram)
                                if (value.span.get_text().strip() == "Встроенная память (ROM)"):
                                    rom = tr[i + 1].span.get_text().strip()
                                    rom = rom.upper()
                                    index_rom = rom.find(" ")
                                    rom = float(rom[:index_rom]) * 1024
                                    count_current += 1
                                    print("ROM:", rom)
                    if (count_current < count_parameters):
                        if (val.text == "Память"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Оперативная память (RAM)"):
                                    ram = tr[i + 1].span.get_text().strip()
                                    ram = ram.upper()
                                    index_ram = ram.find(" ")
                                    ram = float(ram[:index_ram]) * 1024
                                    count_current += 1
                                    print("Ram:", ram)
                                if (value.span.get_text().strip() == "Встроенная память (ROM)"):
                                    rom = tr[i + 1].span.get_text().strip()
                                    rom = rom.upper()
                                    index_rom = rom.find(" ")
                                    rom = float(rom[:index_rom]) * 1024
                                    count_current += 1
                                    print("Rom", rom)
                    if (color_repeat):
                        if (count_current < count_parameters):
                            if (val.text == "Служебная информация"):
                                tr = characteristic_table[i].tbody.find_all("td")
                                for i, value in enumerate(tr):
                                    if (value.span.get_text().strip() == "Базовый цвет"):
                                        color = tr[i + 1].span.get_text().strip()
                                        if (color.find("/") != -1):
                                            print()
                                            #exit(0)
                                        else:
                                            color = color.capitalize()
                                            if (color.find("Золотистый") != -1):
                                                color = "Золотой"
                                            if (color.find("Серый") != -1):
                                                color = "Серый"
                                        count_current += 1
                                        print("Color", color)
                else:
                    break

            # request the list of shops
            log.info(
                "%s - Name: %s | Color: %s | Alternative name: %s | Current price: %s | Old price: %s",
                code, name, color, alternative_name, price_current, price_old)
            url_shops = 'https://www.mvideo.ru/sitebuilder/blocks/browse/product-detail/tabs/availableStoresTable.jsp?productId=' + code + '&tab=list&cityId=' + city + '&ajax=true&json=true&take-today=&page=1&viewAll=true'
            #url = 'https://www.mvideo.ru/sitebuilder/blocks/browse/product-detail/tabs/availableStoresTable.jsp?productId=30028814&tab=list&cityId=CityCZ_2246&ajax=true&json=true&take-today=&page=1&viewAll=true'
            r = request_json(url_shops)
            log.info("Started parsing shops | %s", url)
            html = get_html_json(r["storeList"])
            li_items = html.cssselect('li.store-locator-list-item')
            for li in li_items:
                city_id = object[2]
                div_pickup = li.cssselect('div.pickup')[0]
                try:
                    hours = div_pickup.cssselect('p>span')[0].text.strip()
                except IndexError:
                    continue
                try:
                    div_stock = li.cssselect('div.stock>i')[0].get('class')
                except IndexError:
                    continue
                if ((div_stock.find("ico-stock-level-out-of") != -1)
                        or (div_stock.find("ico-stock-level-showcase") != -1)):
                    continue
                if (hours.find("Через час") != -1):
                    div_name_shop = li.cssselect('div.name')[0]
                    name_shop = str(div_name_shop.cssselect('h3>a')[0].text.strip())
                    split_name_shop = name_shop.split(", ")
                    address_shop = str(div_name_shop.cssselect('p')[0].text.strip())
                    if (address_shop.find("обл.") != -1):
                        address_shop = address_shop.replace("обл.", "область")
                    if (address_shop.find("МО") != -1):
                        address_shop = address_shop.replace("МО", "Московская область")
                    if (address_shop.find("р-н") != -1):
                        address_shop = address_shop.replace("р-н", "район")
                    if (address_shop.find("Всеволжский район") != -1):
                        address_shop = address_shop.replace("Всеволжский район", "Всеволожский район")
                    print(address_shop)
                    print(name_shop)
                    try:
                        count = li.cssselect('i.ico')[0].get("data-title")
                    except:
                        count = None
                    if ((count != None) and (count != "Привезем под заказ")
                            and (name_shop.find("Пункт выдачи") == -1)
                            and (address_shop.find("Пункт выдачи") == -1)):
                        print("1:", name_shop, "2:", address_shop)
                        # Check whether this shop already exists for these parameters
                        company = "М.Видео"
                        sql = """SELECT id FROM shop WHERE name=%s and address=%s and chain_stores_id=%s"""
                        data = [(name_shop, address_shop, 1)]
                        row = select_request_db(sql, data)
                        if (len(row) == 1):
                            shop_id = row[0]
                        else:
                            print("Error")
                            exit(0)
                        print("Shop ID", shop_id)
                        sql = """SELECT product.id from product where product.name=%s and product.color = %s and product.ram = %s and product.rom=%s"""
                        data = [(name, color, ram, rom)]
                        print(data)
                        row = select_request_db(sql, data)
                        print("Response", row)
                        if (len(row) == 1):
                            product_id = row[0]
                            print("PRODUCT VARIATION_ID", product_id)
                            try:
                                sql = """INSERT INTO price(shop_id, product_id, price_current, price_old, status_sales,url) VALUES (%s, %s, %s, %s, %s,%s)"""
                                data = [(shop_id, product_id, price_current, price_old, 1, url)]
                                date = insert_request_db(sql, data)
                                global count_succes
                                count_succes += 1
                                print("Number of objects added:", count_succes)
                            except Exception as e:
                                print("Error", e)
                                print("Failing data:", data)
                                print("Here")
                                exit(0)
                        else:
                            global count_lose
                            count_lose = count_lose + len(cities)
                            print("Number of objects not added:", count_lose)
                            repeat = False
                            break

def create_resort_list():
    '''
    Gets all of the necessary information from the resorts from onthesnow.com
    Returns a list of dictionaries
    '''
    page = requests.get(starting_url)
    page.raise_for_status()
    html = bs4.BeautifulSoup(page.text, 'html5lib')

    # creates a dictionary where the key is an id for the resort
    # and the entry is another dictionary containing all of the info
    resort_dictionary = {}

    # getting the rating of the resorts (out of 5 stars)
    rating_tags = html.find_all('b', class_='rating_small')
    rating_list = []
    for i in rating_tags:
        b_tag = i.find_all('b')
        rating = b_tag[1].text
        rating_list.append(rating)

    # getting the name and the links of the resorts
    resort_links = html.find_all('div', class_='name')
    resort_name_and_link = []
    for resort in resort_links:
        entry = []
        a_tag = resort.find('a')
        # name of the resort
        resort_name = a_tag['title']
        entry.append(resort_name)
        # link to the resort
        resort_url = a_tag['href']
        resort_url = 'http://www.' + limiting_domain + resort_url
        entry.append(resort_url)
        resort_name_and_link.append(entry)

    # adding the resort name, link, and rating to the dictionary
    for i in range(len(resort_name_and_link)):
        resort = resort_name_and_link[i][0]
        resort_dictionary[i] = create_dictionary()
        resort_dictionary[i]['Resort Name'] = resort
        link = resort_name_and_link[i][1]
        resort_dictionary[i]['link'] = link
        rate = rating_list[i]
        resort_dictionary[i]['Rating'] = rate

    # Looking at all of the individual ski resorts main page and obtaining
    # general info about each resort such as night skiing, terrain parks, etc.
    for resort in resort_dictionary:
        page = requests.get(resort_dictionary[resort]['link'])
        page.raise_for_status()
        html = bs4.BeautifulSoup(page.text, 'html5lib')

        # info: state
        region_info = html.find('div', class_='resort_header_inner_wrap')
        state = region_info.find('a')
        resort_dictionary[resort]['State'] = state['title']

        # info: num runs, % beginner, % intermediate, % advanced, % expert,
        # num terrain parks, whether they have night skiing
        terrain_table = html.find_all('p')
        terrain_info = []
        for p_tag in terrain_table:
            if p_tag.has_attr('class'):
                entry = []
                if 'label' in p_tag['class']:
                    entry.append(p_tag.text)
                if 'value' in p_tag['class']:
                    entry.append(p_tag.text)
                if entry != []:
                    terrain_info.append(entry)

        # entering the terrain_info into the dictionary
        for i in range(0, len(terrain_info), 2):
            info = terrain_info[i][0]
            if info in resort_dictionary[resort]:
                if info == 'Night Skiing':
                    resort_dictionary[resort]['Night Skiing'] = True
                else:
                    value = terrain_info[i + 1]
                    value = re.search('[0-9]+', value[0]).group()
                    resort_dictionary[resort][info] = value

        # info: open/close dates, elevation, num lifts, lift tickets
        resort_overview = html.find('div', class_='resort_overview resort_box module')
        td_tags = resort_overview.find_all('td')
        resort_overview_list = []
        for i in td_tags:
            class_name = i.find('span', class_='ovv_t t2')
            if class_name == None:
                resort_overview_list.append(i.text)

        # adding this info (resort_overview_list) to the dictionary
        open_close = resort_overview_list[0]
        open_close = re.findall('[0-9]+/[\s]*[0-9]+', open_close)
        open_close = open_close[0] + '-' + open_close[1]
        resort_dictionary[resort]['Open/Close Dates'] = open_close
        resort_dictionary[resort]['Elevation'] = resort_overview_list[1]
        resort_dictionary[resort]['Number Lifts'] = resort_overview_list[2]
        if 'N/A' in resort_overview_list[3]:
            resort_dictionary[resort]['Min Ticket Price'] = 'N/A'
            resort_dictionary[resort]['Max Ticket Price'] = 'N/A'
        else:
            tic_price = resort_overview_list[3]
            tic_price = re.findall('[0-9]+.[0-9]+', tic_price)
            resort_dictionary[resort]['Min Ticket Price'] = tic_price[0]
            resort_dictionary[resort]['Max Ticket Price'] = tic_price[1]

        # obtaining the average snowfall and adding to dictionary
        important_dates = html.find('div', id='resort_impdates')
        important_dates = important_dates.find_all('li')
        for date in important_dates:
            text = date.text
            if 'Average Snowfall' in text:
                average_snowfall = re.search('[0-9]+', text).group()
                resort_dictionary[resort]['Average Snowfall'] = average_snowfall

        # obtaining the town, address, and zip code of the resort
        resort_contact = html.find('div', id='resort_contact')
        address_info = resort_contact.find_all('p')
        address = address_info[1].text
        resort_dictionary[resort]['Address'] = address
        city = address_info[2].text
        zip_code = re.search('[0-9]+', city).group()
        # if the zip code starts with a 0
        if len(zip_code) == 4:
            resort_dictionary[resort]['Zip Code'] = '0' + zip_code
        else:
            resort_dictionary[resort]['Zip Code'] = zip_code
        # if a town is available for the resort
        if re.search('[A-Za-z]+', city) != None:
            town = re.search('[A-Za-z]+', city).group()
            resort_dictionary[resort]['City'] = town
        else:
            resort_dictionary[resort]['City'] = ''

        # obtaining the latitude and longitude of the resort
        lat, lon = latitude_and_longitude(resort_dictionary[resort]['Address'],
                                          resort_dictionary[resort]['City'],
                                          resort_dictionary[resort]['State'],
                                          resort_dictionary[resort]['Zip Code'])
        resort_dictionary[resort]['Latitude'] = lat
        resort_dictionary[resort]['Longitude'] = lon

'''--------------- getting homepage content for [urls], and storing relevant articles by [companies] in {articles} ---------------'''
problematic_urls = []
articles = []
for website in urls:
    web_content = requests.get(website)
    if (web_content.status_code == 403):
        problematic_urls.append(website)
        continue
    html = BeautifulSoup(web_content.text, "lxml")
    for link in html.find_all('a'):
        title = link.get_text().strip()
        for c in companies:
            if (c.lower() in title.lower()) and (c.lower() != title.lower()) and ('google+'.lower() != title.lower()):
                obj = {
                    'company': c,
                    'title': link.get_text().strip().split('\t')[0].split('\n')[0],
                    'url': link.get('href').split('?')[0],
                    'source': website,
                }
                articles.append(obj)

'''--------------- saving articles from {articles} to csv file ---------------'''