def down(link_, max: int = 1):
    try:
        if max > 5:
            return
        print(link_[0])
        link_down = link_[1]
        # Note: the original tested "('http' or 'https') in ...", which always checks only
        # 'http'; testing the scheme prefix is what was intended ('http' also covers 'https').
        link_down = link_down if link_down.startswith('http') else "https:" + link_down
        r = req_link(link_down)
        b = Bs(r.content, 'html.parser')
        nome = b.find("h1", itemprop='name').text.split()[-1] + '-'
        nome += b.find("h2", itemprop='alternativeHeadline').text
        bb = b.find("source")
        if bb:
            ll = bb.get("src") if bb.get('src').startswith('http') else "https:" + bb.get('src')
            r = ll
            return {nome: r}
        else:
            bb = b.find('a', title="Baixar Video")
            if bb:
                r = req_link(bb.get("href") if bb.get('href').startswith('http')
                             else "https:" + bb.get('href'))
                b = Bs(r.content, 'html.parser')
                bb = b.find("a", "bt-download")
                if bb:
                    head = (bb.get("href") if bb.get('href').startswith('http')
                            else "https:" + bb.get('href'))
                    return {nome: head}
    except Exception as e:
        print("Down", link_[0], e)
        return down(link_, max + 1)
def get_organization_page(domain, proxy=None):
    global HEADERS
    global BASE_URL
    if proxy is not None:
        proxies = {f'{proxy.split(":")[0].strip()}': proxy}
    else:
        proxies = None
    response = requests.get(f"{BASE_URL}/scholar?q={domain}", headers=HEADERS, proxies=proxies)
    logger.info(f'{response.status_code} {response.reason}')
    tree = Bs(response.content, "lxml")
    org_href = get_org_href(tree)
    if not org_href:
        return None, None
    response = requests.get(f"{BASE_URL}{org_href}", headers=HEADERS, proxies=proxies)
    if not response.ok:
        return None, None
    tree = Bs(response.content, "lxml")
    return tree, org_href
def get_parser_body(self, ref):
    """Return the first 4,000 characters of the article text."""
    result = str()
    if "ria" in self.url:
        source = requests.get(ref).text
        soup = Bs(source, 'lxml')
        raw_result = soup.find_all('div', class_='article__text')
        for i in raw_result:
            result = result + i.text + ' '
    elif "interfax" in self.url:
        source = requests.get(f'{self.url}{ref}')
        source.encoding = 'cp1251'
        source = source.text
        soup = Bs(source, 'lxml').find('div', class_='mainblock')
        raw_result = soup.find_all_next('p')
        for i in raw_result:
            result = result + i.text + ' '
    elif "tass" in self.url:
        source = requests.get(f'{self.url}{ref}').text
        soup = Bs(source, 'lxml')
        raw_result = soup.find_all('div', class_='text-content')
        result = result + raw_result[0].text
    else:
        pass
    return result[:4000]
def bypass(link):
    print(" *Bypassing please wait...")
    for _ in range(5):
        try:
            req = ses.post("https://nuubi.herokuapp.com/pahe/bypass", data={'url': link})
            url = re.findall(r"href='(.*?)'", req.text)[0]
        except Exception as er:
            print(f"Oops, {er}, retrying...")
            time.sleep(3)
            continue
        else:
            break
    req2 = ses.get(url)
    bs = Bs(req2.text, 'html.parser')
    url2 = bs.find('a', {'class': 'btn btn-primary btn-xs'})['href']
    reqs = ses.get(url2)
    link = re.findall("href='(.*?)'>Download ", reqs.text)[0]
    reqs_ = ses.get(link)
    time.sleep(2)
    reqs2 = ses.get(link)
    bs2 = Bs(reqs2.text, 'html.parser')
    dlink = bs2.find('a', {'title': 'Download'})['href']
    download(dlink, f"{info['title'][pil-1][1]} ({info['resu'][lih-1][1]}).mkv")
def parse_for_stories(html_page):
    # Parses an HTML page (of a story).
    # Should work offline, except if there are multiple pages;
    # then it would attempt to fetch those.
    header = []
    pages = []
    soup = Bs(html_page, parser)
    head = soup.find("div", attrs={"class": "b-story-header"}).text
    body = soup.find("div", attrs={"class": "b-story-body-x"}).text
    parse_header = re.search(r"([\w\s'.,/-]+)\n\s*by([\w\s',/-]+)", head)
    if parse_header:
        header.append(parse_header.group(1))
        header.append(parse_header.group(2))
    header.append(head)
    pages.append(body)
    while True:
        next_page = soup.find("a", attrs={"class": "b-pager-next"})
        if next_page:
            new_page = requests.get(next_page["href"])
            soup = Bs(new_page.text, "html5lib")
            body = soup.find("div", attrs={"class": "b-story-body-x"}).text
            pages.append(body)
        else:
            break
    story_object = {"title": header[0], "author": header[1], "pages": pages}
    return story_object
def getsearchHotelList(self, citycode):
    # Get the total number of pages
    totalpage = self.gettotalpage(self.citycode)
    unreadlist = []
    loop = range(1, (int(totalpage) + 1))
    # Query page by page
    for x in loop:
        try:
            # Open page x
            html = requests.get(self.mainUrl + str(citycode) + "/p" + str(x))
            print("page " + str(x) + " is Searching")
            bsObj = Bs(html.text, 'lxml')
            hotel_list = bsObj.find("div", {
                "id": "hotel_list"
            }).findAll("div", {"class": "hotel_new_list"})
            # Asynchronous-request crawling variant (kept for reference)
            # param = {"StartTime": "2018-05-22", "DepTime": "2018-06-19", "cityName": "广州",
            #          "RoomGuestCount": "1,1,0",
            #          "IsOnlyAirHotel": "F", "cityId": 32, "cityCode": 020, "page": x}
            # list = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
            # html = requests.post(list, data=param, headers=self.header)
            # bsObj = Bs(html.text.replace("\\", ""), 'lxml')
            # hotel_list = bsObj.findAll("div", {"class": "hotel_new_list"})
            # Convert the parsed data
            hotel_total_list = self.bsobj_to_hotelobj(hotel_list)
            # Bulk insert into the database
            ch.insert(hotel_total_list)
        except Exception as e:
            print("Something goes wrong:")
            print(e)
            print("Wait a Minute,just sleep for a while")
            print("Zzzz")
            sleep(10)
            # Record the pages that were not read
            unreadlist.append(x)
            print("Go on~")
            continue
    # Retry the pages that failed
    for i in unreadlist:
        try:
            # Open page i
            newse = requests.session()
            html = newse.get(self.mainUrl + str(self.citycode) + "/p" + str(i))
            print("page " + str(i) + " is Searching")
            bsObj = Bs(html.text, 'lxml')
            hotel_list = bsObj.find("div", {
                "id": "hotel_list"
            }).findAll("div", {"class": "hotel_new_list"})
            # Convert the parsed data
            hotel_total_list = self.bsobj_to_hotelobj(hotel_list)
            # Bulk insert into the database
            self.insert(hotel_total_list)
        except Exception as e:
            print(e)
            continue
def get_img_ranking_list(self, no="6"):
    """
    Crawl the ranking-list data (defaults to the international ranking).
    1. Hokkaido/Tohoku  2. Kanto  3. Chubu  4. Kinki
    5. Chugoku/Shikoku  6. Kyushu/Okinawa  7. International
    """
    nodic = {"0": "北海道/东北",
             "1": "关东",
             "2": "中部",
             "3": "近畿",
             "4": "中国/四国",
             "5": "九州/冲绳",
             "6": "国际"}
    html = se.get(self.ranking_url + no)
    bsObj = Bs(html.text, 'lxml')
    href = bsObj.findAll("a", {"class": re.compile("^(work)")})
    # Current week
    week = bsObj.findAll("span", {"class": "_about"})[0].string
    downloadPath = os.path.join(path, week + nodic.get(no) + u"排行榜")
    for link in href:
        if link.attrs['href'][0] == '/':
            self.img_list.append(self.indexurl + link.attrs['href'][1:])
        else:
            self.img_list.append(self.indexurl + link.attrs['href'])
    try:
        # Go back to the root folder
        os.chdir(os.path.dirname(downloadPath))
        os.mkdir(week + nodic.get(no) + u"排行榜")
    except Exception as e:
        pass
    finally:
        os.chdir(downloadPath)
    print("ranking " + week + " are Searching!")
    # Fetch (download) all images on the current page
    for link in self.img_list:
        # A proxy should be added here
        html = se.get(link, headers=self.headers)
        bsObj = Bs(html.text, 'lxml')
        try:
            src = bsObj.find("img", {"class": "original-image"})
            if src:
                self.down_list.append({"src": src.attrs['data-src'].decode("unicode-escape"),
                                       "title": src.attrs['alt']})
                print("title:" + src.attrs['alt'])
        except AttributeError as e:
            print("NOT Found")
        except Exception as e:
            print(e)
    # Start three threads to download the images
    self.threading(3)
def html_table_to_array(html_string):
    """Convert an HTML table to a JSON-encoded list of lists."""
    from bs4 import BeautifulSoup as Bs
    import json
    # Check for None/empty input before parsing, and compare strings with != rather than "is not".
    if (html_string is not None
            and html_string != ""
            and len(Bs(html_string).findAll("tr")) > 0):
        return json.dumps([[cell.string or '' for cell in row.findAll("td")]
                           for row in Bs(html_string).findAll("tr")])
    else:
        return html_string
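# A minimal usage sketch for html_table_to_array above (the sample markup is hypothetical,
# not taken from the original project): each <tr> becomes a list of cell strings, serialized as JSON.
if __name__ == "__main__":
    sample = "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
    print(html_table_to_array(sample))  # -> [["a", "b"], ["c", "d"]]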
def expert_parser(url, i):
    nonlocal end, trigger, new_post
    try:
        logging.info('\n [%s]... ' % url)
        # Parsing http://yur-gazeta.com/news/
        self.send(['log', '\n Loading [%s]... ' % url])
        req = s.get(url)
        soup = Bs(req.text, 'html.parser')
        while end is False:
            page = soup.find('a', attrs={'class': 'right'})
            tag = soup.find('ul', attrs={'class': 'material-list imageRight'})
            for li in tag.children:
                if li.a['href'] != LAST_POST[i][0]:
                    if li.find('a', attrs={'class': 'title private'}) is None:
                        if trigger is True and li.a['href'] != '1' and li.a['href'] != 1:
                            new_post = li.a['href']
                            trigger = False
                        req = s.get(li.a['href'])
                        soup = Bs(req.text, 'html.parser')
                        title = soup.find('h1', attrs={'class': 'mat-title'})
                        title = title.get_text() if title is not None else 'oops'
                        article = soup.find('article', attrs={'id': 'material-content'})
                        article = article.get_text() if article is not None else 'oops'
                        author = soup.find('a', attrs={'class': 'author-name no-link'})
                        author = author.get_text() if author is not None else 'oops'
                        posts.append(
                            PostModel(
                                str(author + '\n' + title + '.' + article),
                                hashtags['expert'], li.a['href'], imgs['expert']))
                else:
                    end = True
                    break
            req = s.get(page['href']) if page is not None else req
            soup = Bs(req.text, 'html.parser')
    except ConnectionError:
        self.send(['log', '\n Could not load [%s]' % url])
    end = False
    # For adding the last post
    trigger = True
    if new_post != '' and new_post != '1' and new_post != 1:
        LAST_POST[i][0] = new_post
        new_post = ''
def response(self):
    bs1 = Bs(self.req.get(self.u.format('')).text, 'html.parser')
    token = bs1.find('input', {'name': 'csrfmiddlewaretoken'})
    res = self.req.post(self.u.format('/download'),
                        data={
                            'tweet': self.lnk,
                            'csrfmiddlewaretoken': token['value']
                        })
    bs2 = Bs(res.text, 'html.parser')
    linku = bs2.find('a', {'class': 'expanded button small float-right'})
    return linku['href']
def get_wikipedia_results_recursive(hops, wiki_pages, num_links=None):
    text = ''
    if hops > 0:
        links = []
        for wiki in wiki_pages:
            url = 'https://en.wikipedia.org/wiki/' + wiki['href']
            html = api_get(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) '
                                  'Gecko/20100101 Firefox/74.0'
                })
            # features="lxml" is passed explicitly to silence BeautifulSoup's default-parser warning.
            content = Bs(html.text, features="lxml").find(class_='mw-parser-output')
            for child in content.children:
                if child.name == 'p':
                    text += re.sub(' +', ' ', child.get_text().replace("\n", " ")) + "\n"
                    for link in child.find_all('a'):
                        if link.has_attr("href") and num_links is not None:
                            if link["href"].startswith("/wiki/") and len(links) < num_links:
                                links.append(link)
        text += get_wikipedia_results_recursive(hops - 1, links)
    else:
        for wiki in wiki_pages:
            url = 'https://en.wikipedia.org/wiki/' + wiki['href']
            html = api_get(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) '
                                  'Gecko/20100101 Firefox/74.0'
                })
            content = Bs(html.text, features="lxml").find(class_='mw-parser-output')
            for child in content.children:
                if child.name == 'p':
                    text += re.sub(' +', ' ', child.get_text().replace("\n", " ")) + "\n"
    return text
def get_rss_content(rss_info_list, img_file_path):
    rss_content = []
    for rss in rss_info_list:
        url = rss[1]
        page = get_rss_page(url)
        soup = Bs(page, 'lxml')
        try:
            source = re.search(
                r'>\w+', str(soup.select('span[class="from"] > a')[0])).group(0)
            source = re.search(r'\w+', source).group(0)
            content = str(soup.select('div[class="article_body"]')[0])
            content = re.sub(r'\\.', r'', content)
            content_soup = Bs(content, 'lxml')
        except Exception as e:
            print('Basic article info could not be fetched')
        slug = ''.join(
            random.sample(
                'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789',
                8))
        img_list = []
        img_url_list = content_soup.select('img')
        for img in img_url_list:
            img = img['src']
            try:
                img_name = re.search(r'\w+.(jpg|png)', img).group(0)
                img_list.append(img_name)
                img_download_path = img_file_path + '/' + '%s' % slug
                if os.path.exists(img_download_path):
                    request.urlretrieve(
                        img, img_download_path + '/' + '%s' % img_name)
                else:
                    os.makedirs(img_download_path)
                    request.urlretrieve(
                        img, img_download_path + '/' + '%s' % img_name)
                print('----------------- Downloaded %s -----------------' % img_name)
                content = re.sub(r'http://img\d.tuicool.com/',
                                 r'/upload/blog/%s/' % slug, content)
                content = re.sub(r'!web', '', content)
            except Exception as e:
                print('Image download failed')
        if len(img_list) == 0:
            rss.append('')
        else:
            rss.append(img_list[0])
        rss.append(slug)
        rss.append(content)
        rss.append(source)
        rss_content.append(rss)
    return rss_content
def get_pagination_urls(self, base_url):
    """Build the pagination URLs for the project listing to be collected."""
    url = base_url + '1'
    r = requests.get(url, headers=common.get_header())
    soup = Bs(r.text)
    page_data = Bs(str(
        soup.find_all(class_='ProjectList'))).find_all(class_='stat')
    max_page = math.ceil(
        int(page_data[0].next_sibling.next_sibling.text) / 20)
    return (base_url + str(i) for i in range(1, max_page + 1))
def test1(self):
    md = Markdown(extensions=[MarkdownImageExpander()])
    source = """
    ![](example.png)
    """
    mustbe = """
    <img alt="" src="example.png">
    """
    self.assertEqual(
        Bs(md.convert(textwrap.dedent(source)), "html.parser").prettify().replace("\n", ""),
        Bs(textwrap.dedent(mustbe), "html.parser").prettify().replace("\n", ""))
def get_user_sales_detail(self, item_detail_url, input_datas):
    """
    Collect the seller's sales-history information.
    params:
        input_datas: rows of the Excel input data
    """
    user_detail_set = {
        'ave_days_from_send': '',
        'sales_history_nums': '',
        'sales_day': ''
    }
    html = requests.get(item_detail_url).text.encode('utf-8')
    source = Bs(html, 'html5lib')
    user_url = top_page + source.find('div', id='item_subcol').find(
        'p', id='buyer_name').find('a')['href']
    html = requests.get(user_url).text.encode('utf-8')
    source = Bs(html, 'html5lib')
    detail_blocks = source.find('ul', attrs={
        'id': 'detailed_list'
    }).find_all('li')
    for detail_block in detail_blocks:
        # '発送までの平均日数' = "average number of days until shipping"
        if detail_block.find('h3').text.strip() == '発送までの平均日数':
            user_detail_set['ave_days_from_send'] = (
                detail_block.find('div').find('p').text.strip())
    for input_data in input_datas:
        # '販売履歴ページ数' = "number of sales-history pages"
        if input_data['option'] == '販売履歴ページ数':
            sales_page_num = int(input_data['value'])
            for sales_page_index in range(1, sales_page_num + 1):
                user_detal_url = (user_url.replace(
                    '.html', '/sales_{0}.html'.format(sales_page_index)))
                html = requests.get(user_detal_url).text.encode('utf-8')
                source = Bs(html, 'html5lib')
                sales_detail_blocks = (source.find(
                    'dl', id='buyeritemtable').find_all('div', id='buyeritemtable_body'))
                if len(sales_detail_blocks) != 0:
                    span_blocks = []
                    for sales_detail_block in sales_detail_blocks:
                        span_blocks = (sales_detail_block.find_all('span'))
                        if self.item_name in span_blocks[0].text.strip() and len(
                                span_blocks) == 3:
                            # '成約:' = "closed deals:"
                            user_detail_set['sales_history_nums'] += span_blocks[
                                1].text.strip().replace('成約:', '') + ','
                            user_detail_set[
                                'sales_day'] += span_blocks[2].text.strip() + ','
    return user_detail_set
def login(session, username, password):
    password_text = 'رمز عبور را وارد کنید'
    successful_login_text = 'سفارشهای من'
    failed_login_text = r'\u0627\u0637\u0644\u0627\u0639\u0627\u062a \u06a9\u0627\u0631\u0628\u0631\u06cc \u0646\u0627\u062f\u0631\u0633\u062a \u0627\u0633\u062a'

    # username
    url = 'https://www.digikala.com/users/login-register/'
    r1_dom = Bs(session.get(url).text, 'html.parser')
    payload = {
        'login[email_phone]': username,
        'rc': r1_dom.select('input[name=rc]')[0]['value'],
        'rd': r1_dom.select('input[name=rd]')[0]['value'],
    }
    r2 = session.post(url, data=payload)
    if r2.status_code != 200:
        print('connection error, code: %s' % r2.status_code)
        return False

    # password
    if password_text in r2.text:
        r2_dom = Bs(r2.text, 'html.parser')
        payload = {
            'login[password]': password,
            'rc': r2_dom.select('input[name=rc]')[0]['value'],
            'rd': r2_dom.select('input[name=rd]')[0]['value'],
        }
        r3 = session.post(r2.url, data=payload)
        if r3.status_code != 200:
            print('connection error, code: %s' % r3.status_code)
            return False
        # succeed :)
        if successful_login_text in r3.text:
            print('successfully logged in')
            return True
        # wrong data :(
        elif failed_login_text in r3.text:
            print('wrong data!')
            return False
        # sth else :)
        else:
            print('unknown error')
            return False
    else:
        print('unknown error')
        return False
def crawl_append(self, crawl_results):
    for row in crawl_results:
        self.driver.get(row['source_url'])
        html = self.driver.page_source
        self.soup = Bs(html, 'html.parser')
        self.page_source = self.soup.select('#bo_v_atc')
        event_content = self.soup.select('#bo_v_con > div.event_ctt')
        if len(event_content) == 0:
            row['ctn'] = ''
        else:
            row['ctn'] = event_content[0].text
        row['page_source'] = str(self.page_source)
        temp_img_src = self.soup.select('#bo_v_img > a > img')
        '''if len(temp_img_src) == 0:
            img_source = self.soup.select('#contents > div.cont > div.view_wrap > div.info_sec > p > a > img')
        else:
            img_source = temp_img_src'''
        ab = datetime.datetime.now()
        date_now = ab.strftime('%Y%m%d%H%M%S')
        file_name = date_now + str(ab.microsecond)
        if len(temp_img_src) > 0:
            temp_src = temp_img_src[0].attrs.get('src')
            urllib.request.urlretrieve(temp_src, '../../originalDatas/' + file_name + '.png')
            img_src = file_name + '.png'
        else:
            img_src = ''
        row['img_src'] = img_src
        # self.cm.content_insert(row, 'original')
    return crawl_results
def test_correct_services_in_form(self):
    """Does the client see the correct services (their own user's, not blocked)?"""
    user_correct = self.create_user('active_1')
    user_wrong = self.create_user('active_2')
    service_wrong = self.create_service(user_wrong, 'long_2')
    service_not_active = self.create_service(user_correct, 'not_active')
    services_correct = []
    services_types = ['short_1', 'short_2', 'long_1']
    for service in services_types:
        services_correct.append(self.create_service(user_correct, service))
    client = self.create_client(user_correct)
    self.authorize_client(client)
    response = self.client.get(f'/{user_correct}/panel/')
    soup = Bs(response.content.decode(), features="html.parser")
    option_wrong = soup.find("option", {"value": {service_wrong.id}})
    option_not_active = soup.find("option", {"value": {service_not_active.id}})
    self.assertFalse(option_wrong)
    self.assertFalse(option_not_active)
    for service in services_correct:
        options = soup.find("option", {"value": {service.id}})
        self.assertEqual(options.get_text(), service.name)
def test_default_table(self):
    results_table_div = self.driver.find_element_by_id('results')
    rows = Bs(results_table_div.get_attribute('innerHTML'),
              features='lxml').find('table').findAll('tr')
    self.assertEqual(len(rows), self.tc + 1)
    headings = rows[0].findAll('th')
    headings_txt = [x.text for x in headings]
    self.assertEqual(
        headings_txt,
        [f'{self.tc}/{self.tc} themes', 'date', 'stars', 'commit'],
    )
    for i, row in enumerate(rows[1:]):
        tds = row.findAll('td')
        tds_txt = [x.text for x in tds]
        self.assertEqual(
            tds_txt,
            [
                self.themes[i].cname,
                self.themes[i].commit_date[0:10],
                str(self.themes[i].stargazers_count),
                self.themes[i].commit_sha[0:6],
            ],
        )
def crawl_append(self, crawl_results):
    for row in crawl_results:
        self.driver.get(row['source_url'])
        html = self.driver.page_source
        self.soup = Bs(html, 'html.parser')
        self.page_source = self.soup.select(
            '#content > div.schedule > div')
        row['page_source'] = str(self.page_source)
        event_content = self.soup.select(
            '#content > div.schedule > div > div > dl:nth-child(11) > dd')
        if len(event_content) > 0:
            row['ctn'] = event_content[0].text
        else:
            row['ctn'] = ''
        temp_img_src = self.soup.select(
            '#PageWrap > div > div.ViewInfo > p > img')
        ab = datetime.datetime.now()
        date_now = ab.strftime('%Y%m%d%H%M%S')
        file_name = date_now + str(ab.microsecond)
        if len(temp_img_src) > 0:
            temp_src = temp_img_src[0].attrs.get('src')
            encoding_url = parse.urlparse(temp_src[3:len(temp_src)])
            print(encoding_url)
            urllib.request.urlretrieve(
                self.url_base + quote(encoding_url.path),
                '../../originalDatas/' + file_name + '.png')
            img_src = file_name + '.png'
        else:
            img_src = ''
        row['img_src'] = img_src
        print(row['event_name'])
        self.cm.content_insert(row, 'original')
def Gratis(num, msg):
    req = requests.Session()
    req.headers.update({
        'Referer': 'http://sms.payuterus.biz/alpha',
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Origin': 'http://sms.payuterus.biz',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cookie': '_ga=GA1.2.131924726.1560439960; PHPSESSID=jjrqqaakmfcgfgbtjt8tve5595; _gid=GA1.2.1969561921.1561024035; _gat=1',
    })
    capt = req.get('http://alpha.payuterus.biz/').text
    bs1 = Bs(capt, 'html.parser')
    cap = (bs1.find('span', {'id': None}).text).split(' ')
    hasilcapt = int(cap[0]) + int(cap[2])
    key = bs1.find('input', {'name': 'key'})['value']
    dataq = {
        'nohp': num,
        'pesan': msg,
        'captcha': hasilcapt,
        'key': key,
    }
    res = req.post('http://alpha.payuterus.biz/send.php', data=dataq).text
    # print(res)
    return str(res)
def _scrape_raw_product_details(self, response):
    raw_html = response.content
    soup = Bs(raw_html, 'html.parser')
    self._set_name(soup)
    self._set_price(soup)
    self._set_original_price(soup)
    self._set_rating(soup)
def init():
    try:
        print("[+] Requesting...")
        print(f'{link}')
        response = requests.get(link)
        response.raise_for_status()
        print("[+] Requesting succeeded")
        # making the soup
        soup = Bs(response.content, "html.parser")
        print("[+] Html parsing succeeded")
        # scraping the token
        jeton = soup.select_one('#jeton').get('value')
        # getting the cookie
        cookie = response.cookies.get_dict()
        return soup, jeton, cookie
    except HTTPError as err:
        print(f'HTTP error occurred : {err}')
        print('[-] Requesting failed.')
    except Exception as err:
        print(f'Other error occurred : {err}')
        print('[-] Requesting failed.')
def get_number_of_results(self, response):
    raw_html = response.content
    soup = Bs(raw_html, 'html.parser')
    klass = '_2yAnYN'
    try:
        raw_results = soup.find('span', {'class': klass}).get_text()
        if raw_results is None:
            logging.error("No Results found for <h1> class: " + klass)
            exit()
        else:
            start = raw_results.index('of')
            end = raw_results.index('results')
            no_of_results = int(raw_results[start + 3:end - 1].replace(',', ''))
            logging.info('Number of results for ' + self.searchterm + ':' + str(no_of_results))
            if no_of_results > 10000:
                print('Too many (' + str(no_of_results) + ') results for ' +
                      self.searchterm + '. Please extend your search term.')
                print('Do you still want to continue? It will take a lot of time. (Y/N)')
                choice = input()
                if choice == 'Y' or choice == 'y':
                    return self.get_max_page(response)
                elif choice == 'N' or choice == 'n':
                    exit()
                else:
                    print('invalid choice, exiting')
                    exit()
            else:
                print('No of results: ', no_of_results)
                return self.get_max_page(response)
    except AttributeError:
        logging.error("screen format different for this search result, can't continue: " + self.searchterm)
        return self.handle_different_screen_format()
def scrapper(query):
    """
    :parameter str query: end of url to search
    :rtype: list of dict
    """
    infoboxes = []
    url = 'https://en.wikipedia.org/wiki/' + query
    try:
        raw = urlopen(url)
        print('{} opened successfully'.format(url))
        soup = Bs(raw, features='lxml')
        boxes = soup.find_all('table', {'class': 'infobox vcard'})
        for table in boxes:
            content_dict = dict()
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for tr in table.find_all('tr'):
                if len(tr.contents) > 1:
                    content_dict[string_clean(tr.contents[0].text, True)] = string_clean(
                        tr.contents[1].text)
                elif tr.text:
                    content_dict[string_clean(tr.text, True)] = None
            infoboxes.append(content_dict)
        return infoboxes
    except URLError as error:
        if str(error.reason) == '[Errno 11001] getaddrinfo failed':
            messagebox.showerror(
                'URLError', message='No connection to internet.\nProbably...')
        else:
            print('|{}|'.format(error.reason))
            messagebox.showerror('URLError', message=error.reason)
def crawl(self):
    self.driver.get(self.url.format(page=1))
    self.driver.maximize_window()
    now = datetime.datetime.now()
    reg_date = now.strftime('%Y-%m-%d %H:%M:%S')
    for page in range(1, 27):
        self.driver.get(self.url.format(page=page))
        length = self.driver.find_elements_by_xpath(
            '//*[@id="content"]/table/tbody/tr')
        for content in range(2, len(length) + 1):
            url = self.driver.find_elements_by_xpath(
                '//*[@id="content"]/table/tbody/tr[{content}]/td[2]/a'.format(content=content))
            self.tempUrl = url[0].get_attribute('href')
            self.driver.find_element_by_xpath(
                '//*[@id="content"]/table/tbody/tr[{content}]/td[2]/a'.format(content=content)).click()
            contents = self.driver.find_element_by_xpath(
                '//*[@id="content"]').text
            html = self.driver.page_source
            self.soup = Bs(html, 'html.parser')
            data = self.soup.select('#content')
            _dict['convention_name'] = 'exco'
            _dict['contents'] = contents
            _dict['page_source'] = str(data)
            _dict['source_url'] = self.tempUrl
            _dict['home_page'] = self.url
            _dict['reg_date'] = reg_date
            self.content_insert(_dict)
            self.driver.back()
def requestEmploi(jeton, cookie, group, id):
    try:
        print(f'[+] Setup to request {group} emploi...')
        data = {}
        data["jeton"] = jeton
        data["id"] = id
        print("[+] Requesting emploi...")
        currentTime = timer()
        response = requests.post(link, cookies=cookie, data=data)
        elapsed = timer() - currentTime
        response.raise_for_status()
        print("[+] Requesting succeeded " + str(elapsed))
        return Bs(response.content, "html.parser")
    except HTTPError as err:
        print(f'HTTP error occurred : {err}')
        print('[-] Requesting failed.')
    except Exception as err:
        print(f'Other error occurred : {err}')
        print('[-] Requesting failed.')
def search(query):
    global pil
    c = 1
    req = ses.get('http://149.56.24.226/?s=' + query)
    bs = Bs(req.text, 'html.parser')
    hsl = bs.find_all('div', {'class': 'col-xs-9 col-sm-10 search-content'})
    for i in hsl:
        tit = i.find('a', {'rel': 'bookmark'})
        info['title'].append((tit.text, tit['href']))
    if len(info['title']) == 0:
        print("Could not find any movie title")
        return True
    print("\n\t[ Result ]")
    for x in info['title']:
        print(f"{c}. {x[0]}")
        c += 1
    pil = int(input("_> choose: "))
    if pil <= 0:
        print("index out of range")
        return True
    print(" *Bypassing, please wait...")
    bypass(info['title'][pil - 1][1], info['title'][pil - 1][0])
def get_result(content):
    soup = Bs(content, 'lxml')
    job_description = soup.select('dd[class="job_bt"]')
    job_description = str(job_description[0])
    rule = re.compile(r'<[^>]+>')
    result = rule.sub('', job_description)
    return result
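# A minimal usage sketch for get_result above (the HTML snippet is hypothetical, not from the
# original crawl; it needs the lxml parser installed, as the function itself does): the first
# dd.job_bt block is selected and every remaining tag is stripped by the regex.
if __name__ == "__main__":
    sample_page = '<html><body><dl><dd class="job_bt">Python, requests, BeautifulSoup</dd></dl></body></html>'
    print(get_result(sample_page))  # -> Python, requests, BeautifulSoup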
def get_images(i):
    page = requests.get(f'https://pamskenya.com/?page={i}').text
    soup = Bs(page, 'lxml')
    img = soup.find_all('img', class_='centered-image')
    for im in img:
        img_list.append((im["src"], im['alt']))
    if not os.path.exists('./images'):
        os.makedirs('./images')
    for img in img_list:
        url = img[0]
        name = str(img[1]).strip()
        if not os.path.exists(f'./images/{name}.jpg'):
            print("getting file")
            response = requests.get(url, stream=True)
            try:
                with open(f'./images/{name}.jpg', 'wb') as out_file:
                    print("writing to file")
                    shutil.copyfileobj(response.raw, out_file)
                del response
                print("writing Done")
            except Exception:
                # the original used a bare "except:"; record the failed name and keep going
                fail.add(name)
                print(fail)