def __init__(self):
    self._s = HTMLSession()
def start_request(self, url):
    session = HTMLSession()
    header = {'user-agent': random.choice(self.USER_AGENT_LIST)}
    response = session.get(url, headers=header)
    return response
def lego(message, vacancy):
    usdrate = 75
    avgsalarylist = list()
    vacancywithSalary = 0

    def responce(page, per_page, text=vacancy):
        responce = requests.get(
            'https://api.hh.ru/vacancies/',
            headers={'User-Agent': 'Python ([email protected])'},
            params={'text': text, 'page': page, 'per_page': per_page})
        return responce.json()

    # walk every page of vacancies and collect the salaries
    for x in range(responce(page=0, per_page=100)['pages']):
        templist = responce(page=x, per_page=100)['items']
        for i in templist:
            if i['salary'] is None:
                continue
            vacancywithSalary += 1
            if i['salary']['currency'] == 'RUR':
                if i['salary']['from'] is None and i['salary']['to'] is not None:
                    avgsalarylist.append(i['salary']['to'])
                elif i['salary']['from'] is not None and i['salary']['to'] is None:
                    avgsalarylist.append(i['salary']['from'])
                elif i['salary']['from'] is not None and i['salary']['to'] is not None:
                    avgsalarylist.append(
                        int(round((i['salary']['from'] + i['salary']['to']) / 2)))
                else:
                    print('ERRRRRORRRR')
            # USD: convert to roubles with the fixed rate
            else:
                if i['salary']['from'] is None and i['salary']['to'] is not None:
                    avgsalarylist.append(i['salary']['to'] * usdrate)
                elif i['salary']['from'] is not None and i['salary']['to'] is None:
                    avgsalarylist.append(i['salary']['from'] * usdrate)
                elif i['salary']['from'] is not None and i['salary']['to'] is not None:
                    avgsalarylist.append(
                        int(round(((i['salary']['from'] + i['salary']['to']) / 2) * usdrate)))
                else:
                    print('ERRRRRORRRR')

    textik = 'Профессия: ' + str(vacancy)
    bot.send_message(message.chat.id, '---------------')
    bot.send_message(message.chat.id, textik)

    if responce(page=0, per_page=1)['pages'] == 2000:
        session = HTMLSession()
        url = ('https://hh.ru/search/vacancy?L_is_autosearch=false&area=113'
               '&clusters=true&enable_snippets=true&text=%' + vacancy + '&page=0')
        page = session.get(url)
        vacNum = ''
        vacNumPage = page.html.find(
            'body > div.HH-MainContent.HH-Supernova-MainContent > div > div > div > div.bloko-columns-wrapper > div > '
            'div.bloko-column.bloko-column_xs-0.bloko-column_s-8.bloko-column_m-12.bloko-column_l-16 > div > h1')
        xRegex = re.compile(r'\d')
        allnumbers = re.findall(xRegex, vacNumPage[0].text)
        for i in allnumbers:
            vacNum += i
        textik = 'Всего вакансий: ' + str(vacNum)
        bot.send_message(message.chat.id, textik)
    else:
        if responce(page=0, per_page=1)['pages'] == 1:
            bot.send_message(message.chat.id, "Вакансий не найдено")
        else:
            textik = 'Всего вакансий: ' + str(responce(page=0, per_page=1)['pages'])
            bot.send_message(message.chat.id, textik)

    try:
        textik = 'Средняя зарплата: ' + str(int(round(statistics.median(avgsalarylist))))
        bot.send_message(message.chat.id, textik)
    except:
        pass  # no salaries collected

    textik = 'По ссылке можешь ознакомиться подробнее'
    bot.send_message(message.chat.id, '---------------')
    bot.send_message(message.chat.id, 'Теперь можешь написать другую')
async def test_browser_session_fail():
    """HTMLSession.browser should not be called within an existing event loop."""
    session = HTMLSession()
    with pytest.raises(RuntimeError):
        session.browser
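# Added note (not from the original test module): inside an already-running event
# loop, requests_html's AsyncHTMLSession is the usual alternative to HTMLSession.
# A minimal sketch assuming the documented async API; the URL is a placeholder.
from requests_html import AsyncHTMLSession

async def fetch_rendered(url='https://example.com'):
    asession = AsyncHTMLSession()
    response = await asession.get(url)
    await response.html.arender()  # JavaScript rendering via the async browser
    return response.html.html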
def __init__(self, main_url):
    self.main_url = main_url
    self.session = HTMLSession()
    self.result_map = set()
def get_fia_data(force: bool = False):
    if not os.path.exists('data/fia'):
        os.makedirs('data/fia')

    html_list = [
        "https://www.fia.com/documents/season/season-2020-1059",
        "https://www.fia.com/documents/season/season-2019-971"
    ]

    races = pd.read_csv('data/ergast/races.csv')
    drivers = pd.read_csv('data/ergast/drivers.csv')
    standings = pd.read_csv('data/ergast/driver_standings.csv')

    session = HTMLSession()
    data = pd.DataFrame()

    for html in html_list:
        r = session.get(html)
        r.html.render()
        race_id = "Unknown"
        year = html.split("-")[-2]
        for line in r.html.text.split("\n"):
            if "Grand Prix" in line and year not in line:
                line = line.replace("Formula 1 ", "")
                race_id = races.loc[(races['year'] == int(year)) & (races['name'] == line)]
                if race_id.empty:
                    print('Warning: Missing race for ' + year + " " + line)
                # print(race_id[['raceId','year','name']])
                # print(line + " = " + race_id)
            if "Offence" in line and "Corrected" not in line:
                # print(line)
                doctored = re.sub(r"^.*?offence - ", "", line.lower())
                doctored = re.sub(r"\)$", "", doctored)
                doctored = doctored.replace(" (", " - ")
                if doctored == "car 8 parc ferme":
                    doctored_list = ["car 8", "parc ferme"]
                elif doctored == "car 26 track limits turn 10 2nd":
                    doctored_list = ["car 26", "track limits turn 18 2nd"]
                else:
                    doctored_list = doctored.split(" - ", maxsplit=1)
                if "car" in doctored_list[0]:
                    offence = pd.DataFrame(
                        [[doctored_list[0], doctored_list[1], line]],
                        columns=["Car", "Warning", "Entire Line"])
                    driver_num = doctored_list[0].split()[1]
                    driver = drivers.loc[drivers['number'] == driver_num]
                    if len(driver) > 1:
                        correct_driver = standings.loc[
                            (standings['raceId'] == race_id['raceId'].iloc[0]) &
                            (standings['driverId'].isin(list(driver['driverId'].values)))]['driverId']
                        driver = driver.loc[driver['driverId'] == correct_driver.iloc[0]]
                    if driver.empty:
                        print(' Warning: No driver found for ' + driver_num)
                    else:
                        driver = driver.rename(columns={'url': 'driver_url'})
                        temp_dataframe = pd.concat([
                            race_id.reset_index(drop=True),
                            driver.reset_index(drop=True),
                            offence.reset_index(drop=True)
                        ], axis=1)
                        data = data.append(temp_dataframe, ignore_index=True)

    data.to_csv("data/fia/driver_offence.csv", index=False)
def fetch_html(url):
    session = HTMLSession()
    response = session.get(url=url)
    return response
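# Usage sketch for fetch_html above (URL and selectors are illustrative only):
# the returned HTMLResponse exposes the parsed page through its .html attribute.
resp = fetch_html('https://example.com')
print(resp.html.find('title', first=True).text)  # page title
print(resp.html.absolute_links)                  # absolute links found on the page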
def get_school_info(school):
    """
    Web scrapes a school's GreatSchools page.
    school: a dictionary
    """
    session = HTMLSession()
    r = session.get(school['link'])
    r.html.render()
    ranking = 0
    num_metrics = 0

    # Get Sections
    test_scores = r.html.find('#TestScores', first=True)
    equity = r.html.find('#Equity', first=True)
    students = r.html.find('#Students', first=True)
    teachers = r.html.find('#TeachersStaff', first=True)

    # Test Scores
    if test_scores:
        school['english_prof'] = 0
        school['math_prof'] = 0
        subjects = test_scores.find('.subject')
        scores = test_scores.find('.score')
        score_zip = zip(subjects, scores)
        for pair in score_zip:
            if pair[0].text == 'English':
                english_prof = percent_check(pair[1].text)
                school['english_prof'] = english_prof
            elif pair[0].text == 'Math':
                math_prof = percent_check(pair[1].text)
                school['math_prof'] = math_prof
        test_scores_avg = (school['english_prof'] + school['math_prof']) / 2

    # Student Demographics
    if students:
        # Racial Demographics
        demographics = {}
        for demo in students.find('.legend-separator'):
            dblock = demo.find('.legend-title')
            ethnicity = dblock[0].text
            percent = percent_check(dblock[1].text)
            demographics[ethnicity] = percent
        if 'Hispanic' not in demographics:
            school['hispanic'] = 0
        else:
            school['hispanic'] = demographics['Hispanic']
        if 'Black' not in demographics:
            school['black'] = 0
        else:
            school['black'] = demographics['Black']
        urm_percent = school['hispanic'] + school['black']
        school['urm_percent'] = urm_percent
        ranking += (urm_percent / 100) * 0.35
        num_metrics += 1

        # English Learners
        ell = students.find('#english-learners', first=True)
        if ell:
            ell_percent = percent_check(ell.find('tspan')[0].text)
            school['ell'] = ell_percent
            ranking += (ell_percent / 100) * 0.15
            num_metrics += 1

        # Low-income
        low_income = students.find(
            '#students-participating-in-free-or-reduced-price-lunch-program',
            first=True)
        if low_income:
            low_income_percent = percent_check(low_income.find('tspan')[0].text)
            school['low_income'] = low_income_percent
            ranking += (low_income_percent / 100) * 0.5
            num_metrics += 1

    # Teachers with 3+ Years Experience
    if teachers:
        teacher_experience = teachers.find('.score')[0].text
        t_exp_percent = percent_check(teacher_experience)

    # Generate ANova Ranking
    if num_metrics == 0:
        school['ranking'] = 0
    else:
        school['ranking'] = ranking / num_metrics

    session.close()
    return school
def parse_url(url):
    # Parse the URL and return the page content
    session = HTMLSession()
    response = session.get(url)
    return response.content.decode()
from requests_html import HTMLSession

response = HTMLSession().get('https://coreyms.com/')
articles = response.html.find('article')

for article in articles:
    headline = article.find('.entry-title-link', first=True).text
    print(headline)
    print()

    summary = article.find('.entry-content p', first=True).text
    print(summary)
    print()

    try:
        video_code = article.find('.embed-youtube iframe', first=True).attrs['src']
        video_code = video_code.split('/')[4]
        video_code = video_code.split('?')[0]
        video_url = f'https://www.youtube.com/watch?v={video_code}'
        print(video_url)
        print()
        print()
    except:
        print('No video.')
        print()
        print()
def retrieve(self, job_state):
    from requests_html import HTMLSession
    session = HTMLSession()
    response = session.get(self.navigate)
    return response.html.html
def set_args_browser_html(self, *args):
    if args:
        parameter = args[0]
        self.js_session = HTMLSession(browser_args=parameter)
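# Hypothetical call sketch for set_args_browser_html: recent requests_html releases
# accept a browser_args list that is passed to the headless Chromium used by
# .render(); the flags and the `scraper` instance name below are assumptions.
scraper.set_args_browser_html(['--no-sandbox', '--disable-gpu'])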
def crawl_one(url):
    author = get_authors()
    try:
        with HTMLSession() as session:
            response = session.get(url)

        name = response.html.xpath('//h1')[0].text
        content = response.html.xpath('//article//p')
        image_url = response.html.xpath('//figure//img/@src')[0]
        pub_date = response.html.xpath('//time/@datetime')
        cats = response.html.xpath(
            '//article//div[@class="ssrcss-1emjddl-Cluster e1ihwmse0"]//ul//li')

        my_content = ''
        short_description = ''
        for element in content:
            my_content += f'<{element.tag}>' + element.text + f'</{element.tag}>'
            if len(short_description) < 200:
                short_description += element.text + ' '

        image_name = slugify(name)
        img_type = image_url.split('.')[-1]
        img_path = f'images/{image_name}.{img_type}'
        with open(f'media/{img_path}', 'wb') as f:
            with HTMLSession() as session:
                response = session.get(image_url)
                f.write(response.content)

        # pub_date = datetime.strptime(pub_date, '%d %B %Y')
        pub_date = datetime.strptime(pub_date[0][0:10], '%Y-%m-%d')

        categories = []
        for cat in cats:
            categories.append({'name': cat.text.strip(), 'slug': slugify(cat.text)})

        article = {
            'name': name,
            'slug': slugify(name),
            'content': my_content,
            'short_description': short_description.strip(),
            'main_image': img_path,
            'pub_date': pub_date,
            'author': author,
        }
        article, created = Article.objects.get_or_create(**article)
        for category in categories:
            cat, created = Category.objects.get_or_create(**category)
            article.categories.add(cat)
        logger.debug(f"Trying to parse {url}")
        print(article)
    except Exception as e:
        logger.debug(f'Trying to parse {url}')
        print(f'[{url}]', e, type(e), sys.exc_info()[-1].tb_lineno)
def _get_download(self):
    s = HTMLSession()
    r = s.get(self._url)
    html = getattr(r, "html")
    download = html.find(".btn.btn-primary.btn-lg.btn-block", first=True)
    return download.attrs["data-href"] if download else ""
import re
from collections import namedtuple

from helpers import cached_in_db
from helpers import cache_in_calendar
from requests_html import HTMLSession

SaintDay = namedtuple('SaintDay', ['month', 'day'])
SaintResult = namedtuple('SaintResult', ['full_name', 'first_name', 'role', 'dates'])

SPLIT_SAINTS_REGEX = re.compile(r"\d+ >")
SPLIT_LINES_REGEX = re.compile(r"\n+ ")
ITALIAN_MONTHS = [
    "gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno",
    "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"
]
DATE_REGEX = re.compile(r"\d{1,2} (?:" + "|".join(ITALIAN_MONTHS) + ")")

SESSION = HTMLSession()


def create_url(letter, page):
    page_label = f'more{page}.html' if page > 1 else ''
    return f"http://www.santiebeati.it/{letter}/{page_label}"


def create_calendar_url(month, day):
    return "http://www.santiebeati.it/{:02d}/{:02d}".format(month, day)


def parse_saint_days(days):
    results = []
    for day in days:
        d, m = day.split()
from utils.email import send_email, checkIfSent
from bs4 import BeautifulSoup
from requests_html import HTMLSession

store_name = "Koodoo"

if checkIfSent(store_name) is False:
    session = HTMLSession()  # create an HTML Session object
    notify = False
    inStock = ""
    link = "https://koodoo.co.za/collections/all-consoles/products/playstation-5-ps5-digital"

    resp = session.get(link)  # Use the object above to connect to needed webpage
    resp.html.render()  # Run JavaScript code on webpage
    soup = BeautifulSoup(resp.html.html, "html.parser")

    try:
        inStock = soup.find(id="addToCartText-product-template").getText()
    except:
        pass  # do nothing if the item doesn't exist

    if inStock.lower() != "sold out":
        notify = True

    print(f"{store_name} has stock: ", notify)

    if notify:
        # Send a notification
        send_email(store_name, link)
def crawler(token):
    session = HTMLSession()
    request = session.get("https://google.com/search?q=" + token)
    for link in request.html.links:
        if re.search("stick=", link):
            crawler2("https://google.com" + link)
def get_from_meteojob(url):
    session = HTMLSession()
    headers = {
        "Host": "www.meteojob.com",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cookie": "ABTasty=uid%3D20012809565792105%26fst%3D1580201817549%26pst%3D1581517299643%26cst%3D1581598588773%26ns%3D3%26pvt%3D11%26pvis%3D1%26th%3D; _fbp=fb.1.1580201818340.973396398; _ga=GA1.2.1214576890.1580201818; cikneeto_uuid=id:811e2d38-8a86-4a72-9b53-d92cb2fb87a7; cto_bundle=Dye-hl9qTnpXZEpDNkh1dURJTHdLdE1zQ1c1NlNJaXJJMEJzb2pmY01yZWdXUyUyRkpDWVB2dmEyR0daSXRsWU9EdElsJTJGJTJCUTElMkZTQkZqOTNBSlVnVzRFMHBycVhyUXhWZnZUdDNueVJ5UjZ4Zk41eXBDTWRvSHZsVlM3bENzQUslMkZ4U3hsako5ajQwdk0xSWpaUm9jJTJCRUFMY1dqdFElM0QlM0Q; __gads=ID=f83e1fa8b3f0b798:T=1580201871:S=ALNI_MZiG2mYjCj0SCw_MdkWUT9HiMCATw; _gid=GA1.2.872070098.1581517301; _tac=false~self|not-available; _ta=fr~4~8d7faf049ef65861988a9f7855c1ca28; exit_modal_closed=true; cikneeto=date:1581598601434; autocomplete_history_job=[{%22type%22:%22JOB%22%2C%22id%22:11540%2C%22label%22:%22D%C3%A9veloppeur%20Big%20Data%20(H/F)%22%2C%22count%22:990%2C%22ambiguous%22:false}]; web_user_id=41a78692-e95a-4835-b086-610970bc4126; ABTastySession=sen%3D3__referrer%3D__landingPage%3Dhttps%3A//www.meteojob.com/candidat/offres/offre-d-emploi-data-scientist-h-f-brest-bretagne-cdi-12007746%3Fscroll%3DaW5kZXg9MTUmdG90YWw9NTkmd2hhdD1EJUMzJUE5dmVsb3BwZXVyK0JpZytEYXRhKyhIJTJGRikmcGFnZT0y; _tty=2083560087699625985; _tas=y8pte1n6i5i; _gat=1; _gat_raw=1",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0"
    }
    # url = "https://www.meteojob.com/candidat/offres/offre-d-emploi-data-scientist-h-f-paris-ile-de-france-cdi-12341413?what=data"

    meteo = session.get(url, headers=headers)
    contenu = meteo.html.find(".mj-offer-details", first=True)
    if contenu is None:
        print(meteo.html.text)
        return None

    annonce = {}
    annonce["Titre"] = "NaN"
    annonce["Date_publication"] = "NaN"
    annonce["intitule"] = "NaN"
    annonce["ville"] = "NaN"
    annonce["code_dep"] = "NaN"
    annonce["Type_contrat"] = "NaN"
    annonce["Exp"] = "NaN"
    annonce["Diplome"] = "NaN"
    annonce["Entreprise"] = "NaN"
    annonce["Salaire"] = "NaN"
    annonce["corps"] = "NaN"
    annonce["Lien"] = "NaN"

    annonce["Titre"] = contenu.find("h1", first=True).text
    annonce["Date_publication_txt"] = contenu.find(".publication-date", first=True).text
    annonce["Date_publication"] = annonce["Date_publication_txt"]

    cont_json = meteo.html.find(".mj-column-content script")
    for j in cont_json:
        if j.attrs["type"] == "application/ld+json":
            cont = j.text
            if cont is not None and cont != "":
                try:
                    cont = json.loads(cont)
                    annonce["Date_publication"] = cont["datePosted"].split("T")[0]
                except Exception as e:
                    # print(cont)
                    # annonce["Date_publication"] = annonce["Date_publication_txt"]
                    pass
        else:
            print(j.attrs["type"])

    items = contenu.find(".matching-criterion-wrapper")
    criteres = []
    for bal in items:
        criteres.append(bal.text)
    for crit in criteres:
        if "(H/F)" in crit:
            annonce["intitule"] = crit
        elif crit.endswith(")"):
            lieu = crit.split("(")
            annonce["ville"] = lieu[0]
            annonce["code_dep"] = lieu[1].replace(')', '')
        elif crit in ("CDI", "CDI-C", "CDD", "Interim", "Stage"):
            annonce["Type_contrat"] = crit
        elif crit.startswith("Expérience"):
            annonce["Exp"] = crit.split(" : ")[1]
        elif crit.startswith("Niveau"):
            annonce["Diplome"] = crit.split(" : ")[1]

    sections = contenu.find("section")
    corps = ""
    for sect in sections:
        if sect.attrs.get("class") and "offer-apply-form" not in sect.attrs.get("class"):
            corps += sect.text + "\n"
        if sect.attrs.get("class") and "company-description" in sect.attrs.get("class"):
            annonce["Entreprise"] = sect.find("h3 span", first=True).text
        elif not sect.attrs.get("class"):
            if sect.find("h3") and sect.find("h3", first=True).text == "Salaire et avantages":
                if sect.find("div", first=True):
                    annonce["Salaire"] = sect.find("div", first=True).text
                elif sect.find("p", first=True):
                    annonce["Salaire"] = sect.find("p", first=True).text
                else:
                    print(sect.html)

    annonce["corps"] = corps
    annonce["Lien"] = url
    return annonce
# ch24_7.py
from requests_html import HTMLSession

session = HTMLSession()                      # create the session
url = 'https://python.org/'
r = session.get(url)                         # get()
txt = r.html.search('Python is a {} language')[0]
print(txt)
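# Companion sketch for ch24_7.py (not part of the original file): .search() returns
# only the first match of the template, while .search_all() returns every match.
for result in r.html.search_all('Python is a {} language'):
    print(result[0])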
def __init__(self):
    self.session = HTMLSession()
    self.user_context_id = None
    self.staff_id = None
    self.logged_in = False
def scraper(tipo,region,a): fechahoy = datetime.datetime.now() fechascrap=str(fechahoy.year)+'-'+str(fechahoy.month)+'-'+str(fechahoy.day) try: link = "https://www.portalinmobiliario.com/venta/"+tipo+"/"+region+"?ca=1&ts=1&mn=1&or=&sf=0&sp=0&at=0&pg=" + str(a) page = requests.get(link, timeout=20, headers={'User-Agent': agentCreator.generateAgent()}) except: return print("continue") for j in range (1,28): proyecto = [] id=None nombre=None tipo=None comuna=None barrio=None direccion=None lat=None lon=None entrega=None propietario=None construye=None arquitecto=None propietario=None vende=None bodega=None bdesde=None bhasta=None bprom=None estacionamiento=None edesde=None ehasta=None eprom=None link2=None tree = html.fromstring(page.content) link2=tree.xpath('//*[@id="wrapper"]/section[2]/div/div/div[1]/article/div[3]/div['+str(j)+']/div[2]/div/div[1]/h4/a') try: link2=link2[0] except: continue link2=str(link2.attrib) link2=link2.split(': ') link2=link2[1] link2=str(link2) link2=link2[2:-2] link2 = "https://www.portalinmobiliario.com/" + link2 try: page2 = requests.get(link2, timeout=30, headers={'User-Agent': agentCreator.generateAgent()}) except: continue print("continue") tree2 = html.fromstring(page2.content) bye=tree2.xpath('//*[@id="project-descr"]/div/div[3]/div/p') try: bye=bye[0] bye=bye.text if ("no" in bye): if ("UF" in bye): try: byes=bye.split(' ') byen=0 for i in byes: if ("bodega" in i): if ("(desde" in byes[byen+1]): bdesde=byes[byen+3] if ("," in bdesde): bdesde=bdesde[:-1] if (")" in bdesde): bdesde=bdesde[:-1] else: bprom=byes[byen+2] if ("," in bprom) or ("." in bprom): bprom=bprom[:-1] if (")" in bprom): bprom=bprom[:-1] if ("hasta" in byes[byen+4]): bhasta=byes[byen+6] if ("," in bhasta) or ("." in bhasta): bhasta=bhasta[:-1] if (")" in bhasta): bhasta=bhasta[:-1] if ("estacionamiento" in i): if ("(desde" in byes[byen+1]): edesde=byes[byen+3] if ("," in edesde) or ("." in edesde): edesde=edesde[:-1] if (")" in edesde): edesde=edesde[:-1] else: eprom=byes[byen+2] if ("," in eprom) or ("." in eprom): eprom=eprom[:-1] if (")" in eprom): eprom=eprom[:-1] if ("hasta" in byes[byen+4]): ehasta=byes[byen+6] if ("," in ehasta) or ("." 
in ehasta): ehasta=ehasta[:-1] if (")" in ehasta): ehasta=ehasta[:-1] if ")" in ehasta: ehasta.split(")") ehasta=ehasta[0] byen=byen+1 bodega=0 estacionamiento=0 except: bodega = 0 estacionamiento = 0 else: bodega=0 estacionamiento=0 else: bodega=1 estacionamiento=1 try: bprom=(float(bdesde)+float(bhasta))/2 except: try: bprom=float(bprom) except: bprom=None try: eprom = (float(edesde) + float(ehasta)) / 2 except: eprom=None try: bdesde=float(bdesde) except: bdesde=None try: bhasta=float(bhasta) except: bhasta=None try: edesde = float(edesde) except: edesde=None try: ehasta = float(ehasta) except: ehasta=None try: eprom = float(eprom) except: eprom=None try: bodega=float(bodega) except: bodega=None try: estacionamiento=float(estacionamiento) except: estacionamiento=None byen=0 byes = bye.split(' ') for j in byes: if ("incluye") in j: if ("no" not in byes[byen-1]) or (byen==0): if ("estacionamiento" in byes[byen+1]): estacionamiento=1 if ("y" in byes[byen+2]): if("bodega" in byes[byen+3]): bodega=1 if("bodega" in byes[byen+4]): bodega=float(byes[byen+3]) if ("estacionamiento") in byes[byen+2]: try: estacionamiento=float(byes[byen+1]) except: estacionamiento=0 if ("y" in byes[byen+3]): if("bodega" in byes[byen+4]): bodega=1 if("bodega" in byes[byen+5]): try: bodega=float(byes[byen+4]) except: bodega=0 if ("bodega" in byes[byen+1]): bodega=1 if ("y" in byes[byen+2]): if("estacionamiento" in byes[byen+3]): estacionamiento=1 if("estacionamiento" in byes[byen+4]): estacionamiento=float(byes[byen+3]) if ("bodega") in byes[byen+2]: try: bodega=float(byes[byen+1]) except: bodega=0 if ("y" in byes[byen+3]): if("estacionamiento" in byes[byen+4]): estacionamiento=1 if("estacionamiento" in byes[byen+5]): try: estacionamiento=float(byes[byen+4]) except: estacionamiento=0 else: byen=byen+1 except: bodega=0 estacionamiento=0 try: bodega=int(bodega) bdesde=float(bdesde) bhasta=float(bhasta) bprom=float(bprom) estacionamiento=int(estacionamiento) edesde=float(edesde) ehasta=float(ehasta) eprom=float(eprom) except: alfa=None tester=tree2.xpath('//*[@id="project-descr"]/div/div[4]/div[2]/div/div[1]/strong') validator=4 try: tester=tester[0].text except: validator=5 for v in range (1,6): tester=tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/strong') try: if("Fecha" in tester[0].text): entrega=tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/text()') entrega=entrega[0] entrega=str(entrega) entrega=entrega[1:] if ("Propietario" in tester[0].text): propietario =tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/text()') propietario=propietario[0] propietario=str(propietario) propietario=propietario[1:] if ("Arquitecto" in tester[0].text): arquitecto=tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/text()') arquitecto=arquitecto[0] arquitecto=str(arquitecto) arquitecto=arquitecto[1:] if ("Construye" in tester[0].text): construye=tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/text()') construye=construye[0] construye=str(construye) construye=construye[1:] if ("Vende" in tester[0].text): vende=tree2.xpath('//*[@id="project-descr"]/div/div['+str(validator)+']/div[2]/div/div['+str(v)+']/text()') vende=vende[0] vende=str(vende) vende=vende[1:] except: continue session = HTMLSession() r = session.get(link2, timeout=20) script=r.html.find('.page-project') texto=script[0] texto=texto.text try: 
texto=texto.split(',') except: continue b=0 for t in texto: if ("lat") in t: try: lat=texto[b] if ("Nombre") not in t: lat=lat.split(': ') lat=lat[1] lat=lat.split(' ') lat=lat[0] lat=float(lat) lon=texto[b+1] lon=lon.split(' ') lon=lon[2] lon=float(lon) id=texto[b+1] id=id.split(':') id=id[2] id=int(id) nombre=texto[b+2] nombre=nombre.split(':') nombre=nombre[1] nombre=nombre[1:-1] tipo=texto[b+3] tipo=tipo.split(':') tipo=tipo[1] tipo=tipo[1:-1] direccion=texto[b+5] direccion=direccion.split(':') direccion=direccion[1] direccion=direccion[1:] if "?" in texto[b+7]: comuna=texto[b+6] comuna=comuna[1:-1] else: comuna=texto[b+8] comuna=comuna[1:-1] barrio=texto[b+7] barrio=barrio[1:] except: b=b+1 continue b=b+1 a=0 proyecto.append(id) proyecto.append(nombre) proyecto.append(tipo) proyecto.append(comuna) proyecto.append(barrio) proyecto.append(direccion) proyecto.append(lat) proyecto.append(lon) proyecto.append(entrega) proyecto.append(propietario) proyecto.append(arquitecto) proyecto.append(construye) proyecto.append(vende) proyecto.append(bodega) proyecto.append(bdesde) proyecto.append(bhasta) proyecto.append(bprom) proyecto.append(estacionamiento) proyecto.append(edesde) proyecto.append(ehasta) proyecto.append(eprom) proyecto.append(link2) proyecto.append(fechascrap) proyecto.append(fechascrap) print(len(proyecto)) print(proyecto) insertarProyecto(proyecto) time.sleep(random.uniform(0.5,1.5)) for t in texto: if ("Numero" in t): try: propiedad = [] n=str(texto[a]) try: n=n.split(':') n=str(n[1]) n=n[1:-1] except: n="nn" try: id3=(int(n)*100000) id3=id3+id except: continue precio=str(texto[a+1]) precio=precio.split(":") precio=str(precio[1]) try: precio=float(precio) except: precio=None dormitorios=str(texto[a+3]) dormitorios=dormitorios.split(":") dormitorios=str(dormitorios[1]) try: dormitorios=int(dormitorios) except: dormitorios=None banos=str(texto[a+4]) banos=banos.split(":") banos=str(banos[1]) banos=int(banos) piso=str(texto[a+5]) piso=piso.split(":") piso=str(piso[1]) try: piso=int(piso) except: piso=None orientacion=str(texto[a+6]) orientacion=orientacion.split(":") orientacion=str(orientacion[1]) orientacion=orientacion[1:-1] util=str(texto[a+8]) util=util.split(":") util=str(util[1]) try: util=float(util) except: util=None total=str(texto[a+9]) total=total.split(":") total=str(total[1]) try: total=float(total) except: total=None terraza=str(texto[a+10]) terraza=terraza.split(":") terraza=str(terraza[1]) try: terraza=float(terraza) except: terraza=None propiedad.append(id) propiedad.append(id3) propiedad.append(n) propiedad.append(precio) propiedad.append(dormitorios) propiedad.append(banos) propiedad.append(piso) propiedad.append(orientacion) propiedad.append(util) propiedad.append(total) propiedad.append(terraza) propiedad.append(fechascrap) print(id) print(id3) print(n) print(precio) print(dormitorios) print(banos) print(piso) print(orientacion) print(util) print(total) print(terraza) print(fechascrap) propiedad.append(fechascrap) insertarDepto(propiedad) except: continue a=a+1 sleep(5) sleep(120)
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd

s = HTMLSession()

searchterm = 'dslr+camera'

# url for amazon website
# url = 'https://www.amazon.co.uk/s?k={searchterm}&qid=1616907527&ref=sr_pg_1'
url = f'https://www.amazon.co.uk/s?k={searchterm}&i=black-friday'


def getdata(url):
    r = s.get(url)
    r.html.render(sleep=1)
    soup = BeautifulSoup(r.html.html, 'html.parser')
    return soup


dealslist = []


def getdeals(soup):
    products = soup.find_all('div', {'data-component-type': 's-search-result'})
    for item in products:
        # Scraping product name
        title = item.find('a', {
def get_flask_TripleDES_Decode(aesKey, text):
    url = f'http://127.0.0.1:8058/decode?key={aesKey}&text={text}'
    session = HTMLSession()
    session.get(url)
    time.sleep(0.5)
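# Variant sketch (an assumption about the local Flask endpoint, not its confirmed
# contract): if /decode returns the decoded text in the response body, the helper
# could return it instead of discarding the response.
def get_flask_TripleDES_Decode_result(aesKey, text):
    url = f'http://127.0.0.1:8058/decode?key={aesKey}&text={text}'
    session = HTMLSession()
    response = session.get(url)
    return response.text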
async def on_message(message):
    print(f"{message.channel}: {message.author}: {message.author.name}: {message.content}")
    sentdex_guild = client.get_guild(405403391410438165)

    author_roles = message.author.roles
    #print(author_roles)
    #author_role_ids = [r.id for r in author_roles]

    if random.choice(range(500)) == 30:
        matches = [r for r in author_roles if r.id in vanity_role_ids]
        #print(matches)
        if len(matches) == 0:
            try:
                role_id_choice = random.choice(vanity_role_ids)
                actual_role_choice = sentdex_guild.get_role(role_id_choice)
                #print(type(message.author))
                author_roles.append(actual_role_choice)
                await message.author.edit(roles=author_roles)
            except Exception as e:
                print('EDITING ROLES ISSUE:', str(e))

    with open(f"{path}/msgs.csv", "a") as f:
        if message.author.id not in chatbots:
            f.write(f"{int(time.time())},{message.author.id},{message.channel}\n")

    with open(f"{path}/log.csv", "a") as f:
        if message.author.id not in chatbots:
            try:
                f.write(f"{int(time.time())},{message.author.id},{message.channel},{message.content}\n")
            except Exception as e:
                f.write(f"{str(e)}\n")

    if "sentdebot.member_count()" == message.content.lower():
        await message.channel.send(f"```py\n{sentdex_guild.member_count}```")

    elif "sentdebot.community_report()" == message.content.lower() and message.channel.id in image_chan_ids:
        online, idle, offline = community_report(sentdex_guild)
        file = discord.File(f"{path}/online.png", filename=f"{path}/online.png")
        await message.channel.send("", file=file)
        await message.channel.send(
            f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```')

    elif "sentdebot.p6()" == message.content.lower():
        await message.channel.send(
            f"```\nThe Neural Networks from Scratch video series will resume when the NNFS book is completed. This means the videos will resume around Sept or Oct 2020.\n\nIf you are itching for the content, you can buy the book and get access to the draft now. The draft is over 500 pages, covering forward pass, activation functions, loss calcs, backward pass, optimization, train/test/validation for classification and regression. You can pre-order the book and get access to the draft via https://nnfs.io```")

    elif "sentdebot.user_activity()" == message.content.lower() and message.channel.id in image_chan_ids:
        # and len([r for r in author_roles if r.id in admins_mods_ids]) > 0:
        file = discord.File(f"{path}/activity.png", filename=f"{path}/activity.png")
        await message.channel.send("", file=file)
        #await message.channel.send(f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```')

    elif "help(sentdebot)" == message.content.lower() or "sentdebot.commands()" == message.content.lower():
        await message.channel.send(commands_available)

    # if it doesn't work later:
    #elif "sentdebot.logout()" == message.content.lower() and message.author.id == 324953561416859658:
    elif "sentdebot.logout()" == message.content.lower() and str(message.author).lower() == "sentdex#7777":
        await client.close()

    elif "sentdebot.gtfo()" == message.content.lower() and str(message.author).lower() == "sentdex#7777":
        await client.close()

    elif "sentdebot.get_history()" == message.content.lower() and str(message.author).lower() == "sentdex#7777":
        channel = sentdex_guild.get_channel(channel_ids[0])
        async for message in channel.history(limit=999999999999999):
            if message.author.id == 324953561416859658:
                with open(f"{path}/history_out.csv", "a") as f:
                    f.write(f"{message.created_at},1\n")

    else:
        query = search_term(message.content)
        if query:
            #query = match.group(1)
            print(query)
            qsearch = query.replace(" ", "%20")
            full_link = f"https://pythonprogramming.net/search/?q={qsearch}"
            session = HTMLSession()
            r = session.get(full_link)

            specific_tutorials = [(tut.text, list(tut.links)[0])
                                  for tut in r.html.find("a")
                                  if "collection-item" in tut.html]

            if len(specific_tutorials) > 0:
                return_str = "\n---------------------------------------\n".join(
                    f'{tut[0]}: <https://pythonprogramming.net{tut[1]}>'
                    for tut in specific_tutorials[:3])
                return_str = (f"```Searching for '{query}'```\n" + return_str +
                              f"\n----\n...More results: <{full_link}>")
                await message.channel.send(return_str)
            else:
                await message.channel.send(f"""```py
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NotFoundError: {query} not found```""")
def start_request(self, url):
    headers = {'user-agent': random.choice(self.USER_AGENT_LIST)}
    session = HTMLSession()
    response = session.get(url, headers=headers)
    return response
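# Hypothetical companion attribute for the class above: USER_AGENT_LIST is assumed
# to be a plain list of user-agent strings from which one is chosen at random.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
]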
def test_is_goodreads_available(self):
    url = 'https://www.bbc.com/news/technology'
    with HTMLSession() as session:
        response = session.get(url)
        self.assertTrue(response.status_code == 200)
def handle(self, *args, **options):
    # List of song-list page URLs
    music_list_page_url_list = [
        "http://textage.cc/score/index.html?sA11B00",
        "http://textage.cc/score/index.html?sB11B00"
    ]
    for music_list_page_url in music_list_page_url_list:
        print(f"■music_list_page_url:{music_list_page_url}")
        music_list_page_session = HTMLSession()
        music_list_page_response = music_list_page_session.get(music_list_page_url)
        # Workaround for UnicodeEncodeError when scraping Shift_JIS pages
        music_list_page_response.content.decode("shift_jis")
        print("before render music_list_page...")
        # Rendering is quite heavy, so a large timeout is needed
        music_list_page_response.html.render(timeout=300)
        print("after render music_list_page!!!")
        # Guard against mojibake (garbled characters)
        music_list_page_response.html.encoding = "utf-8"

        # Locate the rows with find()
        music_tr_list = music_list_page_response.html \
            .find("table")[1] \
            .find("tr")
        first_tr_flg = True
        for music_tr in music_tr_list:
            if first_tr_flg:
                first_tr_flg = False
                continue
            td_second = music_tr.find("td")[1]
            td_second_class = td_second.attrs["class"][0]
            music_deleted_flag = False
            if td_second_class.startswith("x"):
                music_deleted_flag = True
                music_tr_diff_code = td_second_class[2:3]
            else:
                music_tr_diff_code = td_second_class[1:2]

            # Convert the list page's difficulty code to the one stored in the music master
            if music_tr_diff_code == "n":
                music_tr_diff_code_mstcode = "N"
            elif music_tr_diff_code == "h":
                music_tr_diff_code_mstcode = "H"
            elif music_tr_diff_code == "a":
                music_tr_diff_code_mstcode = "A"
            elif music_tr_diff_code == "x":
                music_tr_diff_code_mstcode = "L"
            else:
                raise Exception(f"想定外のmusic_tr_diff_code:{music_tr_diff_code}")

            # Song name
            music_name = music_tr.find("td")[3].text
            print(f"music_name:{music_name}")
            print(f"music_tr_diff_code_mstcode:{music_tr_diff_code_mstcode}")

            # Fetch the music master record
            old_music_mst_list = MstMusic.objects.filter(
                music_name=music_name,
                difficulty_code=music_tr_diff_code_mstcode)
            if len(old_music_mst_list) == 0:
                old_music_mst = None
            elif len(old_music_mst_list) == 1:
                old_music_mst = old_music_mst_list[0]
            else:
                raise Exception("複数レコード取得は想定外")

            if music_deleted_flag:
                if old_music_mst is not None:
                    setattr(old_music_mst, "music_deleted_flag", music_deleted_flag)
                    setattr(old_music_mst, "upd_user", "music-mst-mainte.py")
                    setattr(old_music_mst, "upd_date", timezone.localtime())
                    old_music_mst.save()
                continue
            else:
                # Do nothing if the record already exists
                if old_music_mst is not None:
                    print("MstMusic exists.")
                    continue

            # Chart page URL
            try:
                note_page_url = "http://textage.cc/score/" \
                    + td_second.find("a")[0].attrs["href"]
            except Exception:
                # No <a> tag means the chart page does not exist yet
                continue

            # Access the chart page
            music_page_session = HTMLSession()
            music_page_response = music_page_session.get(note_page_url)
            music_page_response.content.decode("shift_jis")
            print("before render music_page...")
            music_page_response.html.render(timeout=300)
            print("after render music_page!!!")
            music_page_response.html.encoding = "utf-8"

            # Parse the song info (top of the page)
            music_info_top_lines = music_page_response.html.find("nobr")[0]
            music_info_top_line_text_arr = music_info_top_lines \
                .text \
                .split("\n", maxsplit=2)
            for music_info_top_line_text in music_info_top_line_text_arr:
                print(music_info_top_line_text)
            music_info_style_diff = music_info_top_line_text_arr[1] \
                .replace("[", "") \
                .replace("]", "")
            music_info_top_style_diff_arr = music_info_style_diff.split(" ")

            # Replace line breaks with empty strings because some song names (e.g. V2) contain them
            # Handle "V2/ TAKA" -> "V2 / TAKA"
            # TODO: can songs older than V2 (★10) be scraped correctly?
            music_info_top_others = music_info_top_line_text_arr[2] \
                .replace("\n", "") \
                .replace("V2/", "V2 /") \
                .replace(" bpm:", ":::::") \
                .replace(" - ★", ":::::") \
                .replace(" Notes:", ":::::") \
                .split(":::::")
            music_info_top_name_artist_arr = music_info_top_others[0].split(" / ")
            music_info_top_bpm_arr = music_info_top_others[1].split("~")

            # Parse the song info (bottom of the page)
            music_info_bottom_line = music_page_response.html.find("table + font")[0]
            print(music_info_bottom_line.text)
            music_info_bottom_line_text_arr = music_info_bottom_line \
                .text \
                .replace(", ", ",") \
                .split(" ", maxsplit=1)
            # TODO: the DP page has separate left/right note counts, so this is not enough as-is
            music_info_bottom_score_border_arr = music_info_bottom_line_text_arr[0] \
                .split(",")
            music_info_bottom_special_notes_arr = music_info_bottom_line_text_arr[1] \
                .replace("(", "") \
                .replace(")", "") \
                .split(" ")

            # Song name already set (music_name)
            # Deleted flag already set (music_deleted_flag)
            # Artist name
            artist_name = music_info_top_name_artist_arr[1]
            # Genre
            genre = music_info_top_line_text_arr[0].replace("\"", "")
            # Difficulty (NORMAL, HYPER, ANOTHER or LEGGENDARIA)
            difficulty_and_difficulty_code_map = {
                "NORMAL": "N",
                "HYPER": "H",
                "ANOTHER": "A",
                "LEGGENDARIA": "L",
            }
            difficulty_code = difficulty_and_difficulty_code_map.get(
                music_info_top_style_diff_arr[1])
            # BPM
            bpm_min = int(music_info_top_bpm_arr[0])
            bpm_max = int(music_info_top_bpm_arr[0]) if len(music_info_top_bpm_arr) == 1 \
                else int(music_info_top_bpm_arr[1])
            # Level (★)
            level = int(music_info_top_others[2])
            # SP total note count
            sp_notes_num_all = int(music_info_top_others[3])
            # SP special note count (scratch)
            sp_notes_num_scr = 0
            # SP special note count (charge notes)
            sp_notes_num_cn = 0
            # SP special note count (backspin scratch)
            sp_notes_num_bss = 0
            for special_notes_info in music_info_bottom_special_notes_arr:
                special_notes_info_arr = special_notes_info.split("=")
                if special_notes_info_arr[0] == "SCR":
                    sp_notes_num_scr = int(special_notes_info_arr[1])
                elif special_notes_info_arr[0] == "CN":
                    sp_notes_num_cn = int(special_notes_info_arr[1])
                elif special_notes_info_arr[0] == "BSS":
                    sp_notes_num_bss = int(special_notes_info_arr[1])
                else:
                    raise Exception(f"想定外の特殊ノーツ]{special_notes_info_arr[0]}")
            # Score borders
            sp_score_border_aaa = int(
                music_info_bottom_score_border_arr[0].replace("AAA:", ""))
            sp_score_border_aa = int(
                music_info_bottom_score_border_arr[1].replace("AA:", ""))
            sp_score_border_a = int(
                music_info_bottom_score_border_arr[2].replace("A:", ""))

            # Calculate and set the DB columns
            # TODO: CN and BSS count as two notes (start and end)
            db_notes_num_scr = 0
            db_notes_num_cn = sp_notes_num_cn * 2
            db_notes_num_bss = 0
            db_notes_num_all = sp_notes_num_all * 2 \
                - sp_notes_num_scr \
                - sp_notes_num_bss * 2
            db_score_max = db_notes_num_all * 2
            db_score_border_maxminus = math.ceil(db_notes_num_all * 2 * 8.5 / 9)
            db_score_border_aaa = math.ceil(db_notes_num_all * 2 * 8 / 9)
            db_score_border_aa = math.ceil(db_notes_num_all * 2 * 7 / 9)
            db_score_border_a = math.ceil(db_notes_num_all * 2 * 6 / 9)

            # TODO: handle updates
            mst_music = MstMusic()
            setattr(mst_music, "music_name", music_name)
            setattr(mst_music, "music_deleted_flag", music_deleted_flag)
            setattr(mst_music, "artist_name", artist_name)
            setattr(mst_music, "genre", genre)
            setattr(mst_music, "difficulty_code", difficulty_code)
            setattr(mst_music, "bpm_min", bpm_min)
            setattr(mst_music, "bpm_max", bpm_max)
            setattr(mst_music, "level", level)
            setattr(mst_music, "sp_notes_num_all", sp_notes_num_all)
            setattr(mst_music, "sp_notes_num_scr", sp_notes_num_scr)
            setattr(mst_music, "sp_notes_num_cn", sp_notes_num_cn)
            setattr(mst_music, "sp_notes_num_bss", sp_notes_num_bss)
            setattr(mst_music, "sp_score_max", 9999)  # TODO: calc and set
            setattr(mst_music, "sp_score_border_maxminus", 9999)  # TODO: calc and set
            setattr(mst_music, "sp_score_border_aaa", sp_score_border_aaa)
            setattr(mst_music, "sp_score_border_aa", sp_score_border_aa)
            setattr(mst_music, "sp_score_border_a", sp_score_border_a)
            setattr(mst_music, "dp_notes_num_all", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_notes_num_scr", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_notes_num_cn", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_notes_num_bss", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_score_max", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_score_border_maxminus", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_score_border_aaa", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_score_border_aa", 9999)  # TODO: calc and set
            setattr(mst_music, "dp_score_border_a", 9999)  # TODO: calc and set
            setattr(mst_music, "db_notes_num_all", db_notes_num_all)
            setattr(mst_music, "db_notes_num_scr", db_notes_num_scr)
            setattr(mst_music, "db_notes_num_cn", db_notes_num_cn)
            setattr(mst_music, "db_notes_num_bss", db_notes_num_bss)
            setattr(mst_music, "db_score_max", db_score_max)
            setattr(mst_music, "db_score_border_maxminus", db_score_border_maxminus)
            setattr(mst_music, "db_score_border_aaa", db_score_border_aaa)
            setattr(mst_music, "db_score_border_aa", db_score_border_aa)
            setattr(mst_music, "db_score_border_a", db_score_border_a)
            setattr(mst_music, "db_withscr_notes_num_all", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_notes_num_scr", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_notes_num_cn", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_notes_num_bss", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_score_max", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_score_border_maxminus", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_score_border_aaa", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_score_border_aa", 9999)  # TODO: calc and set
            setattr(mst_music, "db_withscr_score_border_a", 9999)  # TODO: calc and set
            setattr(mst_music, "note_page_url", note_page_url)
            setattr(mst_music, "ins_user", "music-mst-mainte.py")
            setattr(mst_music, "ins_date", timezone.localtime())
            setattr(mst_music, "upd_user", "music-mst-mainte.py")
            setattr(mst_music, "upd_date", timezone.localtime())
            mst_music.save()
            music_page_response.close()
def getHTMLwithJavascriptContent(url):
    session = HTMLSession()
    resp = session.get(url)
    resp.html.render(timeout=20)
    return resp.html.html
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from requests_html import HTMLSession
from urllib.request import urlopen as uReq
import time

session = HTMLSession()

your_exec_path = r"C:\Users\PARULEKAR\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=your_exec_path)
driver.get("https://www.who.int/publications/en/")

article_link = driver.find_elements_by_xpath('//a[@class="buffet_headline"]')
for i in range(len(article_link)):
    print("---------------------------------------------------------------------------------------------------")
    abc = article_link[i].get_attribute("href")
    r = session.get(abc)
    html = r.html.html
    pageSoup = soup(html, "html.parser")
    element = pageSoup.find("li", {"first"})
    try:
        pdf_url = element.a.get('href')
        print(pdf_url)
    except:
        print("pdf not found")
    # try:
    #     element = pageSoup.find("li", {"first"})
    #     print(element)
    #     try:
def UploadImageAsset(client, url, image_ref_on_file, image_name, width, height):
    """Uploads the image from the specified url.

    Args:
      client: An AdWordsClient instance.
      url: The image URL.

    Returns:
      The ID of the uploaded image.
    """
    # Initialize appropriate service.
    asset_service = client.GetService('AssetService', version='v201809')

    # Download the image.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    session__ = HTMLSession()
    """
    image_request = session__.get(url, headers=headers, verify=True)
    #print("URL: "+ url)
    print(image_request.content)
    print(image_request.html)
    """
    print(url)
    tab = url.split('&')
    #print(type(url))
    image_request = session__.get(tab[0], headers=headers, verify=True)
    #print(tab[0])
    #image_asset = BytesIO(urlopen(tab[0]).read())
    image_asset = image_request.content
    #print(image_asset)

    # Create the image asset.
    try:
        source = tinify.tinify.tinify.from_url(url)
        #print(source)
        resized_image = source.resize(method="fit", width=int(width), height=int(height))
        data = resized_image.to_file(image_ref_on_file)
        #print(sys.getsizeof(data))
        #print(data)
    except:
        try:
            source = tinify.tinify.tinify.from_url(url)
            print(source)
            resized_image = source.resize(method="fit", width=int(width), height=int(height))
            data = resized_image.to_file(image_ref_on_file)
            print(sys.getsizeof(data))
            #print(data)
        except Exception as e:
            print(e)

    print(image_name)
    file_url = url_for('uploaded_file', filename=image_name, _external=True)
    image_asset = {
        'xsi_type': 'ImageAsset',
        'imageData': urlopen(file_url).read(),
        # This field is optional, and if provided should be unique.
        # 'assetName': 'Image asset ' + str(uuid.uuid4()),
    }

    # Create the operation.
    operation = {'operator': 'ADD', 'operand': image_asset}

    # Create the asset and return the ID.
    result = asset_service.mutate([operation])
    return result['value'][0]['assetId']