def get_article_for_packerswire(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'articleBody'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
def get_index_components(query: str = 'https://stooq.pl/q/i/?s=mwig40'):
    '''
    Returns the components of the index specified by `query`
    (for the Polish Stock Exchange (GPW) that would be, for example, mWIG40).

    For the time being, `query` defaults to mWIG40 data; pointing it at
    another index (e.g. WIG20) should be straightforward.
    '''
    htmlContents = []
    companies = []
    html = get(query)
    soup = Soup(html)
    data_table = soup.find('tbody')
    for i in data_table:
        if i.find('font') is None:
            continue
        htmlContents.append(i.find('font'))
    for element in htmlContents[0]:
        if element.find('a') is None:
            continue
        companies.append(re.findall(">(.*)</a>", str(element.find('a')))[0])
    return companies
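# Usage sketch for get_index_components. The WIG20 URL below follows the
# same stooq.pl URL scheme as the default but is an assumption; verify it
# against the live site before relying on it:
#
#   mwig40 = get_index_components()
#   wig20 = get_index_components('https://stooq.pl/q/i/?s=wig20')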
def get_article_for_dev_to(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'id': 'article-body'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
def test_get_headers():
    url = "https://httpbin.org/headers"
    UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0"
    headers = {"User-Agent": UA}
    content = get(url, headers=headers)
    user_agent = json.loads(content)["headers"]["User-Agent"]
    assert user_agent == UA
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = [parse_stat_row(row) for row in rows]
    return data
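# get_game_stats assumes a parse_stat_row helper defined elsewhere. A
# minimal, purely hypothetical sketch of what it might do; the real helper
# likely returns a richer record tailored to the BoxScore markup:
def parse_stat_row(row):
    # Hypothetical sketch: strip markup and split a stat line into fields
    return row.remove_tags().split()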
def get_article(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('p')
        return _arranged(lines)
    except Exception:
        return ''
def comprobar_estado(usuario, sl):
    url_user = url + usuario + '/?hl=es'
    try:
        html = get(url_user)
    except Exception:
        return None
    html_s = Soup(html)
    a = html_s.find('meta', attrs={'property': 'og:description'})
    dic = a.attrs
    n = dic['content'].replace('-', ',')
    # fix the ',' thousands separators inside the numbers
    new_n = n[0]
    for i in range(1, len(n)):
        if (n[i - 1].isdigit() and n[i] == ','
                and i + 1 < len(n) and n[i + 1].isdigit()):
            new_n += '.'
        else:
            new_n += n[i]
    n = new_n
    n = n.split(',')
    data = (x.strip() for x in n)
    # e.g. ('431 Followers', '872 Following', '294 Posts',
    #       'See Instagram photos and videos from JP (@juanpedro)')
    seguidores, seguidos, publicaciones, usuario = data
    *_, usuario = usuario.split(' ')  # '@juanpedro'
    if '(' in usuario:
        usuario = usuario[1:-1]
    if sl:
        nombre, tipo = paser_selenium(url_user)
    else:
        nombre, tipo = 'name?', ('Selenium is not installed; '
                                 'cannot determine the account type...')
    return usuario, nombre, seguidores, seguidos, publicaciones, tipo
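# Illustration of the digit-comma repair above, with a made-up
# og:description value: a ',' between two digits becomes '.', so the later
# split(',') breaks only between fields, not inside numbers.
#
#   '1,234 Followers, 56 Following, 78 Posts, ...'
#   -> '1.234 Followers, 56 Following, 78 Posts, ...'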
def get_currencies_rates(url: str) -> dict:
    """Return dictionary with exchange rates"""
    result: Dict[str, dict] = {}
    response = None
    try:
        response = get(url)
    except (HTTPError, URLError) as err:
        logging.exception(err)
    if response:
        soup = Soup(response)
        currencies = [
            cur.find("a") for cur in soup.find("div", {"class": NAME})
        ]
        buy = soup.find("div", {"class": BUY})
        buy_values = [value.find("div", {"class": NUM}).text for value in buy]
        sale = soup.find("div", {"class": SALE})
        sale_values = [
            value.find("div", {"class": NUM}).text for value in sale
        ]
        for cur, buy_num, sale_num in zip(currencies, buy_values, sale_values):
            cur = cur.text.split()[-1]
            result[cur] = {"buy": float(buy_num), "sale": float(sale_num)}
    return result
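# Usage sketch for get_currencies_rates, assuming NAME, BUY, SALE, and NUM
# are module-level constants holding the target site's CSS class names
# (the URL below is a placeholder):
#
#   rates = get_currencies_rates('https://example.com/exchange-rates')
#   for currency, nums in rates.items():
#       print(currency, nums['buy'], nums['sale'])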
def scrape_position_cbs(position):
    url = f'https://www.cbssports.com/fantasy/hockey/stats/{position}/2019/season/projections/'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'class': 'TableBase-table'})
    pdf = pd.read_html(str(table))[0]
    return pdf
def scrape():
    url = request.args.get('url')
    html = get(url)
    soup = Soup(html)

    def meta_content(prop=None, name=None):
        """Return the content of a matching <meta> tag, else None."""
        tag = None
        if prop is not None:
            tag = soup.find('meta', attrs={'property': prop}, mode='first')
        if tag is None and name is not None:
            tag = soup.find('meta', attrs={'name': name}, mode='first')
        return tag.attrs['content'] if tag is not None else None

    name = meta_content(prop='og:title')
    if name is None:
        name = soup.find('title', mode='first').text
    description = meta_content(prop='og:description', name='description')
    image = meta_content(prop='og:image', name='image')
    price = meta_content(prop='og:price:amount', name='price')
    vendor = meta_content(prop='og:site_name', name='site_name')

    return {
        "name": name if name is not None else '',
        "description": description if description is not None else '',
        "image": image if image is not None else '',
        "price": price if price is not None else '',
        "vendor": vendor if vendor is not None else '',
        "dest_url": url,
    }, 200
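# scrape() reads `url` from the query string and returns a (dict, status)
# pair, so it is presumably wired up as a Flask route. Hypothetical
# registration and call (the app object and route path are not shown here):
#
#   app.add_url_rule('/scrape', view_func=scrape)
#   # GET /scrape?url=https://example.com/product -> OG metadata as JSON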
def get_article_for_packers(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'nfl-c-body-part nfl-c-body-part--text'})
        return _arranged(lines)
    except Exception:
        return ''
def existe_usuario(usuario):
    url_user = url + usuario + '/?hl=es'
    try:
        html = get(url_user)
    except Exception:
        return None
    return html
def fetch_sale():
    url = "https://scrape.world/books"
    html = get(url)
    soup = Soup(html)
    books_raw = soup.find("div", {"class": "book-"})
    books = [parse(book) for book in books_raw]
    on_sale = [name for name, price in books if price == 0.99]
    return "\n".join(on_sale)
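# fetch_sale relies on a parse helper that returns a (name, price) pair per
# book card. A minimal sketch, modelled on gazpacho's own scrape.world
# example; verify the markup if the site has changed:
def parse(book):
    # name lives in an <h4>; price is a leading-currency string in a <p>
    name = book.find("h4").text
    price = float(book.find("p").text[1:].split(" ")[0])
    return name, price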
def scrape_position_numberfire(position):
    url = f'https://www.numberfire.com/nhl/fantasy/yearly-projections/{position}'
    html = get(url)
    soup = Soup(html)
    # mode='all' guarantees a list even if only one table matches
    tables = soup.find('table', {'class': 'projection-table'}, mode='all')
    names = pd.read_html(str(tables[0]))[0]
    data = pd.read_html(str(tables[1]))[0]
    df = pd.concat([names, data], axis=1)
    return df
def scrape():
    from gazpacho import get, Soup
    url = 'https://front.njpwworld.com/search/latest?page=465'
    soup = Soup(get(url))
    movie_areas = soup.find('div', {'class': 'movieArea'})
    links = [m.find('a') for m in movie_areas]
    for link in links:
        movie_id = link[0].attrs['href'].replace('/p/', '')
        url = f'{endpoint_movie}{movie_id}'
        res = requests.post(url)
def nonLoggingSearch(self, name):
    base_url = 'https://m.facebook.com'
    nameAndSurname = name.split(' ')
    firstName = nameAndSurname[0]
    lastName = nameAndSurname[1]
    url = base_url + '/public/' + firstName + '+' + lastName
    cont = get(url, headers={
        ':authority:': 'www.facebook.com',
        ':method:': 'GET',
        ':scheme:': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7',
        'cache-control': 'max-age=0',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'viewport-width': '1536',
    })
    content = requests.get(url).text
    soup = bs4.BeautifulSoup(content, "html.parser")
    linesWithData = []
    hrefs = []
    links = []
    with open('fbcontent.txt', 'wb') as file:
        file.write(bytes(cont, encoding='utf-8'))
    with open('fbcontent.txt', 'rb') as file:
        for line in file:
            line = str(line, encoding='utf-8')
            if line.find('hidden_elem') != -1 and line.startswith('<div class="hidden_elem">'):
                linesWithData.append(line)
            elif line.find('href="/') != -1:
                linesWithData.append(line)
    tags = []
    for elem in linesWithData:
        for i in range(len(elem)):
            if elem[i] == '<':
                tag = ''
                j = i
                # copy characters up to and including the closing '>'
                while j < len(elem) and elem[j] != '>':
                    tag = tag + elem[j]
                    j = j + 1
                if j < len(elem) and elem[j] == '>':
                    tag = tag + elem[j]
                tags.append(tag)
    for tag in tags:
        if tag.find('href') != -1:
            hrefs.append(tag)
    for href in hrefs:
        split = href.split(' ')
        for part in split:
            if (part.find('href') != -1 and part.find('https://') == -1
                    and part.find('=https') == -1
                    and (part.lower().find(nameAndSurname[0].lower()) != -1
                         or part.lower().find(nameAndSurname[1].lower()) != -1)):
                corrected_link = self.getUrlFromHref(part)
                if corrected_link is not None and not corrected_link.endswith('/photos'):
                    links.append(base_url + corrected_link)
    links = set(links)
    return links
def get_boxscore_urls(date):
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%Y-%m-%d")
    url = f"{base}/nba/events/date/{date}"
    html = get(url)
    soup = Soup(html)
    games = soup.find("div", {'class': "Layout__content"}).find('a', mode='all')
    urls = [base + game.attrs['href'] for game in games]
    return urls
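# Usage sketch chaining the two helpers: pull a date's boxscore URLs, then
# fetch stats for each game (`base` is assumed to be the module-level site
# root used above):
#
#   urls = get_boxscore_urls(pd.Timestamp("2020-01-15"))
#   stats = [get_game_stats(u) for u in urls]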
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = []
    for row in rows:
        try:
            data.append(parse_stat_row(row))
        except Exception:
            # skip rows that do not parse cleanly
            pass
    return data
def scrape_daily_faceoff():
    URL = 'https://www.dailyfaceoff.com/fantasy-hockey-projections/'
    html = get(URL)
    soup = Soup(html)
    frames = []
    for table_id in ['igsv', 'igsv-1']:
        table = soup.find(
            'table', {'id': f'{table_id}-1N8XNZpOIb8-6WcOPANqSHRyHBXlwZ6X_1vgGyDbETm4'})
        frames.append(pd.read_html(str(table))[0])
    # pd.DataFrame.append was removed in pandas 2.0; concat instead
    df = pd.concat(frames).reset_index(drop=True)
    return df
def yahoo_draft_rankings():
    URL = 'https://www.fantasypros.com/nhl/adp/overall.php'
    html = get(URL)
    soup = Soup(html)
    df = pd.read_html(str(soup.find('table')))[0]
    df[['first', 'last', 'team']] = df['Player Team'].str.split(' ', n=2, expand=True)
    df['name'] = df['first'] + ' ' + df['last']
    df.columns = [c.lower() for c in df.columns]
    df = df[['name', 'yahoo']]
    return df
def make_soup(date):
    if not isinstance(date, pd.Timestamp):
        date = pd.Timestamp(date)
    params = {
        "StationID": 31688,
        "Year": date.year,
        "Month": date.month,
        "Day": date.day,
    }
    html = get(url, params)
    soup = Soup(html)
    return soup
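# Usage sketch -- `url` is assumed to be a module-level endpoint that
# accepts StationID/Year/Month/Day query parameters:
#
#   soup = make_soup("2020-02-01")                # plain strings are coerced
#   soup = make_soup(pd.Timestamp("2020-02-01"))  # Timestamps pass through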
def get_places(episode):
    url = episode
    html = get(url)
    soup = Soup(html)
    table = soup.find('div', {'class': 'entry-content'})
    links = table.find('a')
    place = [i.text for i in links][:-6]
    places = [i for i in place if 'map' not in i]
    return places
def scrape_script(episode):
    url = 'https://seinfeldscripts.com/' + str(episode)
    html = get(url)
    soup = Soup(html)
    table = soup.find('div', {'id': 'content'})
    script = table.find('p')
    scrip = [i.remove_tags() for i in script]
    lines = same_line(scrip)
    scri = [i.replace('\n', '') for i in lines]
    spaces = [re.sub(r' +', ' ', i) for i in scri]
    lines = same_line(spaces)
    bracks = [re.sub(r'\[.*?\]', '', i) for i in lines]
    return bracks
def download(player_id):
    url = f'https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ['_'.join(col) for col in df.columns]
    df['name'] = soup.find('h1').text
    df['player_id'] = player_id
    meta = soup.find('div', {'id': 'meta'}).find('p', mode='first').remove_tags()
    df['position'] = meta.split(': ')[1].split(' •')[0]
    return df
def download_player(player_id):
    url = f"https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020"
    html = get(url)
    soup = Soup(html)
    table = soup.find("table", {"id": "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ["_".join(col) for col in df.columns]
    df["name"] = soup.find("h1").text
    df["player_id"] = player_id
    meta = soup.find("div", {"id": "meta"}).find("p", mode="first").remove_tags()
    df["position"] = meta.split(": ")[1].split(" •")[0]
    return df
def mick_rijmwoordenboek(word: str, n_words: int):
    url = f"https://rijmwoordenboek.nl/rijm/{word}"
    html = get(url)
    soup = Soup(html)
    results = soup.find("div", {"id": "rhymeResultsWords"}).html.split("<br />")
    # clean up
    results = [r.replace("\n", "").replace(" ", "") for r in results]
    # filter html and empty strings
    results = [r for r in results if ("<" not in r) and (len(r) > 0)]
    return random.sample(results, min(len(results), n_words))
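# Example: draw up to five Dutch rhymes for 'fiets'; fewer come back if the
# site lists fewer, and the order varies because of random.sample:
#
#   print(mick_rijmwoordenboek("fiets", 5))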
def capfriendly():
    frames = []
    for page in range(1, 10 + 1):
        url = f'https://www.capfriendly.com/browse/active/2020/salary&hide=team,clauses,position,handed,expiry-status,caphit,skater-stats,goalie-stats&p={page}'
        html = get(url)
        soup = Soup(html)
        frames.append(pd.read_html(str(soup.find('table')))[0])
        time.sleep(0.5)
    # pd.DataFrame.append was removed in pandas 2.0; concat the pages instead
    df = pd.concat(frames)
    df['PLAYER'] = df['PLAYER'].apply(
        lambda x: re.split(r"\d{1}|\d{2}|\d{3}", x)[-1].replace('. ', ''))
    df['SALARY'] = df['SALARY'].apply(
        lambda x: x.replace('$', '').replace(',', ''))
    df['SALARY'] = df['SALARY'].apply(float)
    df.columns = ['name', 'age', 'salary']
    df = df.reset_index(drop=True)
    return df
def json(self) -> dict:
    """
    The JSON file for this match
    """
    if self.json_file:
        with open(self.json_file, "r") as f:
            return json.loads(f.read())
    else:
        try:
            return get(self.json_url)
        except HTTPError as e:
            if e.code == 404:
                raise PageNotFoundException(
                    e.code,
                    f"Match {self.id} not found. Check that the id is correct.",
                )
            # urllib's HTTPError exposes .msg, not .message
            raise PyCricinfoException(e.code, e.msg)
def download_player_ids():
    players = []
    for letter in tqdm(string.ascii_lowercase):
        if letter == 'x':  # no 'x' player index on hockey-reference
            continue
        url = f'https://www.hockey-reference.com/players/{letter}/'
        html = get(url)
        soup = Soup(html)
        strong = soup.find('strong')
        for s in strong:
            try:
                player = s.find('a').attrs['href'].split('.')[0].split('/')[-1]
                players.append(player)
            except Exception:
                pass
        time.sleep(1)
    return players
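# Usage sketch combining the two hockey-reference helpers (mind the site's
# rate limits; the slice keeps this example small):
#
#   player_ids = download_player_ids()
#   gamelogs = pd.concat(
#       [download_player(pid) for pid in player_ids[:5]],
#       ignore_index=True,
#   )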