def from_warc(warc_record):
    """
    Extracts relevant information from a WARC record. This function does not
    invoke scrapy but only uses the article extractor.
    :return:
    """
    raw_stream = warc_record.raw_stream.read()
    encoding = None
    try:
        encoding = warc_record.http_headers.get_header(
            'Content-Type').split(';')[1].split('=')[1]
    except Exception:
        pass
    if not encoding:
        encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
    if not encoding:
        # assume utf-8
        encoding = 'utf-8'
    html = raw_stream.decode(encoding)
    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    download_date = warc_record.rec_headers.get_header('WARC-Date')
    article = NewsPlease.from_html(html, url=url, download_date=download_date)
    return article
def getOneEntry(searchTerm):
    searchTerm = searchTerm.replace('\n', '')
    response = requests.get(
        urlSearchTemplate.format(searchTerm.replace(' ', '%20')))
    if response.ok:
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)
        result = processHtml(soup, searchTerm)
        if "/tpl" in result[0]:
            result = getOneEntry2(result[1], result[0])
        resultSplited = result.split(',')
        if len(resultSplited) == 3:
            # split() already yields str, so no decode() is needed here
            resultSplited = [i.strip() for i in resultSplited]
            nome = resultSplited[0]
            status = resultSplited[1]
            # accepted name is the third comma-separated field
            nome_aceito = resultSplited[2]
            return nome, status, nome_aceito
        else:
            return '', '', ''
    else:
        return 'Bad Response!'
def scrape_page(url):
    '''
    NOTE: this throws away any links that can't be addons
    (ie: assumes we're not going any deeper)
    '''
    resp = None
    links = set()
    if url.endswith('.jpg') or url.endswith('.png') or url.endswith(
            '.gif') or url.endswith('.rar'):
        return set()
    head = time_wrapper(requests.head, (url, ), t=3)
    if head:
        try:
            cl = int(head.headers['Content-Length'])
        except Exception:
            cl = -1
        if cl < 1000000:
            resp = time_wrapper(requests.get, (url, ), t=3)
    if not resp:
        return set()
    netloc = urlparse(url).netloc.split(':')[0]
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    for link in soup.find_all('a', href=True):
        if ".zip" in link['href'] or 'github' in link['href']:
            href = link['href']
            if not href.startswith('http'):
                href = 'http://' + netloc + '/' + href
            if can_be_repo(href):
                links.add(href)
    return links
def getHTML(url, verb=False):
    '''
    This function takes a url as an input and returns the corresponding
    bs4 object
    '''
    from bs4.dammit import EncodingDetector
    try:
        re = session.get(url, headers=headers, timeout=(10, 30))
    except Exception:
        print('problem here')
        return None
    else:
        if re.status_code == 200:
            # dealing with encoding
            http_encoding = re.encoding if 'charset' in re.headers.get(
                'content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(
                re.content, is_html=True)
            encoding = html_encoding or http_encoding
            # generating BeautifulSoup object
            bsObj = BeautifulSoup(re.content, 'html5lib', from_encoding=encoding)
            if verb:
                print("The title of html is %s" % bsObj.title.getText())
            return bsObj
        else:
            return None
def scrape_politifact_article(story_url):
    resp = requests.get(story_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup.find("div", "article__text").get_text()
def getLinks():
    parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
    for i in range(1, 100):
        if os.path.exists('pdfs/' + str(i)):
            print(str(i), 'already exists')
            continue
        resp = requests.get("https://quizbowlpackets.com/" + str(i))
        http_encoding = resp.encoding if 'charset' in resp.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
        links = []
        allLinks = soup.find_all('a', href=True)
        combined = ['pdf' in link['href'] for link in allLinks]
        if not any(combined):
            print(str(i), 'doesn\'t exist')
            continue
        for link in allLinks:
            link = link['href']
            if 'Packet' in link:
                links.append(link)
        print(links)
        with open('pdfs/' + str(i), 'wb') as file:
            pickle.dump(links, file)
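# Usage sketch (not part of the original code): getLinks() above pickles each
# page's packet links to 'pdfs/<i>'; a saved list could be read back like this.
# The helper name and argument are illustrative assumptions.
def load_links(page_number):
    # Load the pickled list of packet URLs written by getLinks().
    with open('pdfs/' + str(page_number), 'rb') as file:
        return pickle.load(file)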
def get_soup(self, _page=0):
    """ scrape web-site page """
    # get request
    self.__response = self.get_request()
    if self.__verbose:
        _log.debug(f'self.__response={self.__response}')
    # get encoding
    _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
        'content-type', '').lower() else None
    _html_encoding = EncodingDetector.find_declared_encoding(
        self.__response.content, is_html=True)
    # get soup
    self.__soup = None
    try:
        if self.__verbose:
            _log.debug(f'Getting soup from self.__response.text')
        self.__soup = BeautifulSoup(self.__response.text,
                                    features='html5lib',
                                    from_encoding=(_html_encoding or _http_encoding))
        if self.__verbose:
            _log.debug(f'Got soup from self.__response.text OK')
    except Exception as e:
        self.__soup = None
        if self.__verbose:
            _log.error(
                f'Failed to get soup from self.__response.text, error={e}')
def from_warc(warc_record, decode_errors="replace"):
    """
    Extracts relevant information from a WARC record. This function does not
    invoke scrapy but only uses the article extractor.
    :return:
    """
    raw_stream = warc_record.raw_stream.read()
    encoding = None
    try:
        encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
    except Exception:
        pass
    if not encoding:
        encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
    if not encoding:
        # assume utf-8
        encoding = 'utf-8'
    try:
        html = raw_stream.decode(encoding, errors=decode_errors)
    except LookupError:
        # non-existent encoding: fall back to utf-8
        html = raw_stream.decode('utf-8', errors=decode_errors)
    if not html:
        raise EmptyResponseError()
    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    download_date = warc_record.rec_headers.get_header('WARC-Date')
    article = NewsPlease.from_html(html, url=url, download_date=download_date)
    return article
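# Usage sketch for from_warc() above. It assumes the records come from
# warcio's ArchiveIterator, a common pairing with NewsPlease; the file path
# and helper name are placeholders, not part of the original code.
from warcio.archiveiterator import ArchiveIterator

def articles_from_warc_file(warc_path):
    # Yield one extracted article per HTTP response record in the archive.
    with open(warc_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response':
                yield from_warc(record)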
def otherTeams(sumOfQ):
    remainigTeams = 16 - sumOfQ
    finalTeams = []
    if remainigTeams <= 0:
        return []
    else:
        randomTeams = sample(range(10), remainigTeams)
        randomTeamsIndex = sample(range(remainigTeams + 2), remainigTeams)
        randomTeamSelection = choices(range(len(league_qlf_list)), k=remainigTeams)
        for i in range(0, remainigTeams):
            temp_x = randomTeamSelection[i]
            try:
                tempClubsLst = []
                url = 'https://www.worldfootball.net' + league_qlf_list[temp_x]
                source = requests.get(url, headers=header)
                http_encoding = source.encoding if 'charset' in source.headers.get(
                    'content-type', '').lower() else None
                html_encoding = EncodingDetector.find_declared_encoding(
                    source.content, is_html=True)
                encoding = html_encoding or http_encoding
                soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
                find_boxS = soup.find('div', class_="scrollable_tables")
                the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
                for theTeamAtag in the_team_table.find_all('a', href=True):
                    if theTeamAtag.text:
                        tempClubsLst.append(theTeamAtag.text)
                y = randomTeamsIndex[i]
                teamNames = tempClubsLst[y]
                finalTeams.append(teamNames)
            except Exception as e:
                print(e)
        return finalTeams
def getSteam(self, q, size):
    querys = q.replace(" ", "+")
    url = ('https://store.steampowered.com/search/?term=' + str(querys) +
           '&category1=998')
    resp = requests.get(url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding, features="lxml")
    print(url)
    SteamLinkList = []
    # find links to apps
    for link in soup.find_all('a', href=re.compile('app')):
        # remove duplicates
        if link['href'] not in SteamLinkList:
            SteamLinkList.append(link['href'])
    # remove first two irrelevant links
    return SteamLinkList[2:size + 2]
def desi_crawler(u_r_l):
    web_list = []
    url = u_r_l
    web_list.append(url)
    domain = url
    if "www." not in domain:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[1]
    else:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[2]
    for url in web_list:
        response = requests.get(url)
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, from_encoding=encoding)
        for link in soup.find_all('a', href=True):
            if domain in link['href']:
                if link['href'] not in web_list:
                    web_list.append(link['href'])
    # return the accumulated list of same-domain links
    return web_list
def top4leagues(leagueList, index):
    rangeOfWork = team_qulfied[index]
    defaultLst = []
    temIndex = sample(range(12), rangeOfWork)
    for i in range(rangeOfWork):
        clubsIndex = randint(0, 1)
        try:
            tempClubsLst = []
            url = 'https://www.worldfootball.net' + leagueList[clubsIndex]
            source = requests.get(url, headers=header)
            http_encoding = source.encoding if 'charset' in source.headers.get(
                'content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(
                source.content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
            find_boxS = soup.find('div', class_="scrollable_tables")
            the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
            for theTeamAtag in the_team_table.find_all('a', href=True):
                if theTeamAtag.text:
                    tempClubsLst.append(theTeamAtag.text)
            y = temIndex[i]
            teamNames = tempClubsLst[y]
            defaultLst.append(teamNames)
            tempClubsLst.pop()
        except Exception as e:
            print(e)
    return defaultLst
def getIMDB(self, queryi):
    url = ('https://www.imdb.com/search/keyword/?keywords=' + str(queryi) +
           '&ref_=fn_kw_kw_1&mode=detail&page=1&sort=moviemeter,asc')
    resp = requests.get(url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding, features="lxml")
    print(url)
    imdbLinkList = []
    # find links to titles
    for link in soup.find_all('a', href=re.compile('title')):
        # remove irrelevant links
        if ("vote" not in link['href'] and "search" not in link['href']
                and "plotsummary" not in link['href']):
            # remove duplicates
            if 'https://www.imdb.com' + link['href'] not in imdbLinkList:
                imdbLinkList.append('https://www.imdb.com' + link['href'])
    return imdbLinkList
def prepare_complete_links(url):
    http_regex = re.compile(r'http')
    page = requests.get(url)
    http_encoding = page.encoding if 'charset' in page.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(page.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding=encoding)
    complete_links = []
    for alink in soup.find_all('a', href=True):
        if http_regex.search(alink['href']) is not None:
            complete_links.append(alink['href'])
            print(http_regex.search(alink['href']).group() + "---" + alink['href'])
        elif 'javascript' not in alink['href'] and len(alink['href'].strip()) > 0:
            if alink['href'][:1] == '/':
                temp_link = TWM_DOMAIN + alink['href']
                complete_links.append(temp_link)
                print("need http" + "---" + alink['href'])
            else:
                temp_link = TWM_DOMAIN + "/" + alink['href']
                complete_links.append(temp_link)
    return list(set(complete_links))
def compile_links(web_address):
    '''
    compile_links accesses a webpage at a given address, finds all of the
    links on that page, and appends certain links to a list called links_list.
    compile_links works together with find_diffraction_files to get only the
    relevant links. The input is a web address.
    '''
    html_page = requests.get(web_address)
    http_encoding = (html_page.encoding if 'charset' in
                     html_page.headers.get('content-type', '').lower() else None)
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding,
                         features="html.parser")
    links_list = []
    permutation_attempt = soup(text=re.compile("Now trying variations on your request:"))
    if len(permutation_attempt) != 0:
        return links_list
    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu' + link['href'])
    return links_list
def get_all_uic_links_from_url(base_url, h=None):
    resp = requests.get(base_url, headers=headers)
    base_url = resp.url
    if is_url_end_point(base_url):
        return [], ""
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    uic_link_list = []
    for link in soup.find_all('a', href=True):
        if is_url_end_point(link['href']):
            continue
        target_url = ''
        o = urlparse(link['href'])
        if "uic.edu" in o.netloc:
            target_url = link['href'].rstrip('/')
        elif not is_absolute(link['href']):
            target_url = urllib.parse.urljoin(base_url, link['href']).rstrip('/')
        target_url = target_url.replace("http:", "https:")
        if target_url != '':
            uic_link_list.append(target_url)
    return list(set(uic_link_list)), h.handle(resp.text)
def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(
        url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    return BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)
def doc_encoding(self) -> str:
    http_encoding = self.doc.encoding if "charset" in self.doc.headers.get(
        "Content-Type", "").lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(
        self.doc.content, is_html=True)
    encoding: str = str(html_encoding or http_encoding)
    self.sdoc.encoding = encoding
    return encoding
def get_text(html):
    # Detect encoding and extract plain text from page
    encoding = EncodingDetector.find_declared_encoding(html, is_html=True)
    soup = BeautifulSoup(html, "lxml", from_encoding=encoding)
    for script in soup(["script", "style"]):
        script.extract()
    return soup.get_text(" ", strip=True)
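# Minimal usage sketch for get_text() above, assuming the raw page bytes come
# from a requests response; any source of undecoded HTML bytes would do.
def example_get_text(url):
    resp = requests.get(url)
    # Pass the raw bytes so a declared <meta charset> can still be detected.
    return get_text(resp.content)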
def get_html_title(self, page, record):
    try:
        encoding = EncodingDetector.find_declared_encoding(page, is_html=True)
        soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
        title = soup.title.string.strip()
        return title
    except Exception:
        return ""
def grab_projects(self, resp):
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    links = [self.BASE_URL + link['href']
             for link in soup.find_all('a', href=True)
             if link['href'].startswith("/projects/")]
    self.add_to_queue(urls=links, website_name=self.NAME)
    return len(links)
def get_html(url):
    headers = {"User-Agent": USERAGENT}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    # apply the detected encoding so resp.text is decoded with it
    if encoding:
        resp.encoding = encoding
    return resp.text
def get_soup_for_url(base_url):
    resp = requests.get(base_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    return soup
def get_source_html(url):
    headers = {"User-Agent": 'Chrome'}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    webpage = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return webpage
def get_html_text_body(self, page, record):
    try:
        encoding = EncodingDetector.find_declared_encoding(page, is_html=True)
        soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
        for script in soup(["script", "style"]):
            script.extract()
        return soup.get_text(" ", strip=True)
    except Exception:
        return ""
def get_soup_html(url, headers=GET_HEADER):
    resp = SESSION.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup
def getSoup(matchUrl):
    res = requests.get(matchUrl)
    res.raise_for_status()
    http_encoding = res.encoding if 'charset' in res.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(res.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    return bs4.BeautifulSoup(res.content, 'lxml', from_encoding=encoding)
def get_text(self, record):
    try:
        content = record.content_stream().read()
        encoding = EncodingDetector.find_declared_encoding(content, is_html=True)
        soup = BeautifulSoup(content, "lxml", from_encoding=encoding)
        # strip all script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        return soup.get_text(" ", strip=True)
    except Exception:
        return ""
def get_the_soup(self):
    if self.url is None:
        # Early exit
        return
    resp = requests.get(self.url, proxies=urllib.request.getproxies())
    http_encoding = (resp.encoding if "charset" in resp.headers.get(
        "content-type", "").lower() else None)
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, "lxml", from_encoding=encoding)
    adjust_anchors(soup)
    return soup
def getData(searchTerm, offset=0,
            inputFile=os.path.join('data', 'ListaMacrofitasResult.csv')):
    response = requests.get(urlSearchTemplate.format(searchTerm, offset))
    registries = []
    not_found_registries = []
    if response.ok:
        # Fetch encoding used from source
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)
        divs = soup.findAll("div", {"class": "record"})
        try:
            page_hint = soup.find("div", {"id": "div_hint_summary"})
        except Exception:
            writeNotFoundOutput(searchTerm)
            return
        # If no results are found, write searchTerm to file
        if page_hint is None or "Nenhum registro encontrado" in page_hint.find(
                "b").text:
            writeNotFoundOutput(searchTerm)
        else:
            hints = page_hint.findAll('ll')
            offset = int(hints[1].text)
            max_registries = int(hints[2].text)
            for div in divs[1:]:
                scientificName, municipality, state, country, latitude, longitude, date = parseDiv(div)
                registries.append('{},{},{},{},{},{},{}'.format(
                    searchTerm, municipality, state, country, latitude,
                    longitude, date))
            writeOutput(registries)
            if offset < max_registries:
                getData(searchTerm, offset)
    else:
        response.raise_for_status()
def get_html_encoding(html):
    return EncodingDetector.find_declared_encoding(html, is_html=True,
                                                   search_entire_document=False)
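# The HTTP-header/meta-tag fallback repeated in most of the snippets above can
# be factored into one helper. This is only a sketch under the same
# assumptions (a requests response object and bs4's EncodingDetector); the
# helper name is illustrative.
def detect_response_encoding(resp):
    # Prefer the encoding declared inside the HTML itself, then fall back to
    # the charset from the Content-Type header, if either is present.
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    return html_encoding or http_encoding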