import json
import os

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def handler(event, context):
    class Job:
        def __init__(self, title, summary):
            self.title = title
            self.summary = summary

    search_string = event['queryStringParameters']['search']
    location = event['queryStringParameters']['state']
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    monster_url = f'https://www.monster.com/jobs/search?q={search_string}&where={location}&page=1'
    # render=True asks ScraperAPI to execute JavaScript before returning the page.
    page = client.get(url=monster_url, render=True)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_='results-page')
    job_elems = results.find_all('div', class_='results-card')
    jobs = {"jobs": []}
    for job_elem in job_elems:
        title_elem = job_elem.find('div', class_='title-company-location')
        summary_elem = job_elem.find('div', class_='results-card-description')
        jobs['jobs'].append(Job(title_elem.text.strip(), summary_elem.text.strip()))
    # Serialize the Job instances through their __dict__ attributes.
    result = json.dumps(jobs, default=lambda x: x.__dict__)
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': result
    }
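# Local smoke test for the Lambda-style handler above; the event mirrors API
# Gateway's queryStringParameters shape, and the values are illustrative only.
if __name__ == '__main__':
    fake_event = {'queryStringParameters': {'search': 'software engineer', 'state': 'TN'}}
    print(handler(fake_event, None)['statusCode'])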
import json

from scraper_api import ScraperAPIClient


# Excerpted class method: the `cls` parameter implies an @classmethod decorator
# in the original class body.
def get_proxies_from_scraper_api(cls, proxy_count=10):
    client = ScraperAPIClient('3af7d62e85b75e0271d32f245107a240')
    proxies = set()
    # http://httpbin.org/ip echoes the IP each request was routed through.
    # The original used range(1, proxy_count), which makes one request too few.
    for i in range(proxy_count):
        result = client.get(url='http://httpbin.org/ip').text
        json_data = json.loads(result)
        print(json_data)
        proxies.add(json_data["origin"])
    # Note: the set may hold fewer than proxy_count entries if IPs repeat.
    print(proxies)
    return proxies
import json
from random import randint
from typing import Dict, Union

from scraper_api import ScraperAPIClient

# `Base` and `USER_AGENTS` are assumed to be defined elsewhere in the source module.


class ScraperApi(Base):
    def __init__(self, key: str, max_retry):
        self.max_retry = max_retry
        self.client = ScraperAPIClient(key)

    def get(self, url: str, headers: Dict[str, str]):
        if not headers:
            headers = {}
        # Rotate the User-Agent if the caller did not supply one.
        if "User-Agent" not in headers.keys():
            headers["User-Agent"] = USER_AGENTS[randint(0, len(USER_AGENTS) - 1)]
        return self.client.get(url, headers, retry=self.max_retry).text

    def get_json(self, url: str, headers: Union[Dict[str, str], None]):
        return json.loads(self.get(url, headers))
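# A minimal usage sketch of the wrapper above, assuming the enclosing module
# defines Base and USER_AGENTS; the key and URL are placeholders, not values
# from the source:
#
#   api = ScraperApi(key='YOUR_SCRAPERAPI_KEY', max_retry=3)
#   print(api.get_json('http://httpbin.org/ip', headers=None))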
import argparse
import json

import requests
from scraper_api import ScraperAPIClient


def main():
    parser = argparse.ArgumentParser(description='Parses command line arguments')
    parser.add_argument('--scraper_api_key', type=str, required=True)
    args = parser.parse_args()

    client = ScraperAPIClient(args.scraper_api_key)
    result = json.loads(client.get(url='http://httpbin.org/ip').text)
    print('Rotated proxy IP address = ' + result['origin'])

    # scrapyGet returns a ScraperAPI proxy URL wrapping the target page.
    urls = [
        client.scrapyGet(url='http://quotes.toscrape.com/page/1/'),
        client.scrapyGet(url='http://quotes.toscrape.com/page/2/'),
    ]
    for url in urls:
        r = requests.get(url)
        # add parsing logic here (see the sketch below)
        print(r.status_code)
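# The loop above leaves parsing as a stub. A minimal sketch of extracting the
# quote texts from quotes.toscrape.com markup (the div.quote / span.text
# selectors match that site's public structure):
from bs4 import BeautifulSoup


def parse_quotes(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    return [q.find('span', class_='text').get_text(strip=True)
            for q in soup.find_all('div', class_='quote')]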
from random import randint
from time import sleep

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def check_availability(URL, scraper_api_key):
    # Randomized delay to avoid being blocked.
    sleep(randint(30, 40))
    client = ScraperAPIClient(scraper_api_key)
    result = client.get(URL)
    parsed_html = BeautifulSoup(result.content, 'html.parser')
    availability = parsed_html.select("#availability")
    if availability:
        availability = availability[0].getText().strip().splitlines()[0]
        if availability != "Currently unavailable.":
            return True
    return False
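# Illustrative driver for check_availability, assuming a valid ScraperAPI key;
# the product URL is a placeholder, not one from the source.
if __name__ == '__main__':
    url = 'https://www.amazon.com/dp/EXAMPLEASIN'
    available = check_availability(url, 'YOUR_SCRAPERAPI_KEY')
    print('Available' if available else 'Not available')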
def product_scraper(url):
    from scraper_api import ScraperAPIClient
    from bs4 import BeautifulSoup
    import json

    client = ScraperAPIClient('9aa1dbc863b8334850efccb9be3552f8')
    try:
        page = client.get(url=url, render=True)
    except Exception:
        result = {'status_code': 500, 'status': 'scraper api fatal error',
                  'elapsed_time': '', 'price': '', 'title': ''}
        return json.dumps(result)
    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'error',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'price': '', 'title': ''}
        return json.dumps(result)

    soup = BeautifulSoup(page.content, 'html.parser')
    result_title = soup.findAll("span", {"class": "pdp-mod-product-badge-title"})
    result_price = soup.findAll("div", {"class": "pdp-product-price"})
    if result_title and result_price:
        title = result_title[0].text
        # Price text looks like "RM1,234.56"; strip the currency prefix and commas.
        price = float(result_price[0].find_next('span').text.strip('RM').replace(',', ''))
        result = {'status_code': page.status_code, 'status': 'success',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'title': title, 'price': price}
    else:
        result = {'status_code': 500, 'status': 'blocked/nocontent',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'title': '', 'price': ''}
    return json.dumps(result)
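# Illustrative call; the key embedded above is the source's, while the Lazada
# product URL here is a placeholder.
if __name__ == '__main__':
    print(product_scraper('https://www.lazada.com.my/products/example-product.html'))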
from scraper_api import ScraperAPIClient

# `gen_url` and its arguments, `settings`, `logger` and `store_to_db` are
# assumed to be defined in the enclosing module.


def store_product_info(product_keys):
    # The client can be created once rather than per key.
    client = ScraperAPIClient(settings.SCRAPER_API_KEY)
    for key in product_keys:
        url = gen_url(base_url, count, offset, page, store_id, key)
        res_json = None
        try:
            result = client.get(url=url)
            if result.status_code == 200:
                res_json = result.json()
            elif result.status_code == 500:
                raise Exception("Request not successful, status: 500")
            elif result.status_code == 403:
                raise Exception("Plan max request exceeded, status: 403")
            elif result.status_code == 404:
                raise Exception("Request not found, status: 404")
            elif result.status_code == 410:
                raise Exception("Request gone or deleted, status: 410")
        except Exception as e:
            logger.error("failed to fetch product info, error: " + str(e))
        # Only persist if the fetch produced a payload; the original referenced
        # res_json even when the request had failed, risking an unbound name.
        if res_json is not None:
            try:
                store_to_db(res_json["products"])
            except Exception as e:
                logger.error("failed to save product info, error: " + str(e))
from os import getenv

from bs4 import BeautifulSoup
from sanic.response import html  # assumed framework: `html(...)` matches Sanic's response helper
from scraper_api import ScraperAPIClient


async def graph(request):
    # getenv already returns None when the variable is unset.
    scraper_api_key = getenv("SCRAPER_API_KEY")
    # Example url where the pair graph can be fetched:
    # url = 'https://gov.capital/forex/usd-eur/'
    pair = 'usd-eur'
    url = 'https://gov.capital/forex/{}/'.format(pair)
    client = ScraperAPIClient(scraper_api_key)
    request_result = client.get(url, render=True).text
    soup = BeautifulSoup(request_result, 'html.parser')
    # Remove all divs containing ads from the fetched page source.
    for ads in soup.find_all("div", {"class": "code-block code-block-2"}):
        ads.decompose()
    cleaned_graph_html = (
        '<script src="https://code.jquery.com/jquery-3.5.1.slim.min.js" '
        'integrity="sha256-4+XzXVhsDmqanXGHaHvgh1gMQKX40OUvDEBTu8JcmNs=" '
        'crossorigin="anonymous"></script>\n'
        '<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.4.0/Chart.min.js"></script>'
    )
    cleaned_graph_html = cleaned_graph_html + str(soup.find('canvas').next)
    return html(cleaned_graph_html)
import json
import os

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def handler(event, context):
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    URL = "https://www.indeed.com/jobs?q=Entry+Level+Software+Engineer&l=Remote"
    page = client.get(url=URL)
    print(page)  # debug output
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)  # debug output
    results = soup.find(id='resultsCol')
    print(results)  # debug output
    job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')
    titles = []
    companies = []
    summaries = []
    for job_elem in job_elems:
        title_elem = job_elem.find('h2', class_='title')
        company_elem = job_elem.find('div', class_='sjcl')
        summary_elem = job_elem.find('div', class_='summary')
        titles.append(title_elem.text.strip())
        companies.append(company_elem.text.strip())
        summaries.append(summary_elem.text.strip())
    response = {
        "titles": titles,
        "companies": companies,
        "summaries": summaries
    }
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': json.dumps(response)
    }
import json
import os

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def handler(event, context):
    class Job:
        def __init__(self, title, company, summary):
            self.title = title
            self.company = company
            self.summary = summary

    search_string = event['queryStringParameters']['search']
    location = event['queryStringParameters']['state']
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    URL = f"https://www.indeed.com/jobs?q={search_string}&l={location}"
    page = client.get(url=URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='resultsCol')
    job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')
    jobs = {"jobs": []}
    for job_elem in job_elems:
        title_elem = job_elem.find('h2', class_='title')
        company_elem = job_elem.find('div', class_='sjcl')
        summary_elem = job_elem.find('div', class_='summary')
        jobs['jobs'].append(Job(title_elem.text.strip(),
                                company_elem.text.strip(),
                                summary_elem.text.strip()))
    result = json.dumps(jobs, default=lambda x: x.__dict__)
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        # Collapse escaped newlines left in the scraped text.
        'body': result.replace("\\n", " ")
    }
import json
import os

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def handler(event, context):
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    monster_url = "https://www.monster.com/jobs/search?q=Software+Engineer&where=Nashville%2C+TN&page=6"
    page = client.get(url=monster_url, render=True)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_='results-page')
    job_elems = results.find_all('div', class_='results-card')
    titles = []
    summaries = []
    for job_elem in job_elems:
        title_elem = job_elem.find('div', class_='title-company-location')
        summary_elem = job_elem.find('div', class_='results-card-description')
        titles.append(title_elem.text.strip())
        summaries.append(summary_elem.text.strip())
    response = {"titles": titles, "summaries": summaries}
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': json.dumps(response)
    }
import random

import requests
from scraper_api import ScraperAPIClient

import proxies
from proxies import random_proxy


def random_line(fname):
    lines = open(fname).read().splitlines()
    return random.choice(lines)


client = ScraperAPIClient('d0224166b175ddca1f18dd5b5cca66a5')
# Send a user agent drawn at random from user-agents.txt. The original used the
# header name 'useragent'; the standard header is 'User-Agent'.
result = client.get(url='http://httpbin.org/headers',
                    headers={'User-Agent': random_line('user-agents.txt')})
print(result.text)
from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


def useScraperAPI(URL):
    client = ScraperAPIClient('d249e87aac326eaa5e728aafaa319d10')
    page = client.get(url=URL)
    page = BeautifulSoup(page.text, 'html.parser')
    return page
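# Hedged usage sketch: fetch a page through useScraperAPI and print its <title>.
# The URL is illustrative, not one from the source.
if __name__ == '__main__':
    page = useScraperAPI('http://quotes.toscrape.com/')
    title = page.find('title')
    print(title.text if title else 'no <title> found')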
import json
import re

from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient


class snkrs:
    def __init__(self) -> None:
        self.products = {}

    def getInfoInnvictus(self, apiKey='aba275ef5f8f713e086a1a0ab240dd5c'):
        """Retrieve a dict of launching shoes from innvictus.com/lanzamientos."""
        self.client = ScraperAPIClient(apiKey)
        result = self.client.get(url='https://www.innvictus.com/lanzamientos')
        assert result.status_code == 200, f"Status code {result.status_code}"
        soup = BeautifulSoup(result.content, 'html.parser')
        links = soup.find('body').find_all("script", attrs={'type': 'text/javascript'})
        s = links[3]  # links[3] holds the `var products` declaration
        productVar = re.search(r'\'(.*?)\'', str(s))
        # Rewrap the single-quoted JS object string as a JSON array before parsing.
        productDict = json.loads(
            productVar.group(0).replace("'{\"id", "[{\"id").replace("\"}'", "\"}]"))
        for tenis in productDict:
            tenis['url'] = "https://www.innvictus.com/p/{id}".format(id=tenis['id'])
        self.products = productDict

    def selectSaveTargetShoes(self, save=False):
        for i, shoes in enumerate(self.products):
            print(f"Selection {i}")
            print("Shoe: ", shoes['name2'])
            print("Launching date: ", shoes['realdate'])
            print("Price: ", shoes['price'], "\n")
        selectedShoe = [int(x) for x in input("Select shoe number to track\n").split()]

        def takeFromDict(dct, listElements):
            return [dct[element] for element in listElements]

        self.products = takeFromDict(self.products, selectedShoe)
        if save:
            with open('TargetShoes.txt', 'w') as f:
                for shoe in self.products:
                    f.write(shoe['url'])
                    f.write("\n")

    def isAvailable(self, fileName=None):
        def sendDiscordMessage(self, message='Hello world'):
            from discord import Webhook, RequestsWebhookAdapter
            webhook = Webhook.from_url(
                "https://discord.com/api/webhooks/811294684386426890/3SY7GtmBAwyjDM6qr73eDRqrzjRJZ2u0vlShoyOvvf_cCUNvv6YJqiGPI1udVWWqipVp",
                adapter=RequestsWebhookAdapter())
            webhook.send(message)

        def notFoundMarkers(soup):
            # "No encontrado" / "Este producto no" are the Spanish "product not
            # found" markers on innvictus.com; they stay verbatim to match the
            # page copy.
            try:
                notFoundClass = soup.find('body').find(
                    "div", attrs={'class': 'pdp-notFound'}).text.strip()
                notFoundTitle = soup.find('title').text.strip()
            except AttributeError:
                notFoundClass = ''
                notFoundTitle = ''
            return notFoundClass, notFoundTitle

        if fileName is None:
            for url in self.products:
                resp = self.client.get(url['url'])
                assert resp.status_code == 200, f"Status code {resp.status_code}"
                notFoundClass, notFoundTitle = notFoundMarkers(
                    BeautifulSoup(resp.content, 'html.parser'))
                if "No encontrado" in notFoundTitle or "Este producto no" in notFoundClass:
                    url['Available'] = False
                    # The original built tuples here, which webhook.send cannot post.
                    message = f"{url['name2']} not available yet"
                    print(message)
                    return False
                else:
                    message = f"{url['name2']} available {url['url']}"
                    print(message)
                    url['Available'] = True
                    sendDiscordMessage(self, message=message)
                    return True
        else:
            with open(fileName, 'r') as targetShoes:
                urls = [url.strip() for url in targetShoes]
            for url in urls:
                # self.headers is assumed to be set elsewhere on the instance.
                resp = self.client.get(url, headers=self.headers)
                notFoundClass, notFoundTitle = notFoundMarkers(
                    BeautifulSoup(resp.content, 'html.parser'))
                if "No encontrado" in notFoundTitle or "Este producto no" in notFoundClass:
                    print(url, " not available yet\n")
                else:
                    print(url, " available \n")
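# Illustrative driver for the snkrs workflow above (interactive selection;
# assumes the ScraperAPI key baked into getInfoInnvictus is still valid):
if __name__ == '__main__':
    bot = snkrs()
    bot.getInfoInnvictus()
    bot.selectSaveTargetShoes(save=True)
    bot.isAvailable()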
import time
import re

import pandas as pd
from bs4 import BeautifulSoup

# Use a proxy API to simplify web scraping
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('api_key')

# Loop over multiple search results pages: 30 results per page * 10 pages
object_name = []
object_href = []
for i in range(10):
    if i == 0:
        url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C+OR&ns=1'
    else:
        url = ('https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C%20OR&start='
               + str(i * 30))
    main_page = client.get(url).text
    soup = BeautifulSoup(main_page, 'lxml')
    subpages = soup.select('.text-size--inherit__373c0__2fB3p .link-color--inherit__373c0__3dzpk')
    for j in subpages[2:32]:
        object_name.append(j.string)
        object_href.append(j.get('href'))

# DataFrame: master list of business names and business links
object_href = ['https://www.yelp.com' + url for url in object_href]
object_data = list(zip(object_name, object_href))
object_df = pd.DataFrame(data=object_data, columns=['name', 'url'])

# Inspect duplicated values
print(sum(object_df.duplicated(subset=['name'], keep='first')))
# Drop duplicated values (see the completion sketch below)
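# Completing the trailing comment above under its obvious reading: drop
# duplicate business names, keeping the first occurrence (standard pandas API).
object_df = object_df.drop_duplicates(subset=['name'], keep='first').reset_index(drop=True)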
from scraper_api import ScraperAPIClient


def testErrorCode(url):
    client = ScraperAPIClient('XXXXXXXXXXXXXXXXXXXXXX')
    return client.get(url=url)
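# Sketch of exercising testErrorCode against httpbin's status endpoints to see
# how ScraperAPI surfaces upstream HTTP errors; the endpoints are our choice,
# not the source's.
if __name__ == '__main__':
    for code in (200, 404, 500):
        resp = testErrorCode(f'http://httpbin.org/status/{code}')
        print(code, '->', resp.status_code)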
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('60d3e592477ebfd328a11e92ce6600c9')
result = client.get(url='http://httpbin.org/ip').text
print(result)
import os

from scraper_api import ScraperAPIClient

# HEADERS is assumed to be a module-level dict of default request headers.


def proxy_request(url):
    client = ScraperAPIClient(os.environ.get('PROXY_API_KEY'))
    rsp = client.get(url=url, headers=HEADERS)
    return rsp
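# A minimal sketch of the assumed missing pieces, purely for illustration:
#
#   HEADERS = {'User-Agent': 'Mozilla/5.0'}
#   os.environ['PROXY_API_KEY'] = 'YOUR_SCRAPERAPI_KEY'
#   print(proxy_request('http://httpbin.org/ip').text)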
from time import sleep
import time
from random import randrange

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('********************')
result = client.get(
    url='https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1',
    render=True).text
print("starting getting information from the web ...")

# scrapyGet wraps the target URL in a ScraperAPI proxy URL for use in Scrapy.
start_urls = [client.scrapyGet(
    url='https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1',
    render=True)]


# Excerpted Scrapy callback; running it requires `import scrapy` and a Spider class.
def parse(self, response):
    yield scrapy.Request(
        client.scrapyGet(
            url='https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1',
            render=True),
        self.parse)


soup = BeautifulSoup(result, 'html.parser')
Adresse_text = []
Adresse_source = soup.find(
    'div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'})
from datetime import datetime
from time import time, sleep
from typing import List, Union
from urllib import parse

from bs4 import BeautifulSoup as Soup
from scraper_api import ScraperAPIClient

# GOOG_URL (the Google search endpoint) and DATE_FMT (the tbs date format) are
# assumed to be module-level constants.


class GoogleNews:
    def __init__(self, key: str, lang="en", period="", ua=""):
        assert key != ""
        self.client = ScraperAPIClient(key)
        self.user_agent = ua
        self.__texts = []
        self.__titles = []
        self.__links = []
        self.__results = []
        self.__lang = lang
        self.__period = period
        self.__exec_time = 0

    def set_lang(self, lang):
        self.__lang = lang

    def search(self, q: Union[List[str], str], p: Union[List[int], int],
               start: datetime, end: datetime) -> List[dict]:
        """Search terms in Google News and collect the result pages into __results.

        Parameters:
            q = the search term(s)
            p = the page number(s) to retrieve
        """
        start_time = time()
        if isinstance(q, str):
            q = [q]
        if isinstance(p, int):
            p = [p]
        elif len(p) < 1:
            p = [1]
        for query in q:
            for page in p:
                out = self.scrape_page(query, page, start, end)
                for o in out:
                    if o["title"] not in self.__titles:
                        # The original never recorded titles, so this dedup
                        # check could not fire.
                        self.__titles.append(o["title"])
                        self.__results.append(o)
                        self.__links.append(o["link"])
                        self.__texts.append(o["title"] + " " + o["desc"])
        self.__exec_time = time() - start_time
        return self.__results

    def scrape_page(self, q: str, page: int, start: datetime, end: datetime, attempts=0):
        """page = number of the page to be retrieved"""
        payload = {
            'q': q,
            'lr': f'lang_{self.__lang}',
            'tbs': f"lr:lang_1{self.__lang}",
            'tbm': 'nws',
            'start': (10 * (page - 1)),
        }
        out: List[dict] = []
        if start is not None and end is not None:
            payload['tbs'] += f",cdr:1,cd_min:{start.strftime(DATE_FMT)}," \
                              f"cd_max:{end.strftime(DATE_FMT)}"
        try:
            # The original reused `page` for the response body, shadowing the
            # page-number parameter; a separate name avoids that.
            body = self.client.get(url=GOOG_URL + "?" + parse.urlencode(payload)).text
            content = Soup(body, "html.parser")
        except Exception as e:
            attempts += 1
            if attempts > 5:
                print(f"ERROR TRYING TO LOAD CONTENT: {e}")
                raise e
            sleep(0.1 * attempts)
            # Return the retry's results; the original discarded them.
            return self.scrape_page(q, page, start, end, attempts)
        try:
            result = content.find_all("div", id="search")[0].find_all("g-card")
        except IndexError:
            # no results were found
            return out
        for item in result:
            try:
                out.append({
                    "title": item.find("div", {"role": "heading"}).text.replace("\n", ""),
                    "link": item.find("a").get("href"),
                    "media": item.findAll("g-img")[1].parent.text,
                    "date": item.find("div", {"role": "heading"})
                                .next_sibling.findNext('div').findNext('div').text,
                    "desc": item.find("div", {"role": "heading"})
                                .next_sibling.findNext('div').text.replace("\n", ""),
                    "image": item.findAll("g-img")[0].find("img").get("src")
                })
            except Exception:
                pass
        return out

    def get_results(self) -> List[dict]:
        """Returns the __results."""
        return self.__results

    def get_text(self) -> List[str]:
        """Returns only the __texts of the __results."""
        return self.__texts

    def get_links(self) -> List[str]:
        """Returns only the __links of the __results."""
        return self.__links

    def clear(self):
        self.__texts = []
        self.__links = []
        self.__results = []
        self.__titles = []
        self.__exec_time = 0
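# Hedged usage sketch (key, query and date range are illustrative; GOOG_URL and
# DATE_FMT must be defined as described above):
if __name__ == '__main__':
    gn = GoogleNews(key='YOUR_SCRAPERAPI_KEY')
    for hit in gn.search('python', 1, datetime(2021, 1, 1), datetime(2021, 1, 31))[:3]:
        print(hit['title'], '->', hit['link'])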
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient
from tqdm import tqdm

# `logger`, `_engine` and `pagination` are assumed to be defined in the
# enclosing module. The three near-identical parsing blocks of the original
# are factored into the two helpers below.


def _log_exception(e, logging_flag):
    """Forward an exception plus its source location to the module logger."""
    tb = sys.exc_info()[2]
    logger.writeError(e, None, _engine, logging_flag,
                      tb.tb_frame.f_code.co_filename, tb.tb_lineno)


def _parse_scholar_item(item):
    """Build one result record from a Google Scholar [data-lid] node."""
    try:
        abstract = item.select('.gs_rs')[0].get_text()
    except (IndexError, AttributeError):
        abstract = str(['No information found'])
    try:
        # The cited-by counter sits one .gs_fl element later when a
        # .gs_or_ggsm (full-text link) block is present.
        idx = 1 if item.select('.gs_or_ggsm') else 0
        cc = str(re.findall(r'\d+', str(item.select('.gs_fl')[idx].get_text())
                            )).split(',', 1)[0].replace('[', '')
    except (IndexError, AttributeError):
        cc = str(['No information found'])
    try:
        if item.select('.gs_ct1'):
            pub_type = str(item.select('.gs_ct1')[0].get_text())
        else:
            pub_type = str(['Research Article'])
    except (IndexError, AttributeError):
        pub_type = str(['No information found'])
    return {
        "entities": {
            "Search Engine": "Google Scholar",
            "Attributes found": "Title, URLs, Authors, Cited count, Type, "
                                "Published date, Abstract",
            "items": [{
                "DOI": str(['No information found']),
                "Title": item.select('h3')[0].get_text(),
                "URLs": item.select('a')[0]['href'],
                "Authors": re.sub("[^A-Za-z]", " ",
                                  str(item.select('.gs_a')[0].get_text()).split('-', 1)[0]),
                "Publication Name": str(['No information found']),
                "ISSN": str(['No information found']),
                "Cited count": cc,
                "Affiliation": str(['No information found']),
                "Type": pub_type,
                "Published date": str(re.findall(
                    r'\d+', str(item.select('.gs_a')[0].get_text()))).strip(),
                "Abstract": abstract,
            }]
        }
    }


def search_googleScholar(query, headers, _gs_pages, records, _title, _keyword,
                         _abstract, scrpr_api, _from_yr, _to_yr_, logging_flag,
                         data):
    rec = 0
    base = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q='

    if _title:
        # Exact-phrase title search; a plain requests call suffices here.
        url = base + '%22' + query + '%22&btnG='
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        print('Searching in Google Scholar...')
        count = 0
        for _ in tqdm(range(1)):
            # Result entries are the nodes tagged with [data-lid].
            for item in soup.select('[data-lid]'):
                try:
                    data.append(_parse_scholar_item(item))
                    count += 1
                except Exception as e:
                    _log_exception(e, logging_flag)
            time.sleep(1)
        print(f'Finished with total {count} records returned.')
        logger.writeRecords(query, None, _engine, "1", count, logging_flag)
        return data

    if _keyword or _abstract:
        pages = pagination(_gs_pages) if _gs_pages != 0 else 1
        # ScraperAPI avoids the IP blocks Google Scholar applies to paged crawls.
        client = ScraperAPIClient(scrpr_api)
        bot_headers = {'User-agent': 'your bot 0.1'}
        count = 0
        print("Searching Google Scholar Engine now, please wait...")
        date_part = ('&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_) if _from_yr else ''

        if _from_yr:
            # A first dated request establishes the total record count so the
            # page count can be derived from it.
            for _ in tqdm(range(1)):
                response = client.get(base + query + date_part + '&btnG=',
                                      headers=bot_headers)
                if response.status_code != 200:
                    print("Request failed with status", response.status_code)
                    logger.writeError("Logging Error:" + str(response.status_code),
                                      None, _engine, logging_flag)
                    continue
                soup = BeautifulSoup(response.content, 'lxml')
                for item in soup.find_all('div', class_='gs_ab_st'):
                    rec = str(item.find_all('div', id='gs_ab_md')[0].get_text()
                              ).split(' ', 1)[1].replace(',', '').split(' ', 1)[0]
                pages = pagination(_gs_pages) if _gs_pages != 0 else pagination(rec)
                if int(pages) > 100:
                    # Google Scholar serves at most 1000 records (100 pages).
                    print("NOTE: Google Scholar returns data for max 1000 records "
                          "irrespective of total records. Total records found:",
                          rec, "\nFetching record details now...")
                    pages = 100

        # Page-by-page crawl; date_part is empty when no date range was given.
        for _ in tqdm(range(1)):
            for i in range(pages):
                url = base + query + date_part + '&btnG=&start=' + str(i) + '0'
                response = client.get(url, headers=bot_headers)
                if response.status_code != 200:
                    print("Request failed with status", response.status_code)
                    logger.writeError("Logging Error:" + str(response.status_code),
                                      None, _engine, logging_flag)
                    continue
                soup = BeautifulSoup(response.content, 'lxml')
                for item in soup.select('[data-lid]'):
                    try:
                        data.append(_parse_scholar_item(item))
                        count += 1
                    except Exception as e:
                        _log_exception(e, logging_flag)
            time.sleep(1)
        print(f'Finished with total {count} records returned.')
        logger.writeRecords(query, None, _engine, rec, count, logging_flag)
        return data
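# Hedged call sketch; the flags mirror the function's signature and every value
# is illustrative (the module must supply logger, _engine and pagination):
#
#   data = search_googleScholar('deep learning', {'User-agent': 'your bot 0.1'},
#                               _gs_pages=0, records=None, _title=True,
#                               _keyword=False, _abstract=False,
#                               scrpr_api='YOUR_SCRAPERAPI_KEY',
#                               _from_yr=None, _to_yr_=None,
#                               logging_flag=False, data=[])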
def query_scraper(query):
    from scraper_api import ScraperAPIClient
    from bs4 import BeautifulSoup
    import json
    import statistics

    ##################################################################
    # LAZADA SCRAPER STARTS HERE
    ##################################################################
    url = 'https://www.lazada.com.my/catalog/?q=' + query
    xquery = query.lower().split()
    client = ScraperAPIClient('9aa1dbc863b8334850efccb9be3552f8')
    try:
        page = client.get(url=url, render=True)
    except Exception:
        result = {'status_code': 500, 'status': 'scraper api fatal error',
                  'elapsed_time': '', 'data': [], 'analytics': {}}
        return json.dumps(result)
    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'error',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)

    soup = BeautifulSoup(page.content, 'html.parser')
    scripts = soup.find_all('script')
    if not scripts:
        result = {'status_code': 500, 'status': 'no script tag found',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)

    data = None
    for script in scripts:
        text = str(script)
        if "<script>window.pageData=" in text:
            # str.lstrip/rstrip strip character sets, not literal affixes, so
            # split on the markers instead of the original lstrip/rstrip calls.
            data = json.loads(
                text.split('window.pageData=', 1)[1].rsplit('</script>', 1)[0])
    if data is None:
        result = {'status_code': 500, 'status': 'no window.pagedata found',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)
    if "listItems" not in data['mods']:
        result = {'status_code': 400, 'status': 'no listItems found',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)

    result = {'status_code': 200, 'status': 'success',
              'elapsed_time': int(page.elapsed.total_seconds()),
              'data': [], 'analytics': {}}
    for item in data['mods']['listItems']:
        # Keep only items whose name contains every query token.
        if all(token in item['name'].lower() for token in xquery):
            product_url = item['productUrl']
            if product_url.startswith('//'):
                product_url = product_url[2:]
            if product_url.endswith('?search=1'):
                product_url = product_url[:-len('?search=1')]
            image_url = item['image']
            if image_url.startswith('https://'):
                image_url = image_url[len('https://'):]
            result['data'].append({
                "product_id": item['nid'],
                "name": item['name'],
                "price": float(item['price']),
                "brand": item['brandName'],
                "url": product_url,
                "image_url": image_url,
                "platform": "lazada"
            })

    ##################################################################
    # SHOPEE SCRAPER STARTS HERE
    ##################################################################
    url_shopee = 'https://shopee.com.my/search?keyword=' + query
    image_shopee = ('upload.wikimedia.org/wikipedia/commons/thumb/0/0e/'
                    'Shopee_logo.svg/1200px-Shopee_logo.svg.png')
    try:
        page = client.get(url=url_shopee, render=True)
    except Exception:
        result = {'status_code': 500, 'status': 'scraper api for shopee fatal error',
                  'elapsed_time': '', 'data': [], 'analytics': {}}
        return json.dumps(result)
    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'shopee error',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)

    result['elapsed_time_shopee'] = int(page.elapsed.total_seconds())
    soup = BeautifulSoup(page.content, 'html.parser')
    shopee_result = soup.select('div.col-xs-2-4.shopee-search-item-result__item')
    for entry in shopee_result:
        if entry.select('div._1NoI8_'):
            name = entry.select('div._1NoI8_')[0].text
            url = 'www.shopee.com.my' + entry.select('a')[0]['href']
            prices = entry.select('span._341bF0')
            record = {'product_id': '', 'brand': '', 'name': name,
                      'image_url': image_shopee, 'platform': 'shopee',
                      'url': url, 'price': float(prices[0].text.replace(',', ''))}
            if len(prices) > 1:
                # Price ranges expose a second value.
                record['price2'] = float(prices[1].text.replace(',', ''))
            result['data'].append(record)

    ##################################################################
    # RENUMBERED ID LIST
    ##################################################################
    for i, item in enumerate(result['data']):
        item['id'] = i + 1

    if len(result['data']) < 1:
        result = {'status_code': 400, 'status': 'no matched result',
                  'elapsed_time': int(page.elapsed.total_seconds()),
                  'data': [], 'analytics': {}}
        return json.dumps(result)

    price = [item['price'] for item in result['data']]
    result['analytics']['result_count'] = len(price)
    result['analytics']['max_price'] = max(price)
    result['analytics']['min_price'] = min(price)
    result['analytics']['avg_price'] = round(statistics.mean(price), 2)
    result['analytics']['median_price'] = statistics.median(price)
    result['analytics']['min_price_url'] = result['data'][price.index(min(price))]['url']
    return json.dumps(result)
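# Illustrative call; the ScraperAPI key above comes from the source snippet,
# while the query is a placeholder.
if __name__ == '__main__':
    print(query_scraper('wireless earbuds'))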
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('352c90489d500e45cab80df55a2033f0')
result = client.get(url='https://tinbds.com/du-an/ho-chi-minh/quan-1').text
print(result)

# Scrapy users can simply replace the urls in their start_urls and parse function.
# Note for Scrapy: you should not use DOWNLOAD_DELAY and
# RANDOMIZE_DOWNLOAD_DELAY; these will lower your concurrency and are not
# needed with our API.
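# A minimal Scrapy sketch following the note above: scrapyGet returns a
# ScraperAPI proxy URL for the target page. The spider name and parse logic
# are illustrative, not from the source.
import scrapy


class TinbdsSpider(scrapy.Spider):
    name = 'tinbds'
    start_urls = [client.scrapyGet(url='https://tinbds.com/du-an/ho-chi-minh/quan-1')]

    def parse(self, response):
        # Yield the page title as a first-pass item.
        yield {'title': response.css('title::text').get()}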