Example #1
def handler(event, context):
    class job:
        def __init__(self, title, summary):
            self.title = title
            self.summary = summary

    search_string = event['queryStringParameters']['search']
    location = event['queryStringParameters']['state']
    
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    monster_url = f'https://www.monster.com/jobs/search?q={search_string}&where={location}&page=1'
    page = client.get(url=monster_url, render=True)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_='results-page')
    job_elems = results.find_all('div', class_='results-card')
    jobs = {
        "jobs": []
    }
    for job_elem in job_elems:
        title_elem = job_elem.find('div', class_='title-company-location')
        summary_elem = job_elem.find('div', class_='results-card-description')
        jobs['jobs'].append(job(title_elem.text.strip(), summary_elem.text.strip()))
    result = json.dumps(jobs, default=lambda x: x.__dict__)
    # result = jobs
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': result
    }
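A minimal, hypothetical sketch of invoking a handler like the one above locally with an API-Gateway-style test event; the query values are illustrative placeholders, and the handler still needs its own imports and the API_KEY environment variable.

# Hypothetical local invocation (assumes the handler above and its imports are in scope).
if __name__ == '__main__':
    sample_event = {
        'queryStringParameters': {
            'search': 'Software Engineer',
            'state': 'TN',
        }
    }
    # The Lambda context argument is unused by the handler, so None is acceptable here.
    print(handler(sample_event, None))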
Example #2
    def get_proxies_from_scraper_api(cls, proxy_count=10):
        client = ScraperAPIClient('3af7d62e85b75e0271d32f245107a240')
        proxies = set()

        # Request proxy_count times; each call is routed through a different proxy IP.
        for _ in range(proxy_count):
            result = client.get(url='http://httpbin.org/ip').text
            json_data = json.loads(result)
            print(json_data)
            proxies.add(json_data["origin"])

        print(proxies)

        return proxies
Example #3
class ScraperApi(Base):
    def __init__(self, key: str, max_retry: int):
        self.max_retry = max_retry
        self.client = ScraperAPIClient(key)

    def get(self, url: str, headers: Dict[str, str]):
        if not headers:
            headers = {}

        if "User-Agent" not in headers.keys():
            headers["User-Agent"] = USER_AGENTS[randint(
                0,
                len(USER_AGENTS) - 1)]
        return self.client.get(url, headers, retry=self.max_retry).text

    def get_json(self, url: str, headers: Union[Dict[str, str], None]):
        return json.loads(self.get(url, headers))
Example #4
import argparse
import json

import requests
from scraper_api import ScraperAPIClient


def main():
	parser = argparse.ArgumentParser(description='Parses command line arguments')
	parser.add_argument('--scraper_api_key', type=str, required=True)
	args = parser.parse_args()

	client = ScraperAPIClient(args.scraper_api_key)
	result = json.loads(client.get(url='http://httpbin.org/ip').text)
	print('Rotated proxy IP address = ' + result['origin'])

	urls = [
		client.scrapyGet(url='http://quotes.toscrape.com/page/1/'),
		client.scrapyGet(url='http://quotes.toscrape.com/page/2/'),
	]

	for url in urls:
		r = requests.get(url)
		# add parsing logic here
		print(r.status_code)
Example #5
def check_availability(URL, scraper_api_key):
    # To prevent from being blocked.
    sleep(randint(30, 40))
    client = ScraperAPIClient(scraper_api_key)
    result = client.get(URL)

    parsed_html = BeautifulSoup(result.content, 'html.parser')

    availability = parsed_html.select("#availability")

    if availability:

        availability = availability[0].getText().strip().splitlines()[0]

        if availability != "Currently unavailable.":
            return True

    return False
Example #6
def product_scraper(url):
    from scraper_api import ScraperAPIClient
    from bs4 import BeautifulSoup
    import json

    client = ScraperAPIClient('9aa1dbc863b8334850efccb9be3552f8')

    try:
        page = client.get(url=url, render=True)
    except:
        result = {'status_code': 500, 'status': 'scraper api fatal error',
                  'elapsed_time': '', 'price': '', 'title': ''}
        return json.dumps(result)

    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'error',
                  'elapsed_time': int(page.elapsed.total_seconds()), 'price': '', 'title': ''}
        return json.dumps(result)

    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        result_title = soup.findAll(
            "span", {"class": "pdp-mod-product-badge-title"})
        result_price = soup.findAll("div", {"class": "pdp-product-price"})

        if result_title and result_price:
            title = result_title[0].text
            price = float(result_price[0].find_next(
                'span').text.strip('RM').replace(',', ''))

            result = {'status_code': page.status_code, 'status': 'success', 'elapsed_time': int(
                page.elapsed.total_seconds()), 'title': title, 'price': price}
            return json.dumps(result)

        else:
            result = {'status_code': 500, 'status': 'blocked/nocontent',
                      'elapsed_time': int(page.elapsed.total_seconds()), 'title': '', 'price': ''}
            return json.dumps(result)
Example #7
def store_product_info(product_keys):
    for key in product_keys:
        url = gen_url(base_url, count, offset, page, store_id, key)
        try:
            client = ScraperAPIClient(settings.SCRAPER_API_KEY)
            result = client.get(url=url)
            if result.status_code == 200:
                res_json = result.json()
            elif result.status_code == 500:
                raise Exception("Request not successful, status: 500")
            elif result.status_code == 403:
                raise Exception("Plan max request exceeded, status: 403")
            elif result.status_code == 404:
                raise Exception("Request not found, status: 404")
            elif result.status_code == 410:
                raise Exception("Request gone or deleted, status: 410")
        except Exception as e:
            logger.error("failed to fetch product info, error: " + str(e))
            # Skip storing when the fetch failed; res_json would be stale or undefined.
            continue

        try:
            store_to_db(res_json["products"])
        except Exception as e:
            logger.error("failed to save product info, error: " + str(e))
Example #8
async def graph(request):
    scraper_api_key = getenv("SCRAPER_API_KEY")

    # Example url where pair graph can be fetched
    # url = 'https://gov.capital/forex/usd-eur/'
    pair = 'usd-eur'
    url = 'https://gov.capital/forex/{}/'.format(pair)

    client = ScraperAPIClient(scraper_api_key)
    request_result = client.get(url, render=True).text
    soup = BeautifulSoup(request_result, 'html.parser')

    # Removing all divs containing ads
    for ads in soup.find_all("div", {"class": "code-block code-block-2"}):
        # Removes all ads in fetched page source
        ads.decompose()

    cleaned_graph_html = '<script src="https://code.jquery.com/jquery-3.5.1.slim.min.js" integrity="sha256-4+XzXVhsDmqanXGHaHvgh1gMQKX40OUvDEBTu8JcmNs=" crossorigin="anonymous"></script>\n\
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.4.0/Chart.min.js"></script>'

    cleaned_graph_html += str(soup.find('canvas').next)

    return html(cleaned_graph_html)
Example #9
def handler(event, context):

    client = ScraperAPIClient(os.environ.get('API_KEY'))
    URL = "https://www.indeed.com/jobs?q=Entry+Level+Software+Engineer&l=Remote"
    page = client.get(url=URL)
    print(page)
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)
    results = soup.find(id='resultsCol')
    print(results)
    job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')
    titles = []
    companies = []
    summaries = []
    for job_elem in job_elems:
        title_elem = job_elem.find('h2', class_='title')
        company_elem = job_elem.find('div', class_='sjcl')
        summary_elem = job_elem.find('div', class_='summary')
        titles.append(title_elem.text.strip())
        companies.append(company_elem.text.strip())
        summaries.append(summary_elem.text.strip())

    response = {
        "titles": titles,
        "companies": companies,
        "summaries": summaries
    }

    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': json.dumps(response)
    }
Example #10
def handler(event, context):
    class job:
        def __init__(self, title, company, summary):
            self.title = title
            self.company = company
            self.summary = summary

    search_string = event['queryStringParameters']['search']
    location = event['queryStringParameters']['state']

    client = ScraperAPIClient(os.environ.get('API_KEY'))
    URL = f"https://www.indeed.com/jobs?q={search_string}&l={location}"
    page = client.get(url=URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='resultsCol')
    job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')

    jobs = {"jobs": []}
    for job_elem in job_elems:
        title_elem = job_elem.find('h2', class_='title')
        company_elem = job_elem.find('div', class_='sjcl')
        summary_elem = job_elem.find('div', class_='summary')
        jobs['jobs'].append(
            job(title_elem.text.strip(), company_elem.text.strip(),
                summary_elem.text.strip()))

    result = json.dumps(jobs, default=lambda x: x.__dict__)

    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': result.replace("\\n", " ")
    }
Example #11
def handler(event, context):
    client = ScraperAPIClient(os.environ.get('API_KEY'))
    monster_url = "https://www.monster.com/jobs/search?q=Software+Engineer&where=Nashville%2C+TN&page=6"
    page = client.get(url=monster_url, render=True)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_='results-page')
    job_elems = results.find_all('div', class_='results-card')
    titles = []
    summaries = []
    for job_elem in job_elems:
        title_elem = job_elem.find('div', class_='title-company-location')
        summary_elem = job_elem.find('div', class_='results-card-description')
        titles.append(title_elem.text.strip())
        summaries.append(summary_elem.text.strip())
    response = {"titles": titles, "summaries": summaries}
    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': json.dumps(response)
    }
Example #12
import proxies
import requests
from proxies import random_proxy
from scraper_api import ScraperAPIClient
import random
def random_line(fname):
    lines = open(fname).read().splitlines()
    return random.choice(lines)

client = ScraperAPIClient('d0224166b175ddca1f18dd5b5cca66a5')
result = client.get(
    url='http://httpbin.org/headers',
    headers={'User-Agent': random_line('user-agents.txt')})
print(result.text)


Example #13
def useScraperAPI(URL):
    client = ScraperAPIClient('d249e87aac326eaa5e728aafaa319d10')
    page = client.get(url=URL)
    page = BeautifulSoup(page.text, 'html.parser')
    return page
Example #14
class snkrs:
    def __init__(self) -> None:
        self.products = {}

    def getInfoInnvictus(self, apiKey='aba275ef5f8f713e086a1a0ab240dd5c'):
        """ Method to retrieve a dict given launching shoes on innvictus.com/lanzamientos """

        self.client = ScraperAPIClient(apiKey)

        result = self.client.get(url='https://www.innvictus.com/lanzamientos')
        assert result.status_code == 200, f"Status code {result.status_code}"

        soup = BeautifulSoup(result.content, 'html.parser')
        links = soup.find('body').find_all("script",
                                           attrs={'type': 'text/javascript'})
        s = links[3]  # links[3] element gives var products
        productVar = re.search(r'\'(.*?)\'', str(s))
        productDict = json.loads(
            productVar.group(0).replace("'{\"id",
                                        "[{\"id").replace("\"}'", "\"}]"))
        #print(productVar.group(0).replace("'{\"id","[{\"id").replace("\"}'","\"}]"))

        for tenis in productDict:
            tenis['url'] = "https://www.innvictus.com/p/{id}".format(
                id=tenis['id'])
        self.products = productDict

    def selectSaveTargetShoes(self, save=False):
        i = 0
        for shoes in self.products:
            print(f"Selelction {i}")
            print("Shoe: ", shoes['name2'])
            print("Launching date: ", shoes['realdate'])
            print("Price: ", shoes['price'], "\n")
            i += 1
        selectedShoe = [
            int(x) for x in input("Select shoe number to track\n").split()
        ]

        #print(selectedShoe)
        def takeFromDict(dct, listElements):
            return [dct[element] for element in listElements]

        self.products = takeFromDict(self.products, selectedShoe)
        if save:
            with open('TargetShoes.txt', 'w') as f:
                for shoe in self.products:
                    f.write(shoe['url'])
                    f.write("\n")

    def isAvailable(self, fileName=None):
        def sendDiscordMessage(self, message='Hello world'):
            from discord import Webhook, RequestsWebhookAdapter
            webhook = Webhook.from_url(
                "https://discord.com/api/webhooks/811294684386426890/3SY7GtmBAwyjDM6qr73eDRqrzjRJZ2u0vlShoyOvvf_cCUNvv6YJqiGPI1udVWWqipVp",
                adapter=RequestsWebhookAdapter())
            webhook.send(message)

        if fileName is None:
            for url in self.products:
                notAvailable = self.client.get(url['url'])
                assert notAvailable.status_code == 200, f"Status code {notAvailable.status_code}"
                soupNotAvailable = BeautifulSoup(notAvailable.content,
                                                 'html.parser')
                try:
                    notFoundClass = soupNotAvailable.find('body').find(
                        "div", attrs={
                            'class': 'pdp-notFound'
                        }).text.strip()
                    notFoundTitle = soupNotAvailable.find('title').text.strip()
                except:
                    notFoundClass = ''
                    notFoundTitle = ''
                if "No encontrado" in notFoundTitle or "Este producto no" in notFoundClass:
                    url['Available'] = False
                    message = url['name2'], " no disponible aun"
                    print(message)
                    return False

                else:
                    message = url['name2'], " disponible", url['url']
                    print(message)
                    url['Available'] = True
                    sendDiscordMessage(self, message=message)
                    return True
        else:
            with open(fileName, 'r') as targetShoes:
                urls = [url.strip() for url in targetShoes]
                for url in urls:
                    # self.headers is never defined on this class; a plain get avoids an AttributeError.
                    notAvailable = self.client.get(url)
                    soupNotAvailable = BeautifulSoup(notAvailable.content,
                                                     'html.parser')
                    try:
                        notFoundClass = soupNotAvailable.find('body').find(
                            "div", attrs={
                                'class': 'pdp-notFound'
                            }).text.strip()
                        notFoundTitle = soupNotAvailable.find(
                            'title').text.strip()
                    except:
                        notFoundClass = ''
                        notFoundTitle = ''
                    if "No encontrado" in notFoundTitle or "Este producto no" in notFoundClass:
                        print(url, " no disponible aun\n")
                    else:
                        print(url, " disponible \n")
Example #15
import time
import re

import pandas as pd
from bs4 import BeautifulSoup

# Use proxy api to simplify web scraping
from scraper_api import ScraperAPIClient
client = ScraperAPIClient('api_key')

# Loop over multiple search results pages: 30 results per page * 10 pages
object_name = []
object_href = []
for i in range(10):
    if i == 0:
        url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C+OR&ns=1'
    else:
        url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C%20OR&start=' + str(i * 30)
    main_page = client.get(url).text
    soup = BeautifulSoup(main_page, 'lxml')
    subpages = soup.select('.text-size--inherit__373c0__2fB3p .link-color--inherit__373c0__3dzpk')
    for j in subpages[2:32]:
        object_name.append(j.string)
        object_href.append(j.get('href'))

# DataFrame: Master list: Business name; Business link
object_href = ['https://www.yelp.com' + url for url in object_href]
object_data = list(zip(object_name, object_href))
object_df = pd.DataFrame(data = object_data, columns = ['name', 'url'])

# Inspect duplicated values
print(sum(object_df.duplicated(subset = ['name'], keep = 'first')))

# Drop duplicated values
object_df = object_df.drop_duplicates(subset = ['name'], keep = 'first')
Example #16
def testErrorCode(url):

    client = ScraperAPIClient('XXXXXXXXXXXXXXXXXXXXXX')
    return client.get(url=url)
Example #17
from scraper_api import ScraperAPIClient
client = ScraperAPIClient('60d3e592477ebfd328a11e92ce6600c9')
result = client.get(url='http://httpbin.org/ip').text
print(result)
Example #18
def proxy_request(url):
    client = ScraperAPIClient(os.environ.get('PROXY_API_KEY'))
    rsp = client.get(url=url, headers=HEADERS)
    return rsp
Example #19
from selenium import webdriver
from time import sleep
import time
from random import randrange
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from scraper_api import ScraperAPIClient



client = ScraperAPIClient('********************')
result = client.get(url = 'https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1', render=True).text
print("starting getting information from the web ...")  


start_urls = [client.scrapyGet(url = 'https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1', render=True)]
def parse(self, response):

    yield scrapy.Request(client.scrapyGet(url = 'https://www.google.com/maps/contrib/113811103574182320069/reviews/@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1', render=True), self.parse)


soup = BeautifulSoup(result, 'html.parser')

Adresse_text = []

Adresse_source = soup.find('div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'})
Example #20
class GoogleNews:
    def __init__(self, key: str, lang="en", period="", ua=""):
        assert key != ""
        self.client = ScraperAPIClient(key)
        self.user_agent = ua
        self.__texts = []
        self.__titles = []
        self.__links = []
        self.__results = []
        self.__lang = lang
        self.__period = period
        self.__exec_time = 0

    def set_lang(self, lang):
        self.__lang = lang

    def search(self, q: Union[List[str], str], p: Union[List[int], int],
               start: datetime, end: datetime) -> List[dict]:
        """
        Searches for a term in google news and retrieves the first page into __results.

        Parameters:
        key = the search term
        """
        start_time = time()
        if isinstance(q, str):
            q = [q]
        if isinstance(p, int):
            p = [p]
        elif len(p) < 1:
            p = [1]

        for query in q:
            for page in p:
                out = self.scrape_page(query, page, start, end)
                for o in out:
                    if o["title"] not in self.__titles:
                        self.__results.append(o)
                        self.__links.append(o["link"])
                        self.__texts.append(o["title"] + " " + o["desc"])

        self.__exec_time = time() - start_time
        return self.__results

    def scrape_page(self,
                    q: str,
                    page: int,
                    start: datetime,
                    end: datetime,
                    attempts=0):
        """
        page = number of the page to be retrieved
        """
        payload = {
            'q': q,
            'lr': f'lang_{self.__lang}',
            'tbs': f"lr:lang_1{self.__lang}",
            'tbm': 'nws',
            'start': (10 * (page - 1)),
        }

        out: List[dict] = []

        if start is not None and end is not None:
            payload['tbs'] += f",cdr:1,cd_min:{start.strftime(DATE_FMT)}," \
                              f"cd_max:{end.strftime(DATE_FMT)}"

        try:
            # Avoid shadowing the `page` argument so the retry below keeps the page number.
            html = self.client.get(url=GOOG_URL + "?" +
                                   parse.urlencode(payload)).text
            content = Soup(html, "html.parser")
        except Exception as e:
            attempts += 1
            if attempts > 5:
                print(f"ERROR TRYING TO LOAD CONTENT: {e}")
                raise e
            sleep(0.1 * attempts)
            return self.scrape_page(q, page, start, end, attempts)
        try:
            result = content.find_all("div", id="search")[0].find_all("g-card")
        except IndexError:
            # no results were found
            return out

        for item in result:
            try:
                out.append({
                    "title":
                    item.find("div", {
                        "role": "heading"
                    }).text.replace("\n", ""),
                    "link":
                    item.find("a").get("href"),
                    "media":
                    item.findAll("g-img")[1].parent.text,
                    "date":
                    item.find("div", {
                        "role": "heading"
                    }).next_sibling.findNext('div').findNext('div').text,
                    "desc":
                    item.find("div", {
                        "role": "heading"
                    }).next_sibling.findNext('div').text.replace("\n", ""),
                    "image":
                    item.findAll("g-img")[0].find("img").get("src")
                })
            except Exception:
                pass
        return out

    def get_results(self) -> List[dict]:
        """Returns the __results."""
        return self.__results

    def get_text(self) -> List[str]:
        """Returns only the __texts of the __results."""
        return self.__texts

    def get_links(self) -> List[str]:
        """Returns only the __links of the __results."""
        return self.__links

    def clear(self):
        self.__texts = []
        self.__links = []
        self.__results = []
        self.__titles = []
        self.__exec_time = 0
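A brief, hypothetical usage sketch of the GoogleNews wrapper above; the API key and query are placeholders, and GOOG_URL/DATE_FMT are assumed to be defined alongside the class as in the original module.

# Hypothetical usage of the GoogleNews wrapper defined above.
gn = GoogleNews(key="YOUR_SCRAPERAPI_KEY", lang="en")
# Passing None for start/end skips the date-range filter in scrape_page().
results = gn.search("web scraping", 1, None, None)
for r in gn.get_results():
    print(r["title"], r["link"])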
Example #21
def search_googleScholar(query, headers, _gs_pages, records, _title, _keyword,
                         _abstract, scrpr_api, _from_yr, _to_yr_, logging_flag,
                         data):
    rec = 0
    if _title:
        # request url
        url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=%22' + query + '%22&btnG='

        # response object
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        print('Searching in Google Scholar...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object by checking tag [data-lid]
            for item in soup.select('[data-lid]'):
                try:
                    if bool(item.select('.gs_or_ggsm')):
                        cc = str(
                            re.findall(
                                r'\d+',
                                str(item.select('.gs_fl')
                                    [1].get_text()))).split(',', 1)[0].replace(
                                        '[', '')
                    else:
                        cc = str(
                            re.findall(
                                r'\d+',
                                str(item.select('.gs_fl')
                                    [0].get_text()))).split(',', 1)[0].replace(
                                        '[', '')

                    if bool(item.select('.gs_ct1')):
                        type = str(item.select('.gs_ct1')[0].get_text())
                    else:
                        type = str(['Research Article'])

                    resp_obj = {
                        "entities": {
                            "Search Engine":
                            "Google Scholar",
                            "Attributes found":
                            "Title, URLs, Authors, Cited count, Type, Published "
                            "date, Abstract",
                            "items": [{
                                "DOI":
                                str(['No information found']),
                                "Title":
                                item.select('h3')[0].get_text(),
                                "URLs":
                                item.select('a')[0]['href'],
                                "Authors":
                                re.sub(
                                    "[^A-Za-z]", " ",
                                    str(item.select('.gs_a')
                                        [0].get_text()).split('-', 1)[0]),
                                "Publication Name":
                                str(['No information found']),
                                "ISSN":
                                str(['No information found']),
                                "Cited count":
                                cc,
                                "Affiliation":
                                str(['No information found']),
                                "Type":
                                type,
                                "Published date":
                                str(
                                    re.findall(
                                        r'\d+',
                                        str(
                                            item.select('.gs_a')
                                            [0].get_text()))).strip(),
                                "Abstract":
                                item.select('.gs_rs')[0].get_text()
                            }]
                        }
                    }
                    # append dict object data
                    count += 1
                    data.append(resp_obj)
                except Exception as e:  # raise e
                    pass
                    exception_type, exception_object, exception_traceback = sys.exc_info(
                    )
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename,
                                      line_number)
        time.sleep(1)

        print(f'Finished with total {count} records returned.')
        logger.writeRecords(query, None, _engine, "1", count, logging_flag)
        return data

    if _keyword or _abstract:
        if _gs_pages != 0:
            pages = pagination(_gs_pages)
        else:
            pages = 1

        # search for dates
        if _from_yr:

            # use ScraperAPI to avoid IP blocks from Google Scholar
            client = ScraperAPIClient(scrpr_api)
            count = 0

            for i in tqdm(range(1)):
                print("Searching Google Scholar Engine now please wait...")
                url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG='

                response = client.get(url,
                                      headers={'User-agent': 'your bot 0.1'})

                if response.status_code != 200:
                    print("Request failed with status", response.status_code)
                    logger.writeError(
                        "Logging Error:" + str(response.status_code), None,
                        _engine, logging_flag)

                else:
                    soup = BeautifulSoup(response.content, 'lxml')

                    # count no of records returned by google scholar
                    for item in soup.find_all('div', class_='gs_ab_st'):
                        rec = \
                        str(item.find_all('div', id='gs_ab_md')[0].get_text()).split(' ', 1)[1].replace(',', "").split(
                            ' ', 1)[0]

                        pages = 1
                        if _gs_pages != 0:
                            pages = pagination(_gs_pages)
                        else:
                            pages = pagination(rec)

                    # check if records are greater than 1000 or not
                    if int(pages) > 100:
                        print(
                            "NOTE: Google Scholar returns data for at most 1000 records, "
                            "irrespective of the total. Total records found:", rec,
                            "\nFetching record details now...")

                        pages = 100
                        for i in range(pages):

                            url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG=&start=' + str(
                                i) + '0'

                            # response = requests.get(url, proxies={"http": proxy, "https": proxy}, headers=headers)
                            response = client.get(
                                url, headers={'User-agent': 'your bot 0.1'})
                            soup = BeautifulSoup(response.content, 'lxml')
                            # Find required attributes in the response object by checking tag [data-lid]
                            for item in soup.select('[data-lid]'):
                                try:
                                    try:
                                        if bool(
                                                item.select('.gs_rs')
                                            [0].get_text()):
                                            abstract = item.select(
                                                '.gs_rs')[0].get_text()
                                        else:
                                            abstract = str(
                                                ['No information found'])
                                    except:
                                        abstract = str(
                                            ['No information found'])
                                        pass
                                    try:
                                        if bool(item.select('.gs_or_ggsm')):
                                            cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(
                                                    ',',
                                                    1)[
                                                    0].replace('[', '')
                                        else:
                                            cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(
                                                    ',',
                                                    1)[
                                                    0].replace('[', '')
                                    except:
                                        cc = str(['No information found'])
                                        pass
                                    try:
                                        if bool(item.select('.gs_ct1')):
                                            type = str(
                                                item.select('.gs_ct1')
                                                [0].get_text())
                                        else:
                                            type = str(['Research Article'])
                                    except:
                                        type = str(['No information found'])
                                        pass

                                    # response object
                                    resp_obj = {
                                        "entities": {
                                            "Search Engine":
                                            "Google Scholar",
                                            "Attributes found":
                                            "Title, URLs, Authors, Cited count, "
                                            "Type, Published date, Abstract",
                                            "items": [{
                                                "DOI":
                                                str(['No information found']),
                                                "Title":
                                                item.select('h3')
                                                [0].get_text(),
                                                "URLs":
                                                item.select('a')[0]['href'],
                                                "Authors":
                                                re.sub(
                                                    "[^A-Za-z]", " ",
                                                    str(
                                                        item.select('.gs_a')
                                                        [0].get_text()).split(
                                                            '-', 1)[0]),
                                                "Publication Name":
                                                str(['No information found']),
                                                "ISSN":
                                                str(['No information found']),
                                                "Cited count":
                                                cc,
                                                "Affiliation":
                                                str(['No information found']),
                                                "Type":
                                                type,
                                                "Published date":
                                                str(
                                                    re.findall(
                                                        r'\d+',
                                                        str(
                                                            item.select(
                                                                '.gs_a')
                                                            [0].get_text()))
                                                ).strip(),
                                                "Abstract":
                                                abstract
                                            }]
                                        }
                                    }
                                    # append dict object data
                                    count += 1
                                    data.append(resp_obj)
                                except Exception as e:  # raise e
                                    pass
                                    exception_type, exception_object, exception_traceback = sys.exc_info(
                                    )
                                    filename = exception_traceback.tb_frame.f_code.co_filename
                                    line_number = exception_traceback.tb_lineno
                                    logger.writeError(e, None, _engine,
                                                      logging_flag, filename,
                                                      line_number)

                    else:
                        for i in range(pages):

                            url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG=&start=' + str(
                                i) + '0'

                            response = client.get(
                                url, headers={'User-agent': 'your bot 0.1'})
                            if response.status_code != 200:
                                print("Request failed with stauts",
                                      response.status_code)
                                logger.writeError(
                                    "Logging Erorr:" +
                                    str(response.status_code), None, _engine,
                                    logging_flag)

                            else:
                                soup = BeautifulSoup(response.content, 'lxml')

                                # Find required attributes in the response object by checking tag [data-lid]
                                for item in soup.select('[data-lid]'):
                                    try:
                                        try:
                                            if bool(
                                                    item.select('.gs_rs')
                                                [0].get_text()):
                                                abstract = item.select(
                                                    '.gs_rs')[0].get_text()
                                            else:
                                                abstract = str(
                                                    ['No information found'])
                                        except:
                                            abstract = str(
                                                ['No information found'])
                                            pass
                                        try:
                                            if bool(item.select(
                                                    '.gs_or_ggsm')):
                                                cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(
                                                    ',', 1)[
                                                    0].replace('[', '')
                                            else:
                                                cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(
                                                    ',', 1)[
                                                    0].replace('[', '')
                                        except:
                                            cc = str(['No information found'])
                                            pass
                                        try:
                                            if bool(item.select('.gs_ct1')):
                                                type = str(
                                                    item.select('.gs_ct1')
                                                    [0].get_text())
                                            else:
                                                type = str(
                                                    ['Research Article'])
                                        except:
                                            type = str(
                                                ['No information found'])
                                            pass

                                        resp_obj = {
                                            "entities": {
                                                "Search Engine":
                                                "Google Scholar",
                                                "Attributes found":
                                                "Title, URLs, Authors, Cited "
                                                "count, Type, Published date, "
                                                "Abstract",
                                                "items": [{
                                                    "DOI":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Title":
                                                    item.select(
                                                        'h3')[0].get_text(),
                                                    "URLs":
                                                    item.select('a')[0]
                                                    ['href'],
                                                    "Authors":
                                                    re.sub(
                                                        "[^A-Za-z]", " ",
                                                        str(
                                                            item.select(
                                                                '.gs_a')[0].
                                                            get_text()).split(
                                                                '-', 1)[0]),
                                                    "Publication Name":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "ISSN":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Cited count":
                                                    cc,
                                                    "Affiliation":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Type":
                                                    type,
                                                    "Published date":
                                                    str(
                                                        re.findall(
                                                            r'\d+',
                                                            str(
                                                                item.select(
                                                                    '.gs_a')
                                                                [0].get_text())
                                                        )).strip(),
                                                    "Abstract":
                                                    abstract
                                                }]
                                            }
                                        }
                                        # append dict object data
                                        count += 1
                                        data.append(resp_obj)
                                    except Exception as e:  # raise e
                                        pass
                                        exception_type, exception_object, exception_traceback = sys.exc_info(
                                        )
                                        filename = exception_traceback.tb_frame.f_code.co_filename
                                        line_number = exception_traceback.tb_lineno
                                        logger.writeError(
                                            e, None, _engine, logging_flag,
                                            filename, line_number)
                    time.sleep(1)

                    print(f'Finished with total {count} records returned.')
                    logger.writeRecords(query, None, _engine, rec, count,
                                        logging_flag)
                    return data

        # search without dates
        else:
            print("Searching Google Scholar Engine now please wait...")
            client = ScraperAPIClient(scrpr_api)
            count = 0
            for i in tqdm(range(1)):
                for i in range(pages):
                    # request url
                    url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&btnG=&start=' + str(
                        i) + '0'
                    # response object
                    response = client.get(
                        url, headers={'User-agent': 'your bot 0.1'})

                    if response.status_code != 200:
                        print("Request failed with stauts",
                              response.status_code)
                        logger.writeError(
                            "Logging Erorr:" + str(response.status_code), None,
                            _engine, logging_flag)

                    soup = BeautifulSoup(response.content, 'lxml')

                    # Find required attributes in the response object by checking tag [data-lid]
                    for item in soup.select('[data-lid]'):
                        try:

                            try:
                                if bool(item.select('.gs_rs')[0].get_text()):
                                    abstract = item.select(
                                        '.gs_rs')[0].get_text()
                                else:
                                    abstract = str(['No information found'])
                            except:
                                abstract = str(['No information found'])
                                pass
                            try:
                                if bool(item.select('.gs_or_ggsm')):
                                    cc = \
                                        str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(',', 1)[
                                            0].replace('[', '')
                                else:
                                    cc = \
                                        str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(',', 1)[
                                            0].replace('[', '')
                            except:
                                cc = str(['No information found'])
                                pass
                            try:
                                if bool(item.select('.gs_ct1')):
                                    type = str(
                                        item.select('.gs_ct1')[0].get_text())
                                else:
                                    type = str(['Research Article'])
                            except:
                                type = str(['No information found'])
                                pass

                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "Google Scholar",
                                    "Attributes found":
                                    "Title, URLs, Authors, Cited count, Type, "
                                    "Published date, Abstract",
                                    "items": [{
                                        "DOI":
                                        str(['No information found']),
                                        "Title":
                                        item.select('h3')[0].get_text(),
                                        "URLs":
                                        item.select('a')[0]['href'],
                                        "Authors":
                                        re.sub(
                                            "[^A-Za-z]", " ",
                                            str(
                                                item.select('.gs_a')
                                                [0].get_text()).split('-',
                                                                      1)[0]),
                                        "Publication Name":
                                        str(['No information found']),
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        cc,
                                        "Affiliation":
                                        str(['No information found']),
                                        "Type":
                                        type,
                                        "Published date":
                                        str(
                                            re.findall(
                                                r'\d+',
                                                str(
                                                    item.select('.gs_a')
                                                    [0].get_text()))).strip(),
                                        "Abstract":
                                        abstract
                                    }]
                                }
                            }
                            # append dict object data
                            count += 1
                            data.append(resp_obj)
                        except Exception as e:  # raise e
                            pass
                            exception_type, exception_object, exception_traceback = sys.exc_info(
                            )
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)
            time.sleep(1)

            print(f'Finished with total {count} records returned.')
            logger.writeRecords(query, None, _engine, rec, count, logging_flag)
            return data
Example #22
def query_scraper(query):
    from scraper_api import ScraperAPIClient
    from bs4 import BeautifulSoup
    import json
    import statistics

    ##################################################################
    # LAZADA SCRAPER STARTS HERE
    ##################################################################
    url = 'https://www.lazada.com.my/catalog/?q=' + query

    xquery = query.lower().split()

    client = ScraperAPIClient('9aa1dbc863b8334850efccb9be3552f8')

    try:
        page = client.get(url=url, render=True)
    except:
        result = {'status_code': 500, 'status': 'scraper api fatal error',
                  'elapsed_time': '', 'data': [], 'analytics': {}}
        return json.dumps(result)

    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'error',
                  'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
        return json.dumps(result)

    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        scripts = soup.find_all('script')

        if not scripts:
            result = {'status_code': 500, 'status': 'no script tag found',
                      'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
            return json.dumps(result)

        else:
            data = None
            for i in range(len(scripts)):
                if "<script>window.pageData=" in str(scripts[i]):
                    # lstrip/rstrip strip character sets, not prefixes; remove the exact markers instead.
                    data = str(scripts[i]).replace(
                        "<script>window.pageData=", "", 1).replace("</script>", "")
                    data = json.loads(data)

            if data is None:
                result = {'status_code': 500, 'status': 'no window.pagedata found',
                          'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
                return json.dumps(result)

            else:
                if "listItems" not in data['mods']:
                    result = {'status_code': 400, 'status': 'no listItems found',
                              'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
                    return json.dumps(result)

                else:
                    result = {'status_code': 200, 'status': 'success',
                              'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}

                    for i in range(len(data['mods']['listItems'])):
                        val = []

                        for j in xquery:
                            val.append(j in data['mods']
                                       ['listItems'][i]['name'].lower())
                        # Keep the item only when every query token appears in the product name.
                        xquery_result = all(val)

                        if xquery_result:
                            result['data'].append({
                                "product_id": data['mods']['listItems'][i]['nid'],
                                "name": data['mods']['listItems'][i]['name'],
                                "price": float(data['mods']['listItems'][i]['price']),
                                "brand": data['mods']['listItems'][i]['brandName'],
                                "url": data['mods']['listItems'][i]['productUrl'].lstrip("//").rstrip("?search=1"),
                                "image_url": data['mods']['listItems'][i]['image'].lstrip("https://"),
                                "platform": "lazada"
                            })

    ##################################################################
    # SHOPEE SCRAPER STARTS HERE
    ##################################################################
    url_shopee = 'https://shopee.com.my/search?keyword=' + query
    image_shopee = 'upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Shopee_logo.svg/1200px-Shopee_logo.svg.png'

    try:
        page = client.get(url=url_shopee, render=True)
    except:
        result = {'status_code': 500, 'status': 'scraper api for shopee fatal error',
                  'elapsed_time': '', 'data': [], 'analytics': {}}
        return json.dumps(result)

    if page.status_code != 200:
        result = {'status_code': page.status_code, 'status': 'shopee error',
                  'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
        return json.dumps(result)

    else:
        result['elapsed_time_shopee'] = int(page.elapsed.total_seconds())
        soup = BeautifulSoup(page.content, 'html.parser')
        shopee_result = soup.select('div.col-xs-2-4.shopee-search-item-result__item')

    for i in range(len(shopee_result)):
        if shopee_result[i].select('div._1NoI8_'):
            name = shopee_result[i].select('div._1NoI8_')[0].text
            url = 'www.shopee.com.my' + shopee_result[i].select('a')[0]['href']
            
            if len(shopee_result[i].select('span._341bF0')) == 1:
                price = shopee_result[i].select('span._341bF0')[0].text
                result['data'].append({'product_id':'','brand':'','name':name, 'image_url':image_shopee , 'platform':'shopee','url':url, 'price':float(price.replace(',',''))})
            else:
                price = shopee_result[i].select('span._341bF0')[0].text
                price2 = shopee_result[i].select('span._341bF0')[1].text
                result['data'].append({'product_id':'','brand':'','name':name, 'image_url':image_shopee, 'platform':'shopee', 'url':url, 'price':float(price.replace(',','')), 'price2':float(price2.replace(',',''))})


    ##################################################################
    # RENUMBERED ID LIST
    for i in range(len(result['data'])):
        result['data'][i]['id'] = i+1
    ##################################################################

    if len(result['data']) < 1:
        result = {'status_code': 400, 'status': 'no matched result',
                  'elapsed_time': int(page.elapsed.total_seconds()), 'data': [], 'analytics': {}}
        return json.dumps(result)

    else:
        price = []
        for i in range(len(result['data'])):
            price.append(result['data'][i]['price'])

        result['analytics']['result_count'] = len(price)
        result['analytics']['max_price'] = max(price)
        result['analytics']['min_price'] = min(price)
        result['analytics']['avg_price'] = round(statistics.mean(price), 2)
        result['analytics']['median_price'] = statistics.median(price)
        result['analytics']['min_price_url'] = result['data'][price.index(
            min(price))]['url']

        return json.dumps(result)
Example #23
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('352c90489d500e45cab80df55a2033f0')
result = client.get(url='https://tinbds.com/du-an/ho-chi-minh/quan-1').text
print(result)
# Scrapy users can simply replace the urls in their start_urls and parse function
# Note for Scrapy, you should not use DOWNLOAD_DELAY and
# RANDOMIZE_DOWNLOAD_DELAY, these will lower your concurrency and are not
# needed with our API
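As a rough illustration of the Scrapy note above, the sketch below wraps each request URL with client.scrapyGet so it is routed through the API; the spider name, the quotes.toscrape.com URLs, and the parsing logic are placeholders rather than part of any example above.

# Hypothetical Scrapy spider using scrapyGet for start_urls and follow-up requests.
import scrapy
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('YOUR_API_KEY')  # placeholder key


class QuotesSpider(scrapy.Spider):
    name = "quotes_via_scraperapi"
    # Every target URL is wrapped with scrapyGet so the request goes through the proxy API.
    start_urls = [
        client.scrapyGet(url='http://quotes.toscrape.com/page/1/'),
    ]

    def parse(self, response):
        # Placeholder parsing logic: yield the text of each quote on the page.
        for quote in response.css('div.quote span.text::text'):
            yield {'quote': quote.get()}

        # Follow the next page through the API as well (href is site-relative, e.g. "/page/2/").
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield scrapy.Request(
                client.scrapyGet(url='http://quotes.toscrape.com' + next_page),
                callback=self.parse)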