Exemplo n.º 1
0
 def scrape(self, ind):
     """Crawl one of two fixed batches of Class Central subject pages.

     When ``ind`` equals 1 the first batch of subject URLs is crawled; any
     other value selects the second batch. A FirebaseAccess instance is
     handed to the spider as ``fbadb``. Nothing is returned; whatever the
     processor reports back is discarded.
     """
     first_batch = [
         'https://www.class-central.com/subject/cs',
         'https://www.class-central.com/subject/business',
         'https://www.class-central.com/subject/science',
         'https://www.class-central.com/subject/data-science',
         'https://www.class-central.com/subject/programming-and-software-development',
         'https://www.class-central.com/subject/engineering',
         'https://www.class-central.com/subject/maths'
     ]
     second_batch = [
         'https://www.class-central.com/subject/humanities',
         'https://www.class-central.com/subject/social-sciences',
         'https://www.class-central.com/subject/education',
         'https://www.class-central.com/subject/personal-development',
         'https://www.class-central.com/subject/art-and-design',
         'https://www.class-central.com/subject/health'
     ]
     targets = first_batch if ind == 1 else second_batch

     # Keyword arguments given to Job are forwarded to the spider.
     courses_job = Job(
         CoursesSpider,
         fbadb=FirebaseAccess(),
         urls_to_scrape=targets,
     )
     Processor(settings=None).run([courses_job])
Exemplo n.º 2
0
def get_product_info(term):
    """Search yankeecandle.com for *term* and return the first scraped item.

    The term is passed through ``quote`` before being appended to the search
    URL. Returns ``None`` when the crawl produces no results.
    """
    search_url = "https://www.yankeecandle.com/search?Ntt=" + quote(term)
    candle_job = Job(CandleSpider, url=search_url)
    found = Processor(settings=None).run([candle_job])
    return found[0] if len(found) > 0 else None
Exemplo n.º 3
0
    def scrape(self, query):
        """Run one ClassCentralSpider crawl for *query*.

        ``query`` reaches the spider as its ``keys`` keyword argument. The
        call blocks until the crawl is done; the collected results are not
        returned.
        """
        # Arguments given to Job are handed to the spider constructor
        # when the crawl runs.
        spider_job = Job(ClassCentralSpider, keys=query)

        # settings=None: no custom Scrapy Settings object is supplied.
        runner = Processor(settings=None)

        # Starts the reactor and waits for every spider to finish.
        runner.run([spider_job])
Exemplo n.º 4
0
def crawl_lad_scrapyscript(depth=lad_depth, urls=None, domain=lad_domain):
    """Run LadSpider through scrapyscript and print the results as JSON.

    Variant of ``crawl_lad`` intended to allow multiple runs on one worker
    without a restart.

    Args:
        depth: Passed to ``scrapy_settings`` and forwarded to the spider.
        urls: URLs forwarded to the spider. When ``None``, the list is
            loaded via ``get_gov_websites(gov_sites_path)``.
        domain: Forwarded to the spider.
    """
    crawl_settings = scrapy_settings(depth, concurrent_requests)

    start_urls = list(get_gov_websites(gov_sites_path)) if urls is None else urls

    # Positional arguments after the spider class are forwarded to the spider.
    lad_job = Job(LadSpider, start_urls, domain, depth)
    scraped = Processor(settings=crawl_settings).run([lad_job])
    print(json.dumps(scraped, indent=4))
Exemplo n.º 5
0
def spider_results():
    """Scrape one Mercado Libre listing and return the results as JSON text.

    A throwaway spider is defined inline, run once with a browser-like
    ``USER_AGENT`` setting, and the processor's consolidated output is
    serialized with ``json.dumps(..., indent=4)``.
    """

    # Spiders accept *args / **kwargs; see
    # https://doc.scrapy.org/en/latest/topics/spiders.html#spider-arguments
    class PythonSpider(Spider):
        name = 'myspider'

        def start_requests(self):
            # self.url is supplied through the Job's url= keyword below.
            yield Request(self.url)

        def parse(self, response):
            # Text nodes at a fixed position in the page; stored under
            # 'precio' (presumably the listed price — confirm against the page).
            price_nodes = response.xpath(
                '//*[@id="root-app"]/div/div[3]/div/div[2]/div[1]/div/div[3]/div/div[1]/div/span/span[2]/text()'
            )
            return {
                'url': response.request.url,
                'precio': price_nodes.extract(),
            }

    # Keyword arguments given to Job are passed to the spider constructor
    # at runtime.
    listing_url = 'https://articulo.mercadolibre.com.ar/MLA-850664638-cuadernos-anotador-2020-modelos-de-diseno-_JM#position=1&type=item&tracking_id=cb49fd5e-5e5d-4e33-903b-66f14e0f3ac5'
    listing_job = Job(PythonSpider, url=listing_url)

    # Custom Scrapy settings: only the user agent is overridden.
    browser_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    cust_settings = Settings()
    cust_settings['USER_AGENT'] = browser_agent

    # run() starts the reactor and blocks until all spiders complete.
    scraped = Processor(settings=cust_settings).run([listing_job])

    return json.dumps(scraped, indent=4)
Exemplo n.º 6
0
 def work(url, g_id):
     """Run BrokenImageChecker for *url* and return the processor's results.

     ``url`` and ``g_id`` are forwarded to the spider as keyword arguments.
     """
     checker_job = Job(BrokenImageChecker, url=url, g_id=g_id)
     return Processor(settings=None).run([checker_job])
Exemplo n.º 7
0
from scrapyscript import Job, Processor
from scrapy.utils.project import get_project_settings

from web_site_info.spiders.site_info import SiteInfoSpider

if __name__ == "__main__":
    # File descriptor 0 is standard input: one start URL per line.
    start_urls = open(0).read().splitlines()

    # One SiteInfoSpider job per URL.
    jobs = [Job(SiteInfoSpider, url=url) for url in start_urls]

    # Run every job with the project's Scrapy settings.
    processor = Processor(get_project_settings())
    data = processor.run(jobs)

    # Each result is expected to carry its payload under '_values'.
    for item in data:
        print(item['_values'])
Exemplo n.º 8
0
def run_crawler(url, numofpages):
    """Run CrawlerSpider for *url* using the project's Scrapy settings.

    ``url`` and ``numofpages`` are forwarded to the spider as keyword
    arguments. The processor's results are discarded; the function always
    returns ``None``.
    """
    crawl_job = Job(CrawlerSpider, url=url, numofpages=numofpages)
    Processor(get_project_settings()).run([crawl_job])
Exemplo n.º 9
0
import scrapy
from scrapyscript import Job, Processor

# Restrict Scrapy's logging to WARNING and above. The Settings object must be
# handed to the Processor to take effect; passing settings=None left it unused.
settings = scrapy.settings.Settings(values={"LOG_LEVEL": "WARNING"})
processor = Processor(settings=settings)


class PythonSpider(scrapy.spiders.Spider):
    """Spider that requests ``self.url`` and reports the page's title text."""

    name = "myspider"

    def start_requests(self):
        """Yield the single request for the configured URL."""
        # self.url is not defined on the class; it is expected to arrive as
        # a spider argument (the url= keyword given to Job).
        yield scrapy.Request(self.url)

    def parse(self, response):
        """Return the first ``<title>`` text node as ``{"title": ...}``."""
        return {"title": response.xpath("//title/text()").extract_first()}


# Describe the crawl: run PythonSpider with url= passed as a spider argument.
job = Job(PythonSpider, url="http://www.python.org")
# A single Job (not a list) is handed to run() here.
results = processor.run(job)

# Show whatever the processor collected from the spider.
print(results)