Example #1
from crawler.crawler import Crawler  # assumed import: Examples #4 and #6 use this crawler package with the same API


def main():
    nflcrawler = Crawler()
    seeds = [
        "http://www.nfl.com/teams/roster?team=STL",
        "http://www.nfl.com/teams/roster?team=TEN",
        "http://www.nfl.com/teams/roster?team=WAS",
        "http://www.nfl.com/teams/roster?team=CAR",
        "http://www.nfl.com/teams/roster?team=CLE",
        "http://www.nfl.com/teams/roster?team=JAC",
        "http://www.nfl.com/teams/roster?team=KC",
    ]

    nflcrawler.add_seeds(seeds)

    # Two-hop rule chain: roster pages link to player profile pages,
    # and profile pages link to the players' career-stats pages.
    rules = {
        r"^(http://www.nfl.com/teams/roster)(\?team=[a-zA-Z]+)$": [
            r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$"
        ],
        r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$": [
            r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/careerstats)$"
        ],
    }

    nflcrawler.add_rules(rules)
    nflcrawler.start()
Example #2
from crawler.crawler import Crawler  # assumed import: Examples #4 and #6 use this crawler package with the same API


def main():
    nfltweetcrawler = Crawler()
    seeds = ['http://www.tweeting-athletes.com/index.cfm?CatID=2&People=1']
    
    nfltweetcrawler.add_seeds(seeds)
    
    # Rule chain: the athlete index page links to individual athlete pages,
    # and athlete pages link to per-athlete listing pages (paged with p=N),
    # which in turn link to further pages of the same listing.
    rules = {
        r'^(http://www.tweeting-athletes.com/)(index.cfm\?CatID=2&People=1)$': [
            r'^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$'
        ],
        r'^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$': [
            r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ],
        r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$': [
            r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ],
    }
    
    nfltweetcrawler.add_rules(rules)
    nfltweetcrawler.start()
Example #3
def startCrawler(request):
    try:
        # Look up the requested source and start a crawler for its URL.
        source_id = request.POST.get('id')
        source = Source.objects.get(id=source_id)
        sourceurl = source.url
        crawler = Crawler(sourceurl)
        crawler.start()
        # Register the running crawler instance so it can be managed later.
        runingcrawlers.update({'id': source_id, 'inst': crawler})
        return redirect('dashboard')
    except ObjectDoesNotExist:
        return redirect('dashboard')
Example #4
from crawler.crawler import Crawler

mycrawler = Crawler()
seeds = ['http://www.baidu.com/']  # list of seed URLs
mycrawler.add_seeds(seeds)
rules = {r'^(http://.+baidu\.com)(.+)$': [r'^(http://.+baidu\.com)(.+)$']}
# Your crawling rules are a dictionary: each key is a regular expression for a URL,
# and its value is a list of regular expressions for the URLs you want to follow
# from pages matching that key.
mycrawler.add_rules(rules)
mycrawler.start()  # start crawling
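
The comments above describe the shape of the rules dictionary but not how the crawler applies it. Below is a minimal sketch of one plausible interpretation, assuming that each key regex is matched against the URL of the page being crawled and the value regexes against each outgoing link; links_to_follow is a hypothetical helper written for illustration, not part of the crawler package.

import re

def links_to_follow(rules, page_url, links):
    # Hypothetical helper: for the first key regex that matches the current
    # page URL, keep only the outgoing links that match one of its value regexes.
    for page_pattern, follow_patterns in rules.items():
        if re.match(page_pattern, page_url):
            return [link for link in links
                    if any(re.match(p, link) for p in follow_patterns)]
    return []  # no key matches this page, so no links are followed

# With the rules above, only same-site links would be kept:
# links_to_follow(rules, 'http://www.baidu.com/',
#                 ['http://www.baidu.com/s?wd=python', 'http://example.com/'])
# -> ['http://www.baidu.com/s?wd=python']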
Example #5

if __name__ == "__main__":
    try:
        import http.client as httplib
    except ImportError:
        import httplib

    # Override the default 100-header limit on responses;
    # otherwise our requests to the Washington Post will fail.
    httplib._MAXHEADERS = 1000

    starting_urls = [
        'http://thehill.com/', 'http://www.newsweek.com/',
        'https://www.washingtonpost.com/', 'https://www.wsj.com/',
        'http://thefederalist.com/', 'http://www.cnn.com/',
        'http://foxnews.com/'
    ]

    # Collect article URLs from each site's sitemaps, one level deep.
    urls = []
    for s_url in starting_urls:
        agg_urls = crawl_sitemaps(s_url, max_depth=1)
        urls.extend(agg_urls)

    # Route every crawled page to the save_page handler.
    router = PageRouter()
    router.add_route('.*', save_page)

    c = Crawler(router, url_stack=[u['location'] for u in urls])
    c.max_depth = 1
    c.start()
Example #6
from crawler.crawler import Crawler

mycrawler = Crawler()
seeds = ['http://www.fdprice.com/']  # list of seed URLs
mycrawler.add_seeds(seeds)
rules = {r'^(http://.+fdprice\.com)(.+)$': [r'^(http://.+fdprice\.com)(.+)$']}
# Your crawling rules are a dictionary: each key is a regular expression for a URL,
# and its value is a list of regular expressions for the URLs you want to follow
# from pages matching that key.
mycrawler.add_rules(rules)
mycrawler.start() # start crawling
Example #7
def start_crawler_post_save(sender, instance, created, **kwargs):
    # post_save signal handler: crawl the saved seed URL to the configured
    # depth, store the result on the model instance, and mark it completed.
    crawler = Crawler(instance.seed_url)
    instance.result = crawler.start(instance.depth)
    instance.status = "COMPLETED"
    instance.save()
Example #8
from crawler.crawler import Crawler
import os
import json

url = os.getenv('CRAWLER_TARGET_URL')
output_path = os.getenv('CRAWLER_OUTPUT_PATH')
tags = json.loads(os.getenv('CRAWLER_TARGET_TAGS', '["a", "img", "script"]'))
if not url:
    raise NameError('CRAWLER_TARGET_URL env var not set')
if not output_path:
    raise NameError('CRAWLER_OUTPUT_PATH env var not set')
crawl = Crawler(url, output_path, tags)
crawl.start()
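
This entry point is configured entirely through environment variables. A minimal usage sketch, assuming the snippet above is saved as run_crawler.py (a placeholder file name) and using placeholder values; CRAWLER_TARGET_TAGS is optional and defaults to '["a", "img", "script"]' as shown above.

import os
import runpy

# Placeholder configuration values, not taken from any real deployment.
os.environ['CRAWLER_TARGET_URL'] = 'https://example.com/'
os.environ['CRAWLER_OUTPUT_PATH'] = './crawl-output'
os.environ['CRAWLER_TARGET_TAGS'] = '["a", "img"]'

# run_crawler.py is a hypothetical name for the script above.
runpy.run_path('run_crawler.py')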