Example #1
def handleStartpage(response, job, spider):
    # Queue one "group" job per contest link found on a region page.
    try:
        if job._ADD['sitetype'] == "region":
            fed = job._ADD['fed']
            regionname = job._ADD['regionname']
            root = response.root()
            base_href = root.xpath("//base/@href")
            if base_href:
                base_href = base_href[0]
                if not base_href.startswith('http'):
                    base_href = "https:" + base_href
            else:
                base_href = None
            root.make_links_absolute(base_href)
            for group in root.xpath("//div[@class='well contest-group']"):
                group_class = group.xpath(".//h5/text()")[0]
                for link in group.xpath(".//a[@href]"):
                    spider.addToQueue(
                        CrawlJob(link.get("href"),
                                 sitetype="group",
                                 groupname=link.text_content().strip(),
                                 group_class=group_class,
                                 fed=fed,
                                 regionname=regionname))
                    logger.warning(link.get("href"))

        return True
    except Exception:
        # Any unexpected parse error fails the job.
        return False
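
Every snippet here leans on two names the excerpts never define: the project's CrawlJob class and a module-level logger. Reconstructed purely from how they are used in these examples, a minimal stand-in could look like the sketch below; the url attribute and the _ADD attribute come straight from the snippets, everything else is an assumption.

import logging

logger = logging.getLogger("logger")  # Example #3 creates a logger under this name

class CrawlJob:
    # Minimal stand-in reconstructed from usage; the real class is not
    # shown in these excerpts and may carry more state.
    def __init__(self, url, **metadata):
        self.url = url        # Example #4 reads job.url
        self._ADD = metadata  # handlers read job._ADD['sitetype'], job._ADD['fed'], ...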
Example #2
def handleStartpage(response, job, spider):
    # On a federation ("verband") page, queue one "region" job per
    # championship link plus the single highlighted button link.
    try:
        if job._ADD['sitetype'] == "verband":
            fed = job._ADD['fed']
            root = response.root()
            base_href = root.xpath("//base/@href")
            if base_href:
                base_href = base_href[0]
                if not base_href.startswith('http'):
                    base_href = "https:" + base_href
            else:
                base_href = None
            root.make_links_absolute(base_href)
            for link in root.xpath(
                    "//div[@data-match-height='championships']//a[@href]"):
                l = link.get("href")
                spider.addToQueue(
                    CrawlJob(l,
                             sitetype="region",
                             regionname=link.text_content().strip(),
                             fed=fed))  # the "region" handler reads job._ADD['fed']
                logger.warning(l)
            try:
                link = root.xpath(
                    "//div[@class='row m-l text-center']//a[@class='btn btn-primary']"
                )[0]
                spider.addToQueue(
                    CrawlJob(link.get("href"),
                             sitetype="region",
                             regionname=link.text_content().strip(),
                             fed=fed))
                logger.warning(link.get("href"))
            except IndexError:
                # No highlighted button on this page; nothing extra to queue.
                pass
        return True
    except Exception:
        return False
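
The <base href> normalisation block above appears verbatim in Examples #1, #2 and #6. If the handlers live in one module, it could be factored into a small helper along the lines of this sketch (resolve_links is a name introduced here for illustration, not part of the original code):

def resolve_links(root):
    # Make every link absolute, honouring an explicit <base href> and
    # upgrading protocol-relative hrefs ("//host/...") to https.
    base_href = root.xpath("//base/@href")
    if base_href:
        base_href = base_href[0]
        if not base_href.startswith('http'):
            base_href = "https:" + base_href
    else:
        base_href = None  # lxml then falls back to the document's own base URL
    root.make_links_absolute(base_href)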
Example #3
import logging
import random

def seeds():
    # One clubSearch seed job per regional table-tennis federation.
    jobs = []
    logger = logging.getLogger("logger")
    for federation in [
            'WTTV', 'DTTB', 'TTVBW', 'TTVWH', 'TTVB', 'BTTV', 'FTTB', 'HeTTV',
            'TTVMV', 'TTVN', 'RTTV', 'TTVR', 'PTTV', 'STTB', 'TTVSA', 'TTTV',
            'TTVSH', 'SbTTV', 'BaTTV'
    ]:
        host = random.choice(['wttv', 'dttb', 'rttv', 'ttvn'])
        url = 'http://{}.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubSearch?federation={}'.format(
            host, federation)
        logger.debug(url)
        job = CrawlJob(url, fed=federation, typ='seed')
        jobs.append(job)
    return jobs
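
For illustration, with federation 'WTTV' and the randomly drawn host 'dttb', the format call above yields:

>>> 'http://{}.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubSearch?federation={}'.format('dttb', 'WTTV')
'http://dttb.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubSearch?federation=WTTV'

The four hosts appear to be mirrors of the same nuLiga application, so rotating them presumably just spreads request load; the path and query are identical on each.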
Example #4
from urllib.parse import urlparse, parse_qs

def crawlOverview(r, job, spider, crawled):
    # Queue one "region" job per overview link; `crawled` acts as a
    # seen-set so each URL is queued only once.
    if job._ADD['typ'] == 'seed':
        root = r.root()
        root.make_links_absolute(job.url)
        for link in root.xpath("//div[@id='content-row1']//a/@href"):
            if link not in crawled:
                crawled[link] = True
                q = urlparse(link).query
                query = parse_qs(q)
                region = query['regionName'][0]
                spider.addToQueue(
                    CrawlJob(link,
                             typ="region",
                             fed=job._ADD['fed'],
                             region=region))
    return True
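
The regionName lookup works because parse_qs maps every query parameter to a list of values. A quick illustration (the URL is made up for the example):

>>> from urllib.parse import urlparse, parse_qs
>>> q = urlparse('http://example.invalid/wa/clubSearch?federation=WTTV&regionName=Bezirk').query
>>> parse_qs(q)
{'federation': ['WTTV'], 'regionName': ['Bezirk']}
>>> parse_qs(q)['regionName'][0]
'Bezirk'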
Example #5
import json
import random

def seeds():
    # One clubInfoDisplay seed job per unique club id in clubs.jsonl.
    jobs = []
    clubids = set()
    counter = 0
    with open("clubs.jsonl", 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            if data['clubid'] in clubids:
                continue
            clubids.add(data['clubid'])
            counter += 1
            host = random.choice(['wttv', 'dttb', 'rttv', 'ttvn'])
            url = 'http://{}.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubInfoDisplay?club={}'.format(host, data['clubid'])
            jobs.append(CrawlJob(url, club=data))

    print('{} Todos'.format(counter))
    return jobs
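
The only field this seeds() reads itself is 'clubid'; the whole decoded record is forwarded on the job as club=data. A clubs.jsonl line might therefore look like the following, where every field except clubid is hypothetical:

{"clubid": "12345", "name": "Example TTC 1950"}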
Example #6
def handleStartpage(response, job, spider):
    # On the start page, queue one "verband" (federation) job per
    # federation headline link.
    if job._ADD['sitetype'] == "start":
        root = response.root()
        base_href = root.xpath("//base/@href")
        if base_href:
            base_href = base_href[0]
            if not base_href.startswith('http'):
                base_href = "https:" + base_href
        else:
            base_href = None
        root.make_links_absolute(base_href)

        for link in root.xpath(
                "//a[.//h5[@data-mh='federation-link-headline']]"):
            l = link.get("href")
            if not l.startswith("https"):
                l = "https:" + l
            spider.addToQueue(
                CrawlJob(l,
                         sitetype="verband",
                         fed=link.text_content().strip()))
            logger.warning(l)
    return True
Example #7
def init():
    # Seed the crawl with the federation overview page.
    job = CrawlJob("https://www.mytischtennis.de/clicktt/verbaende",
                   sitetype="start")
    return [job]
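
Putting the pieces together: init() (or seeds() in the other variants) produces the starting jobs, and each handler checks job._ADD['sitetype'] before doing anything, so a driver can run every registered handler against every response. The framework itself is not shown in these excerpts; the sketch below is a guess at the smallest loop the three-argument handlers would tolerate, with Response and Spider invented for illustration (Example #4's crawlOverview takes an extra `crawled` dict and would need wrapping).

import lxml.html
import requests  # assumption: any HTTP client would do

class Response:
    # Thin wrapper so handlers can call response.root() as they do above.
    def __init__(self, url, text):
        self._root = lxml.html.fromstring(text, base_url=url)

    def root(self):
        return self._root

class Spider:
    # Hypothetical driver; the handlers only ever call spider.addToQueue().
    def __init__(self, handlers):
        self.queue = list(init())  # seeded from Example #7
        self.handlers = handlers

    def addToQueue(self, job):
        self.queue.append(job)

    def run(self):
        while self.queue:
            job = self.queue.pop(0)
            page = Response(job.url, requests.get(job.url).text)
            for handler in self.handlers:
                handler(page, job, self)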