def handleStartpage(response, job, spider): try: if job._ADD['sitetype'] == "region": fed = job._ADD['fed'] regionname = job._ADD['regionname'] root = response.root() base_href = root.xpath("//base/@href") if base_href: base_href = base_href[0] if not base_href.startswith('http'): base_href = "https:" + base_href else: base_href = None root.make_links_absolute(base_href) for group in root.xpath("//div[@class='well contest-group']"): group_class = group.xpath(".//h5/text()")[0] for link in group.xpath(".//a[@href]"): spider.addToQueue( CrawlJob(link.get("href"), sitetype="group", groupname=link.text_content().strip(), group_class=group_class, fed=fed, regionname=regionname)) logger.warning(link.get("href")) return True except: return False
def handleStartpage(response, job, spider):
    # Handler for federation ("verband") pages: queue one "region" job per
    # championship link, plus the extra button link below the list if
    # present.
    try:
        if job._ADD['sitetype'] == "verband":
            fed = job._ADD['fed']
            root = response.root()
            base_href = root.xpath("//base/@href")
            if base_href:
                base_href = base_href[0]
                if not base_href.startswith('http'):
                    base_href = "https:" + base_href
            else:
                base_href = None
            root.make_links_absolute(base_href or job.url)
            for link in root.xpath(
                    "//div[@data-match-height='championships']//a[@href]"):
                href = link.get("href")
                # The region handler reads job._ADD['fed'], so the
                # federation must be passed along here as well.
                spider.addToQueue(
                    CrawlJob(href,
                             sitetype="region",
                             regionname=link.text_content().strip(),
                             fed=fed))
                logger.warning(href)
            buttons = root.xpath(
                "//div[@class='row m-l text-center']"
                "//a[@class='btn btn-primary']")
            if buttons:
                link = buttons[0]
                spider.addToQueue(
                    CrawlJob(link.get("href"),
                             sitetype="region",
                             regionname=link.text_content().strip(),
                             fed=fed))
                logger.warning(link.get("href"))
            return True
    except Exception:
        return False
def seeds():
    # Build the initial crawl jobs: one club-search page per federation.
    jobs = []
    logger = logging.getLogger("logger")
    for federation in ['WTTV', 'DTTB', 'TTVBW', 'TTVWH', 'TTVB', 'BTTV',
                       'FTTB', 'HeTTV', 'TTVMV', 'TTVN', 'RTTV', 'TTVR',
                       'PTTV', 'STTB', 'TTVSA', 'TTTV', 'TTVSH', 'SbTTV',
                       'BaTTV']:
        # click-tt serves the same data from several hosts; pick one at
        # random to spread the load.
        host = random.choice(['wttv', 'dttb', 'rttv', 'ttvn'])
        url = ('http://{}.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa'
               '/wa/clubSearch?federation={}').format(host, federation)
        logger.debug(url)
        jobs.append(CrawlJob(url, fed=federation, typ='seed'))
    return jobs
def crawlOverview(r, job, spider, crawled):
    # Handler for seed pages: follow every club-search result link exactly
    # once, carrying the regionName query parameter into the new job.
    if job._ADD['typ'] == 'seed':
        root = r.root()
        root.make_links_absolute(job.url)
        for link in root.xpath("//div[@id='content-row1']//a/@href"):
            if link not in crawled:
                crawled[link] = True
                query = parse_qs(urlparse(link).query)
                region = query['regionName'][0]
                spider.addToQueue(
                    CrawlJob(link,
                             typ="region",
                             fed=job._ADD['fed'],
                             region=region))
        return True
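# For reference, a standalone sketch of the query-string extraction used in
# crawlOverview(). The URL below is made up for illustration; only the
# regionName parameter matters here.
def _demo_region_from_query():
    from urllib.parse import urlparse, parse_qs

    link = ("http://wttv.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa"
            "/wa/clubSearch?federation=WTTV&regionName=Example+Region")
    query = parse_qs(urlparse(link).query)
    # parse_qs maps each parameter to a list of values and decodes '+'
    # back to a space.
    assert query['regionName'][0] == "Example Region"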
def seeds():
    # Build crawl jobs from clubs.jsonl: one clubInfoDisplay page per
    # unique club id, attaching the full club record to the job.
    jobs = []
    clubids = set()
    counter = 0
    with open("clubs.jsonl", 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            if data['clubid'] in clubids:
                continue
            clubids.add(data['clubid'])
            counter += 1
            host = random.choice(['wttv', 'dttb', 'rttv', 'ttvn'])
            url = ('http://{}.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa'
                   '/wa/clubInfoDisplay?club={}').format(host, data['clubid'])
            jobs.append(CrawlJob(url, club=data))
    print('{} Todos'.format(counter))
    return jobs
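# For reference, a hypothetical clubs.jsonl record in the shape seeds()
# expects; only "clubid" is read here, while the whole object travels with
# the job as club metadata:
#
#   {"clubid": "12345", "name": "Example TTC"}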
def handleStartpage(response, job, spider): if job._ADD['sitetype'] == "start": root = response.root() base_href = root.xpath("//base/@href") if base_href: base_href = base_href[0] if not base_href.startswith('http'): base_href = "https:" + base_href else: base_href = None root.make_links_absolute(base_href) for link in root.xpath( "//a[.//h5[@data-mh='federation-link-headline']]"): l = link.get("href") if not l.startswith("https"): l = "https:" + l spider.addToQueue( CrawlJob(l, sitetype="verband", fed=link.text_content().strip())) logger.warning(l) return True
def init():
    # Single entry point for the crawl: the mytischtennis.de federation
    # overview page.
    return [CrawlJob("https://www.mytischtennis.de/clicktt/verbaende",
                     sitetype="start")]
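# A minimal sketch of how these pieces could be wired together, assuming a
# simple synchronous spider. The Spider and Response classes and the fetch
# via requests below are illustrative stand-ins for the real crawler
# framework, which is not part of this file; only CrawlJob's observable
# interface (a .url attribute plus keyword metadata in ._ADD) is taken from
# the code above. Each handler checks the job's sitetype/typ and returns
# True once it has handled the job, so several handlers can be registered
# side by side.
import requests
from lxml import html


class CrawlJob:
    # Stand-in: the real CrawlJob stores the URL plus arbitrary keyword
    # metadata, exposed to handlers as job._ADD.
    def __init__(self, url, **kwargs):
        self.url = url
        self._ADD = kwargs


class Response:
    # Stand-in: wraps a fetched page and exposes its parsed lxml root.
    def __init__(self, text):
        self._root = html.fromstring(text)

    def root(self):
        return self._root


class Spider:
    def __init__(self, handlers):
        self.queue = []
        self.handlers = handlers

    def addToQueue(self, job):
        self.queue.append(job)

    def run(self):
        for job in init():
            self.addToQueue(job)
        while self.queue:
            job = self.queue.pop(0)
            response = Response(requests.get(job.url).text)
            # Offer the job to each handler until one accepts it.
            for handler in self.handlers:
                if handler(response, job, self):
                    break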