Exemplos de check_robotstxt em Python, exemplos de dosagelib.util.check_robotstxt em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: keenspot.py Projeto: KevinAnthony/dosage

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

Exemplo n.º 2

0

Exibir arquivo

Arquivo: keenspot.py Projeto: BigYesh/dosage

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        mo = descurl_matcher.search(match.group(1))
        desc = get_description(url + mo.group(1), session)
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl+"d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = (comicurl, desc)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: keenspot.py Projeto: Freestila/dosage

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate",
                  repr(name),
                  file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

Exemplo n.º 4

0

Exibir arquivo

Arquivo: comicgenesis.py Projeto: KevinAnthony/dosage

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for comicgenesis", repr(name))
            continue
        else:
            res[name] = (url, num)

Exemplo n.º 5

0

Exibir arquivo

    def collect_results(self):
        # Parse the comic list page
        data = self.get_url('https://www.webtoons.com/en/dailySchedule')

        for comiclink in data.xpath(
                '//a[contains(@class, "daily_card_item")]'):
            comicurl = comiclink.attrib['href']
            name = comiclink.xpath(
                './/div[@class="info"]/p[@class="subj"]')[0].text
            try:
                check_robotstxt(comicurl, self.session)
            except IOError as e:
                print('[%s] INFO: robots.txt denied: %s' % (name, e))
                continue

            self.add_comic(name, comicurl)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: keenspot.py Projeto: littauer/dosagetest

    def collect_results(self):
        """Parse the front page."""
        data = self.get_url('http://keenspot.com/')

        for comiclink in data.xpath('//td[@id]/a'):
            comicurl = comiclink.attrib['href']
            name = comiclink.xpath("string()")
            try:
                if "/d/" not in comicurl:
                    check_robotstxt(comicurl + "d/", self.session)
                else:
                    check_robotstxt(comicurl, self.session)
            except IOError as e:
                print("[%s] INFO: robots.txt denied: %s" % (name, e))
                continue

            self.add_comic(name, comicurl)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: comicgenesis.py Projeto: shartge/dosage

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate",
                  repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for", repr(name))
            continue
        else:
            res[name] = (url, num)