Example #1
def get_results():
    """Parse all search result pages."""
    base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
    href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    num = re.compile(r'(\d+) pages?</span>')
    # store info in a dictionary {name -> number of comics}
    res = {}
    # a search for an empty string returned 825 result pages
    result_pages = 825
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(1, result_pages + 1):
        print(i, file=sys.stderr, end=" ")
        handle_url(base % i, href, num, res)
    save_result(res)
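Every example on this page builds its patterns with dosagelib.util.tagre. The stand-in below is only a rough sketch of what such a helper is assumed to produce (the real implementation is more careful about whitespace, quoting and attribute order); it exists purely to show what the href matcher from the example above is expected to pull out of a link.

# tagre_sketch is a simplified, hypothetical stand-in for dosagelib.util.tagre:
# it builds a pattern for <tag ... attribute="value" ...>, where `before` must
# appear inside the tag before the attribute and `after` must appear after it.
import re

def tagre_sketch(tag, attribute, value, quote='"', before="", after=""):
    pattern = r"<%s\s[^>]*?" % tag
    if before:
        pattern += r"%s[^>]*?" % re.escape(before)
    pattern += r"%s=%s%s%s" % (attribute, quote, value, quote)
    if after:
        pattern += r"[^>]*?%s" % re.escape(after)
    return pattern + r"[^>]*?>"

# Applied to hypothetical markup, the href matcher captures the comic path:
href = re.compile(tagre_sketch("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
m = href.search('<a class="size24 yanone blue" href="/Some_Comic/">Some Comic</a>')
print(m.group(1) if m else None)  # -> /Some_Comic/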
Example #2
def get_results():
    """Parse all search result pages."""
    base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
    href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    num = re.compile(r'(\d+) pages?</span>')
    # store info in a dictionary {name -> number of comics}
    res = {}
    # a search for an empty string returned 825 result pages
    result_pages = 825
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    session = requests.Session()
    for i in range(1, result_pages + 1):
        print(i, file=sys.stderr, end=" ")
        handle_url(base % i, session, href, num, res)
    save_result(res, json_file)
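save_result and load_result come from scriptutil and are not shown in these snippets; judging by json_file = __file__.replace(".py", ".json"), they presumably just persist the name-to-page-count dict as JSON. A minimal sketch under that assumption (hypothetical helpers, not the scriptutil code):

# Hypothetical equivalents of scriptutil.save_result / load_result (assumption:
# the result dict is simply serialized to the JSON file next to the script).
import codecs
import json

def save_result_sketch(res, json_file):
    with codecs.open(json_file, "w", encoding="utf-8") as f:
        json.dump(res, f, sort_keys=True, indent=2)

def load_result_sketch(json_file):
    with codecs.open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)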
Example #3
 def test_regex(self):
     matcher = re.compile(tagre("img", "src", '(%s[^"]*)' % self.ValuePrefix))
     for tag, value, domatch in self.TagTests:
         self.match_tag(matcher, tag, value, domatch)
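The method above refers to class attributes that the snippet does not show. A hypothetical minimal test class around it could look like the following; match_tag is assumed to assert that the compiled pattern matches exactly when domatch is true and that group(1) recovers the expected value (the attribute values here are made up, not the project's real test data).

import re
import unittest

from dosagelib.util import tagre


class TestTagre(unittest.TestCase):
    # made-up test data: (tag text, expected captured value, should it match?)
    ValuePrefix = '/bla/'
    TagTests = (
        ('<img src="/bla/foo.png">', '/bla/foo.png', True),
        ('<img src="/other/foo.png">', None, False),
    )

    def match_tag(self, matcher, tag, value, domatch):
        match = matcher.search(tag)
        if domatch:
            self.assertTrue(match, tag)
            self.assertEqual(match.group(1), value)
        else:
            self.assertFalse(match, tag)

    def test_regex(self):
        matcher = re.compile(tagre("img", "src", '(%s[^"]*)' % self.ValuePrefix))
        for tag, value, domatch in self.TagTests:
            self.match_tag(matcher, tag, value, domatch)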
Example #4
from __future__ import print_function
import codecs
import re
import sys
import os
import requests

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r"([^<]+)</a>")
num_matcher = re.compile(r"Number of Days: (\d+)")
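For illustration, the two matchers above applied to a shortened version of the sample markup from the comment: group(1) of url_matcher is the comic URL and group(2) its title, while num_matcher picks up the day count.

# Illustration only: exercising the matchers above on sample-style markup.
sample = ('<div class="comictitle"><strong><a target="_blank" '
          'href="http://collegepros.comicgenesis.com">'
          'Adventures of the College Pros</a>')
m = url_matcher.search(sample)
if m:
    print(m.group(1))  # http://collegepros.comicgenesis.com
    print(m.group(2))  # Adventures of the College Pros
m = num_matcher.search("Number of Days: 1234")
if m:
    print(m.group(1))  # 1234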

# names of comics to exclude
exclude_comics = [
    "10",  # page is gone
    "54sinRed",  # page is 403 forbidden
    "6D4",  # redirected to another page
    "AaaSoCAwesomenessandaSliceofCheese",  # broken images
    "AcrossthePond",  # page moved
    "ACDeceptibotscomic",  # no images
    "AdamandSei",  # page has 403 forbidden
    "AdamsRoadGang",  # page is gone
    "ADVENTURERS",  # page is gone
    "AiYaiYai",  # page moved
    "AlltheCommies",  # missing images
Example #5
import re
import sys
import os

import requests

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name


json_file = __file__.replace(".py", ".json")


url_matcher = re.compile(
    tagre("td", "onmouseover", r'([^"]+)')
    + tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*')
    + r"(?:<b>)?([^<]+)(?:</b>)?</a>"
)
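The matcher above captures three groups per row: group(1) the onmouseover script, group(2) the comic's keenspot.com base URL, and group(3) its display name. A quick check against hypothetical markup (not real Keenspot HTML):

# Illustration only, on made-up markup shaped like what the pattern expects.
row = ('<td onmouseover="showtip(\'Some Comic\')">'
       '<a href="http://somecomic.keenspot.com/d/20190101.html">'
       '<b>Some Comic</b></a>')
m = url_matcher.search(row)
if m:
    print(m.group(1))  # showtip('Some Comic')
    print(m.group(2))  # http://somecomic.keenspot.com/
    print(m.group(3))  # Some Comic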


# names of comics to exclude
exclude_comics = [
    "BrawlintheFamily",  # non-standard navigation
    "CrowScare",  # non-standard navigation
    "Dreamless",  # non-standard navigation
    "EV",  # non-standard navigation
    "Exposure",  # non-standard navigation
    "Flipside",  # non-standard navigation
    "HerobyNight",  # non-standard navigation
    "JadeWarriors",  # non-standard navigation
Example #6
 def test_regex(self):
     matcher = re.compile(
         tagre("img", "src", '(%s[^"]*)' % self.ValuePrefix))
     for tag, value, domatch in self.TagTests:
         self.match_tag(matcher, tag, value, domatch)
Example #7
"""
from __future__ import print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

url_matcher = re.compile(
    tagre("td", "onmouseover", r'([^"]+)') +
    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
    r"(?:<b>)?([^<]+)(?:</b>)?</a>")

# names of comics to exclude
exclude_comics = [
    "BrawlintheFamily",  # non-standard navigation
    "CrowScare",  # non-standard navigation
    "Dreamless",  # non-standard navigation
    "EV",  # non-standard navigation
    "Exposure",  # non-standard navigation
    "Flipside",  # non-standard navigation
    "HerobyNight",  # non-standard navigation
    "JadeWarriors",  # non-standard navigation
    "LastBlood",  # non-standard navigation
    "MysticRevolution",  # non-standard navigation
Example #8
"""
from __future__ import print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

#<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(
    tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")

# names of comics to exclude
exclude_comics = [
    "Angryprogrammer",  # unavailable
    "Complex",  # "coming soon"
    "Guinness",  # "coming soon"
    "Jabberwoncky",  # "coming soon"
    "KickyBrand",  # unavailable
    "Penmanship",  # unavailable
    "RandysRationale",  # "coming soon"
    "SaturdayMorningBreakfastCereal",  # duplicate
    "SignsOfOurTimes",  # "coming soon"
    "TheGagwriter",  # "coming soon"
    "Yaoyao",  # "coming soon"
]
Example #9
"""
from __future__ import print_function
import re
import codecs
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

url_matcher = re.compile(
    tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')

# names of comics to exclude
exclude_comics = []


def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1)
Example #10
"""
from __future__ import print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')

# names of comics to exclude
exclude_comics = [
    "10", # page is gone
    "54sinRed", # page is 403 forbidden
    "6D4", # redirected to another page
    "AaaSoCAwesomenessandaSliceofCheese", # broken images
    "AcrossthePond", # page moved
    "ACDeceptibotscomic", # no images
    "AdamandSei", # page has 403 forbidden
    "AdamsRoadGang", # page is gone
    "ADVENTURERS", # page is gone
    "AiYaiYai", # page moved
    "AlltheCommies", # missing images
Example #11
 def test_regex(self, tag, value, domatch):
     matcher = re.compile(
         tagre("img", "src", '(%s[^"]*)' % self.ValuePrefix))
     self.match_tag(matcher, tag, value, domatch)
Example #12
Script to get a list of creators.com comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import codecs
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'<strong>([^<]+)</strong>')

# names of comics to exclude
exclude_comics = [
]

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1)
Example #13
Script to get a list of gocomics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

#<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")

# names of comics to exclude
exclude_comics = [
    "Adagio", # too few comics
    "AgentGates", # too few comics
    "Apocalypseharry", # too few comics
    "BatkidandBatrat", # too few comics
    "BETWEENTHELINES", # comic unavailable
    "Bonner", # missing page
    "Buster", # comic unavailabe
    "CarteBlanche", # missing images
    "Critterdoodles", # missing images
    "CountyLine", # too few comics
    "Crawdiddy", # comic unavailable
    "DALTONDOG", # comic unavailable
Example #14
import re
import sys
import os

import requests

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)

json_file = __file__.replace(".py", ".json")

# <div class="comictitle"><strong><a target="_blank"
# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
# Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' +
                         tagre("a", "href", r'(http://[^"]+)') +
                         r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')

# names of comics to exclude
exclude_comics = [
    "10", # page is gone
    "54sinRed", # page is 403 forbidden
    "6D4", # redirected to another page
    "AaaSoCAwesomenessandaSliceofCheese", # broken images
    "AcrossthePond", # page moved
    "ACDeceptibotscomic", # no images
    "AdamandSei", # page has 403 forbidden
    "AdamsRoadGang", # page is gone
    "ADVENTURERS", # page is gone
    "AiYaiYai", # page moved
Example #15
    "TPTruePower",
    "TwoKeys",
    "UndertheSkin",
    "WelcometoFreakshow",
    "Whenweweresilent",
    "WhiteHeart",
    "Yaoishereforareason",
    "Zodiac",
]

# links to last valid strips
url_overrides = {
}

# HTML content matcher
page_matcher = re.compile(
    tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
    tagre("img", "title", r'([^"]+)'))
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
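The remaining matchers above are narrower: num_matcher pulls the strip count out of a width="50%" table cell and adult_matcher merely detects the mature-content badge. A brief illustration on hypothetical fragments:

# Illustration only, on made-up page fragments.
m = num_matcher.search('<td width="50%">\n    321\n    </td>')
if m:
    print(m.group(1))  # 321
print(bool(adult_matcher.search(
    '<img src="http://www.smackjeeves.com/images/mature_content.png">')))  # True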

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
Example #16
"""
from __future__ import print_function
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' +
                         tagre("a", "href", r'(http://[^"]+)') +
                         r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')

# names of comics to exclude
exclude_comics = [
    "10",  # page is gone
    "54sinRed",  # page is 403 forbidden
    "6D4",  # redirected to another page
    "AaaSoCAwesomenessandaSliceofCheese",  # broken images
    "AcrossthePond",  # page moved
    "ACDeceptibotscomic",  # no images
    "AdamandSei",  # page has 403 forbidden
    "AdamsRoadGang",  # page is gone
    "ADVENTURERS",  # page is gone
    "AiYaiYai",  # page moved
Example #17
Script to get a list of gocomics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst

json_file = __file__.replace(".py", ".json")

#<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")

# names of comics to exclude
exclude_comics = [
    "FrikkFrakkAndFrank", # too few comics
    "Apocalypseharry", # too few comics
    "BatkidandBatrat", # too few comics
    "BETWEENTHELINES", # comic unavailable
    "Bonner", # missing page
    "Buster", # comic unavailabe
    "DALTONDOG", # comic unavailable
    "DellAndSteve", # too few comics
    "Dilbert", # redirect
    "InkeeDoodles", # comic unavailable
    "MaggiesComics", # too few comics
    "OfMiceandMud", # too few comics
Example #18
 def test_regex(self, tag, value, domatch):
     matcher = re.compile(tagre("img", "src", '(%s[^"]*)' %
                          self.ValuePrefix))
     self.match_tag(matcher, tag, value, domatch)