Exemplo n.º 1
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # collected results, keyed as {name -> shortname}
    found = {}
    handle_url('http://www.arcamax.com/comics', session, found)
    save_result(found, json_file)
Exemplo n.º 2
0
def main(args):
    """Get scraper descriptions from google results."""
    # resume from previously saved results if available
    result = load_result(json_file) if os.path.isfile(json_file) else {}
    # optional class name to skip ahead to
    wanted = args[0] if args else None
    for cls in sorted(get_scraperclasses(), key=classname):
        name = classname(cls)
        if wanted and name != wanted:
            continue
        wanted = None  # requested class found; process everything from here on
        if '_' in name:
            continue  # skip variant classes
        print(name)
        # skip classes that are already documented or already looked up
        if cls.description or name in result:
            continue
        url = get_scraper_url(cls)
        print(url)
        description = get_description(url, cls.lang)
        if not description:
            print("No description found")
            continue
        print(description)
        # store and persist immediately so progress survives interruption
        result[name] = dict(description=description, module=cls.__module__, url=url)
        save_result(result, json_file)
    return 0
Exemplo n.º 3
0
def main(args):
    """Get scraper descriptions from google results."""
    if os.path.isfile(json_file):
        # pick up where an earlier run left off
        result = load_result(json_file)
    else:
        result = {}
    target = args[0] if args else None
    for scraperclass in sorted(get_scraperclasses(), key=classname):
        key = classname(scraperclass)
        if target and key != target:
            continue
        # once the target is reached, stop filtering
        target = None
        if '_' in key:
            # underscore marks variant classes; skip them
            continue
        print(key)
        if scraperclass.description:
            # already has a description in the source
            continue
        if key in result:
            # already fetched in a previous run
            continue
        url = get_scraper_url(scraperclass)
        print(url)
        description = get_description(url, scraperclass.lang)
        if description:
            print(description)
            # record the hit and save right away
            result[key] = dict(
                description=description,
                module=scraperclass.__module__,
                url=url,
            )
            save_result(result, json_file)
        else:
            print("No description found")
    return 0
Exemplo n.º 4
0
def get_results():
    """Parse all search result pages."""
    # mapping of comic name -> shortname
    comics = {}
    session = requests.Session()
    start_url = 'http://www.arcamax.com/comics'
    handle_url(start_url, session, comics)
    save_result(comics, json_file)
Exemplo n.º 5
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # results are accumulated as {name -> shortname}
    collected = {}
    handle_url("http://keenspot.com/", session, collected)
    save_result(collected, json_file)
Exemplo n.º 6
0
def get_results():
    """Parse all search result pages."""
    start_page = 'http://keenspot.com/'
    session = requests.Session()
    # comic name -> shortname
    entries = {}
    handle_url(start_page, session, entries)
    save_result(entries, json_file)
Exemplo n.º 7
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # comic name -> shortname
    comics = {}
    url_template = 'http://guide.comicgenesis.com/Keenspace_%s.html'
    # index pages are split alphabetically, plus one page ('0') for digits
    for letter in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        handle_url(url_template % letter, session, comics)
    save_result(comics, json_file)
Exemplo n.º 8
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # comic name -> shortname
    found = {}
    # the three GoComics index pages: regular features, editorial, sherpa
    for index_url in (
        'http://www.gocomics.com/features',
        'http://www.gocomics.com/explore/editorial_list',
        'http://www.gocomics.com/explore/sherpa_list',
    ):
        handle_url(index_url, session, found)
    save_result(found, json_file)
Exemplo n.º 9
0
def get_results():
    """Parse all search result pages."""
    # one alphabetical index page per character; '0' collects the digits
    index_chars = "0ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    template = "http://guide.comicgenesis.com/Keenspace_%s.html"
    session = requests.Session()
    # comic name -> shortname
    entries = {}
    for ch in index_chars:
        handle_url(template % ch, session, entries)
    save_result(entries, json_file)
Exemplo n.º 10
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # comic name -> shortname
    comics = {}
    # walk the three GoComics listing pages in order
    handle_url('http://www.gocomics.com/features', session, comics)
    handle_url('http://www.gocomics.com/explore/editorial_list', session, comics)
    handle_url('http://www.gocomics.com/explore/sherpa_list', session, comics)
    save_result(comics, json_file)
Exemplo n.º 11
0
def get_results():
    """Parse all search result pages."""
    session = requests.Session()
    # comic name -> shortname
    entries = {}
    # empty-query search URL; the page number is appended per request
    search = ('http://comicfury.com/search.php?search=1'
              '&webcomics=Search+for+webcomics&query=&worder=5&asc=1'
              '&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1'
              '&all_la=1&page=')
    last_page = 382  # known number of result pages for the empty search
    for page_no in range(1, last_page + 1):
        handle_url(search + str(page_no), session, entries)
    save_result(entries, json_file)
Exemplo n.º 12
0
def get_results():
    """Parse all search result pages."""
    # empty-query search, sorted; the page number gets appended below
    base = 'http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1&all_la=1&page='
    total_pages = 382
    session = requests.Session()
    # comic name -> shortname
    comics = {}
    for page in range(1, total_pages + 1):
        handle_url(f"{base}{page}", session, comics)
    save_result(comics, json_file)
Exemplo n.º 13
0
def get_results():
    """Parse all search result pages."""
    # empty search, 12 results per page; %d is the result offset
    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
    session = requests.Session()
    # {name -> url, number of comics, adult flag, bounce flag}
    comics = {}
    # a search for an empty string returned 286 result pages
    result_pages = 286
    print(f"Parsing {result_pages} search result pages...", file=sys.stderr)
    for page_index in range(result_pages):
        # progress indicator (1-based)
        print(page_index + 1, file=sys.stderr, end=" ")
        handle_url(base % (page_index * 12), session, comics)
    save_result(comics, json_file)
Exemplo n.º 14
0
def get_results():
    """Parse all search result pages."""
    base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
    # link to the comic page and its page-count label
    link_matcher = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    count_matcher = re.compile(r'(\d+) pages?</span>')
    # {name -> number of comics}
    comics = {}
    # a search for an empty string returned 825 result pages
    result_pages = 825
    print(f"Parsing {result_pages} search result pages...", file=sys.stderr)
    session = requests.Session()
    for page in range(1, result_pages + 1):
        # progress indicator
        print(page, file=sys.stderr, end=" ")
        handle_url(base % page, session, link_matcher, count_matcher, comics)
    save_result(comics, json_file)
Exemplo n.º 15
0
def get_results():
    """Parse all search result pages."""
    search_url = "http://www.theduckwebcomics.com/search/?page=%d&search=&type=0&type=1&last_update="
    # matches the comic link and the "<N> pages" label on a result row
    href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    num = re.compile(r'(\d+) pages?</span>')
    session = requests.Session()
    # {name -> number of comics}
    found = {}
    # a search for an empty string returned 825 result pages
    total = 825
    print("Parsing", total, "search result pages...", file=sys.stderr)
    for page_no in range(1, total + 1):
        print(page_no, file=sys.stderr, end=" ")  # progress
        handle_url(search_url % page_no, session, href, num, found)
    save_result(found, json_file)
Exemplo n.º 16
0
def get_results():
    """Parse all search result pages."""
    # {name -> shortname}
    found = {}
    session = requests.Session()
    # Sorted by page count descending, so we can stop once a page's
    # comics fall below the MIN_COMICS threshold.
    query = ('http://comicfury.com/search.php?search=1&webcomics=1&query='
             '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1'
             '&all_st=1&all_la=1&page=%d')
    print("Parsing search result pages...", file=sys.stderr)
    page_no = 1
    count = 999  # sentinel so the loop body runs at least once
    while count >= MIN_COMICS:
        count = handle_url(query % page_no, session, found)
        page_no += 1
        print(count, file=sys.stderr, end=" ")  # progress
    save_result(found, json_file)
Exemplo n.º 17
0
def main(args):
    """Get scraper descriptions from google results."""
    # load previously fetched descriptions, if any
    result = load_result(json_file) if os.path.isfile(json_file) else {}
    for name, info in sorted(result.items()):
        # skip classes that are already documented or are variants
        if has_description(name) or '_' in name:
            continue
        # skip entries the user already rejected in a previous run
        if info.get('answer') == 'no':
            continue
        if not answer(name, info):
            # remember the rejection so we never ask again
            info['answer'] = 'no'
            save_result(result, json_file)
            continue
        # rewrite the module file with the description inserted
        filename = info['module'].replace('.', os.sep) + ".py"
        encoding = get_encoding(filename)
        tmpname = filename + "_"
        with codecs.open(filename, 'r', encoding) as src, \
                codecs.open(tmpname, 'w', encoding) as dst:
            write_description(src, dst, name, info)
        # atomically swap the rewritten file into place
        os.rename(tmpname, filename)
    return 0
Exemplo n.º 18
0
def main(args):
    """Get scraper descriptions from google results."""
    if os.path.isfile(json_file):
        result = load_result(json_file)
    else:
        result = {}
    for cls_name, info in sorted(result.items()):
        if has_description(cls_name) or '_' in cls_name:
            # documented already, or a variant class
            continue
        if info.get('answer') == 'no':
            # previously rejected by the user
            continue
        if not answer(cls_name, info):
            # persist the rejection immediately
            info['answer'] = 'no'
            save_result(result, json_file)
            continue
        # derive the module's file path from its dotted name
        source_path = info['module'].replace('.', os.sep) + ".py"
        encoding = get_encoding(source_path)
        # write to a sibling temp file, then replace the original
        with codecs.open(source_path, 'r', encoding) as f:
            with codecs.open(source_path + "_", 'w', encoding) as out:
                write_description(f, out, cls_name, info)
        os.rename(source_path + "_", source_path)
    return 0