Example #1
import re
import socket
import urllib
import urlparse

# decoder, io and unmarkup are project-local modules; Retrieved is a
# project-local result class.

def fetch(url_u):
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            pageurl_u = url_u # backup pageurl
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # if wiki, detect redirect (only one)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            article = match.group(1)
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)
            # backup pageurl
            pageurl_u = (u'http://%s/wiki/%s' % (parse_obj.netloc, article))
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))

            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # restore the original /wiki/ page url if it was rewritten to the
    # edit url above; if it wasn't, pageurl_u was never bound
    try:
        url_u = pageurl_u
    except UnboundLocalError:
        pass
    retrieved = Retrieved(txt_byte, url_u)

    return retrieved
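The edit-page rewrite inside fetch() needs only the standard library; a minimal, self-contained sketch of just that step (the helper name wiki_edit_url is hypothetical):

import re
import urlparse

def wiki_edit_url(url_u):
    # map http://xx.wikipedia.org/wiki/<article> to its raw edit page
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            return (u'http://%s/w/index.php?title=%s&action=edit' %
                    (parse_obj.netloc, match.group(1)))
    return url_u  # non-wiki urls pass through unchanged

print(wiki_edit_url(u'http://en.wikipedia.org/wiki/Linguistics'))
# http://en.wikipedia.org/w/index.php?title=Linguistics&action=edit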
Example #2
# decoder and io are project-local modules; get_page, find_urls_in_page
# and pick_url are helpers defined elsewhere in the same spider module.

def find_next(url_u, web, handler=None):
    url_byte = decoder.encode(url_u)

    if handler:
        io.output("Running handler with page: %s" % url_byte)
        handler(url_u)

    io.output("Spidering page: %s" % url_byte)
    txt_byte = get_page(url_byte)

    candidates_byte = find_urls_in_page(web, txt_byte, url_u, url_byte)
    encoding = decoder.detect_encoding(txt_byte)
    chosen_u = pick_url(candidates_byte, encoding=encoding)

    return chosen_u
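A hypothetical driver loop for find_next; 'web' is assumed to be the spider's link-graph / visited-set object from the surrounding project:

url_u = u'http://en.wikipedia.org/wiki/Linguistics'
for _ in range(10):   # follow at most ten hops
    url_u = find_next(url_u, web)
    if not url_u:     # pick_url may come up empty; stop then
        break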
Example #3
import os
import time

# urlrewrite, fetcher, decoder and unmarkup are project-local modules.

def url_handler(url_u, dirname='/tmp/t'):
    if not os.path.isdir(dirname):
        os.makedirs(dirname)

    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dirname, urlrewrite.url_to_filename(url_u)) + '.txt'

    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)

    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n  %s" % ('-' * 78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice

    txt_byte = decoder.encode(txt_u)
    with open(filename, 'wb') as f:  # binary mode: txt_byte is encoded bytes
        f.write(txt_byte)
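A hypothetical invocation, assuming the project modules are importable; it would write the unwikified article text plus the license notice under /tmp/t:

url_handler(u'http://en.wikipedia.org/wiki/Linguistics')
# -> /tmp/t/<rewritten-name>.txt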
Example #4
def display_url(url_u):
    url_byte = decoder.encode(url_u)     # decoder is project-local
    url_byte = urllib.unquote(url_byte)  # get rid of %20, %3A etc
    return url_byte
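The percent-decoding step in isolation, standard library only:

import urllib

print(urllib.unquote('New%20York'))            # New York
print(urllib.unquote('Category%3APhonology'))  # Category:Phonology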
Example #5
    # (tail of the preceding filter function; its def line lies outside
    # this excerpt; mediawiki and html are project-local filter modules)
    filter_mediawiki = mediawiki.MediawikiFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    return txt_u

def unwiki(txt_u):
    filter_mediawiki = mediawiki.MediawikiFilter()
    filter_html = html.HtmlFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    txt_u = filter_html.resolve_specialchars(txt_u)
    txt_u = filter_mediawiki.unmarkup(txt_u)
    txt_u = filter_html.unmarkup(txt_u)
    return txt_u


if __name__ == "__main__":
    import sys

    import decoder
    import fetcher

    # live test: fetch a wikipedia edit page and strip its markup
    ret = fetcher.fetch('http://en.wikipedia.org/w/index.php?title=Linguistics&action=edit')
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()

    # unreachable while the sys.exit() above is in place; move that call
    # below this block to test against a local file instead
    txt_byte = open(sys.argv[1]).read()
    txt_u = decoder.detect_decode(txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()
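The fixed filter order in unwiki (extract the wiki body, resolve html entities, strip wiki markup, strip html markup) reads naturally as a function pipeline; a sketch of the same chain as a generic composition (compose is hypothetical, the filter objects are the project-local ones above):

def compose(*funcs):
    def run(txt_u):
        for f in funcs:   # apply each filter in order
            txt_u = f(txt_u)
        return txt_u
    return run

# unwiki is then equivalent to:
# compose(filter_mediawiki.get_wiki_body,
#         filter_html.resolve_specialchars,
#         filter_mediawiki.unmarkup,
#         filter_html.unmarkup)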

Example #6
def message(s):
    # status messages go to stderr so they don't pollute piped output
    s = decoder.encode(s)
    if not s.endswith('\n'):  # also safe when s is empty
        s += '\n'
    sys.stderr.write(s)
    sys.stderr.flush()
Example #7
def output(s):
    # results go to stdout so they can be piped onward
    s = decoder.encode(s)
    if not s.endswith('\n'):  # also safe when s is empty
        s += '\n'
    sys.stdout.write(s)
    sys.stdout.flush()
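message() and output() differ only in their stream: status goes to stderr, data to stdout, so results can be piped onward while progress stays visible on the terminal. A hypothetical session:

output(u'Linguistics\thttp://en.wikipedia.org/wiki/Linguistics')
message(u'fetched 1 page')  # lands on stderr, not in the pipe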