def _read_url(url_u):
    """Download url_u and return the raw response body."""
    handle = urllib.urlopen(decoder.encode(url_u))
    try:
        return handle.read()
    finally:
        # close explicitly instead of leaving the handle to the garbage
        # collector
        handle.close()


def fetch(url_u):
    """Download url_u and return it wrapped in a Retrieved object.

    A url whose host ends in wikipedia.org and whose path starts with
    /wiki/ is rewritten to the article's edit page
    (index.php?title=...&action=edit) before downloading. If the downloaded
    body contains a #REDIRECT [[Target]] directive, the target's edit page
    is downloaded instead; only one redirect is followed.

    Returns Retrieved(txt_byte, url_u). For wiki articles url_u is the
    /wiki/ page url (of the redirect target, if there was one), not the
    edit url that was actually downloaded.

    Side effects: sets urllib.URLopener.version and the process-wide
    default socket timeout (120) on every call.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    edit_url_fmt = u'http://%s/w/index.php?title=%s&action=edit'

    # are we fetching from pedia?
    # pageurl_u stays None unless the url is recognised as a wiki article;
    # it then holds the /wiki/ url to report back to the caller.
    pageurl_u = None
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            article = match.group(1)
            pageurl_u = url_u  # backup pageurl
            url_u = edit_url_fmt % (parse_obj.netloc, article)
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = _read_url(url_u)

    # if wiki, detect redirect (only one)
    if pageurl_u is not None:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            # turn the redirect target into url form: first letter
            # upper-cased, spaces replaced by underscores
            article = match.group(1)
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)

            # backup pageurl
            pageurl_u = u'http://%s/wiki/%s' % (parse_obj.netloc, article)
            url_u = edit_url_fmt % (parse_obj.netloc, article)
            io.message("Detected a wiki redirect to: %s" %
                       display_url(url_u))
            txt_byte = _read_url(url_u)

        # report the /wiki/ url rather than the edit url
        url_u = pageurl_u

    return Retrieved(txt_byte, url_u)
def find_next(url_u, web, handler=None):
    """Spider the page at url_u and return the next url picked from it.

    If handler is truthy it is called with url_u before the page is
    downloaded. The page is fetched with get_page(), candidate urls are
    collected by find_urls_in_page(), and pick_url() chooses among them
    using the encoding detected from the page bytes.
    """
    encoded_url = decoder.encode(url_u)

    if handler:
        io.output("Running handler with page: %s" % encoded_url)
        handler(url_u)

    io.output("Spidering page: %s" % encoded_url)
    page_bytes = get_page(encoded_url)
    link_candidates = find_urls_in_page(web, page_bytes, url_u, encoded_url)
    return pick_url(link_candidates,
                    encoding=decoder.detect_encoding(page_bytes))
def url_handler(url_u, dir='/tmp/t'):
    """Fetch url_u, strip its wiki markup and save the text under dir.

    dir is created if it does not exist. The output file is named after
    urlrewrite.url_to_filename(url_u) with a '.txt' suffix. A notice with
    the retrieval time, the url reported by the fetcher and a CC-BY-SA
    license link is appended to the text before it is written.

    Side effect: sets the ORIG_FILENAMES environment variable to "1"
    (presumably consulted by urlrewrite -- TODO confirm).
    """
    if not os.path.isdir(dir):
        os.makedirs(dir)

    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dir, urlrewrite.url_to_filename(url_u)) + '.txt'

    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)

    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n %s" % ('-' * 78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice

    txt_byte = decoder.encode(txt_u)
    # the context manager closes (and flushes) the file even if write() fails
    with open(filename, 'w') as out_file:
        out_file.write(txt_byte)
def url_handler(url_u, dir='/tmp/t'):
    """Download url_u, convert it to plain text and write it to a file in dir.

    The directory is created when missing. The file name is
    urlrewrite.url_to_filename(url_u) plus '.txt'. Before writing, a footer
    is appended containing a separator line, the local retrieval time, the
    url returned by fetcher.fetch() and a CC-BY-SA license reference.

    Side effect: the ORIG_FILENAMES environment variable is set to "1"
    (presumably read by urlrewrite -- TODO confirm).
    """
    if not os.path.isdir(dir):
        os.makedirs(dir)

    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dir, urlrewrite.url_to_filename(url_u)) + '.txt'

    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)

    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n %s" % ('-'*78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice

    txt_byte = decoder.encode(txt_u)
    # use a context manager so the handle is always closed after writing
    with open(filename, 'w') as out_file:
        out_file.write(txt_byte)
def display_url(url_u):
    """Return url_u encoded via decoder.encode() with %-escapes decoded."""
    # unquote gets rid of %20 etc so the url reads naturally in messages
    return urllib.unquote(decoder.encode(url_u))
# NOTE(review): the next three statements are the tail of a function whose
# `def` line lies outside the visible source; they are kept unchanged.
    filter_mediawiki = mediawiki.MediawikiFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    return txt_u


def unwiki(txt_u):
    """Run txt_u through the mediawiki and html filters and return the result.

    The steps are applied in this order: MediawikiFilter.get_wiki_body,
    HtmlFilter.resolve_specialchars, MediawikiFilter.unmarkup,
    HtmlFilter.unmarkup. What each filter does is defined in the mediawiki
    and html modules, not here.
    """
    filter_mediawiki = mediawiki.MediawikiFilter()
    filter_html = html.HtmlFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    txt_u = filter_html.resolve_specialchars(txt_u)
    txt_u = filter_mediawiki.unmarkup(txt_u)
    txt_u = filter_html.unmarkup(txt_u)
    return txt_u


if __name__ == "__main__":
    # Manual check: fetch a fixed Wikipedia edit page, strip the markup and
    # print the encoded result.
    import decoder
    import fetcher
    ret = fetcher.fetch('http://en.wikipedia.org/w/index.php?title=Linguistics&action=edit')
    txt_u = decoder.detect_decode(ret.txt_byte)
    # falls back to unhtml() when unwiki() returns something falsy
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()
    # NOTE(review): everything below is unreachable because of the
    # sys.exit() above; it looks like an alternative mode that reads the
    # input from the file named in sys.argv[1] -- confirm which is wanted.
    txt_byte = open(sys.argv[1]).read()
    txt_u = decoder.detect_decode(txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()
def message(s):
    """Write s to stderr, newline-terminated, and flush.

    s is passed through decoder.encode() first. A trailing newline is added
    when the encoded text does not already end with one; an empty string
    therefore produces a single newline.
    """
    s = decoder.encode(s)
    # endswith() is safe on an empty string, where s[-1] raised IndexError
    if not s.endswith('\n'):
        s += '\n'
    sys.stderr.write(s)
    sys.stderr.flush()
def output(s):
    """Write s to stdout, newline-terminated, and flush.

    s is passed through decoder.encode() first. A trailing newline is added
    when the encoded text does not already end with one; an empty string
    therefore produces a single newline.
    """
    s = decoder.encode(s)
    # endswith() is safe on an empty string, where s[-1] raised IndexError
    if not s.endswith('\n'):
        s += '\n'
    sys.stdout.write(s)
    sys.stdout.flush()