Exemplo n.º 1
0
 def upload_or_link(tmpfile):
     """Upload the image in ``tmpfile`` unless it matches the previous one.

     This is called from inside screenshot_url to avoid uploading
     duplicate files to S3 when the page has not changed.

     ``previous``, ``self`` and ``now`` are not parameters: they are free
     variables resolved outside this function (presumably the enclosing
     screenshot_url scope -- confirm against the caller).

     Returns an ``(image_sha1, image_url)`` tuple: the previous record's
     values when the SHA-1 of the file matches ``previous.image_sha1``,
     otherwise the new hash and the URL returned by ``upload_image``.
     Returns ``None`` when ``upload_image`` returns ``None``.
     """
     # Open in binary mode: the content is hashed byte-for-byte, and text
     # mode can translate line endings on some platforms.  The handle is
     # closed as soon as the hash is computed, before any upload happens.
     with open(tmpfile, 'rb') as fil_ro:
         image_sha1 = hashlib.sha1(fil_ro.read()).hexdigest()

     if previous and previous.image_sha1 == image_sha1:
         # Unchanged since the last capture: reuse the existing upload.
         return (previous.image_sha1, previous.image_url)

     filename = "{hash}/{hash}_{timestamp}.png".format(
         hash=self.url_sha1, timestamp=abbrev_isoformat(now))
     new_url = upload_image(tmpfile, filename, 'image/png')
     if new_url is None:
         return None
     return (image_sha1, new_url)
Exemplo n.º 2
0
 def upload_or_link(tmpfile):
     """Upload the image in ``tmpfile`` unless it matches the previous one.

     This is called from inside screenshot_url to avoid uploading
     duplicate files to S3 when the page has not changed.

     ``previous``, ``self`` and ``now`` are not parameters: they are free
     variables resolved outside this function (presumably the enclosing
     screenshot_url scope -- confirm against the caller).

     Returns an ``(image_sha1, image_url)`` tuple: the previous record's
     values when the SHA-1 of the file matches ``previous.image_sha1``,
     otherwise the new hash and the URL returned by ``upload_image``.
     Returns ``None`` when ``upload_image`` returns ``None``.
     """
     # Open in binary mode: the content is hashed byte-for-byte, and text
     # mode can translate line endings on some platforms.  The handle is
     # closed as soon as the hash is computed, before any upload happens.
     with open(tmpfile, 'rb') as fil_ro:
         image_sha1 = hashlib.sha1(fil_ro.read()).hexdigest()

     if previous and previous.image_sha1 == image_sha1:
         # Unchanged since the last capture: reuse the existing upload.
         return (previous.image_sha1, previous.image_url)

     filename = "{hash}/{hash}_{timestamp}.png".format(
         hash=self.url_sha1, timestamp=abbrev_isoformat(now))
     new_url = upload_image(tmpfile, filename, 'image/png')
     if new_url is None:
         return None
     return (image_sha1, new_url)
def relative_mirror_url(mirror):
    """Build the relative path of the mirrored copy of ``mirror``'s URL.

    The result has the form
    ``{state}/{sha1}/{timestamp}/{netloc}/{path}`` where ``timestamp`` is
    ``mirror.timestamp`` converted to ``settings.TIME_ZONE`` and formatted
    by ``abbrev_isoformat``, and ``netloc``/``path`` come from parsing
    ``mirror.election_url.url``.  If the URL has a query string it is
    appended after a percent-encoded ``?`` (``%3F``).

    A URL that points at the site root (path ``/`` or no path at all) is
    mapped to ``index.html``.
    """
    tz = pytz.timezone(settings.TIME_ZONE)
    timestamp = abbrev_isoformat(mirror.timestamp.astimezone(tz))
    election_url = mirror.election_url
    parsed = urlparse.urlparse(election_url.url)

    # "/" strips down to the empty string, and "http://host" has an empty
    # path to begin with; both mean the site root, so use the index page.
    fixedpath = parsed.path.strip('/') or "index.html"

    path = '{state}/{sha1}/{timestamp}/{netloc}/{path}'.format(
        state=election_url.state,
        sha1=election_url.url_sha1,
        timestamp=timestamp,
        netloc=parsed.netloc,
        path=fixedpath)
    if parsed.query:
        path = path + '%3F' + parsed.query
    return path
def relative_mirror_url(mirror):
    """Build the relative path of the mirrored copy of ``mirror``'s URL.

    The result has the form
    ``{state}/{sha1}/{timestamp}/{netloc}/{path}`` where ``timestamp`` is
    ``mirror.timestamp`` converted to ``settings.TIME_ZONE`` and formatted
    by ``abbrev_isoformat``, and ``netloc``/``path`` come from parsing
    ``mirror.election_url.url``.  If the URL has a query string it is
    appended after a percent-encoded ``?`` (``%3F``).

    A URL that points at the site root (path ``/`` or no path at all) is
    mapped to ``index.html``.
    """
    tz = pytz.timezone(settings.TIME_ZONE)
    timestamp = abbrev_isoformat(mirror.timestamp.astimezone(tz))
    election_url = mirror.election_url
    parsed = urlparse.urlparse(election_url.url)

    # "/" strips down to the empty string, and "http://host" has an empty
    # path to begin with; both mean the site root, so use the index page.
    fixedpath = parsed.path.strip('/') or "index.html"

    path = '{state}/{sha1}/{timestamp}/{netloc}/{path}'.format(
        state=election_url.state,
        sha1=election_url.url_sha1,
        timestamp=timestamp,
        netloc=parsed.netloc,
        path=fixedpath)
    if parsed.query:
        path = path + '%3F' + parsed.query
    return path
Exemplo n.º 5
0
def mirror_url(urlobj):
    """Mirror ``urlobj.url`` with wget into a new timestamped directory.

    The mirror is written to
    ``settings.MIRROR_ROOT/<state>/<url_sha1>/<timestamp>`` and wget's log
    goes to ``wget.log`` in the ``<url_sha1>`` directory.  If the URL
    already has a mirror, that mirror's directory is copied to the new
    location before wget runs (presumably so that ``-N`` only re-fetches
    changed files -- TODO confirm).

    Returns the newly created ``ElectionMirror``.  When ``ProcessTimeout``
    is raised the failure is logged and ``None`` is returned implicitly.
    """
    tz = pytz.timezone(settings.TIME_ZONE)
    now = pytz.datetime.datetime.now(tz=tz)

    mirror_root = os.path.join(settings.MIRROR_ROOT,
                               urlobj.state,
                               urlobj.url_sha1)
    mirror_root = os.path.abspath(mirror_root)
    if not os.path.exists(mirror_root):
        os.makedirs(mirror_root)
    dest_dir = os.path.join(mirror_root, abbrev_isoformat(now))
    log_path = os.path.join(mirror_root, "wget.log")

    # Start from the previous mirror's files when there is one; otherwise
    # make sure the destination directory exists.
    previous = urlobj.latest_mirror()
    if previous:
        copy_dir(previous.dir, dest_dir)
    elif not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # NOTE(review): with no user agent an empty-string argument is still
    # passed along; whether that is harmless depends on how
    # run_subprocess_safely invokes the command.
    if urlobj.user_agent:
        user_agent_arg = "--user-agent='{ua}'".format(ua=urlobj.user_agent)
    else:
        user_agent_arg = ""

    args = ["wget", "--no-verbose", "-p", "--convert-links", "--wait=1",
            "-N", "--random-wait", user_agent_arg]
    args += ["-o", log_path, "-P", dest_dir, urlobj.url]

    try:
        (stdout, stderr) = run_subprocess_safely(args)

        return ElectionMirror.objects.create(election_url=urlobj,
                                             timestamp=now,
                                             dir=dest_dir)
    except ProcessTimeout as exc:
        log.error(u"wget failed to mirror url {url}: {e} {e_type}",
                  url=urlobj.url, e=unicode(exc), e_type=type(exc))
Exemplo n.º 6
0
def mirror_url(urlobj):
    """Fetch a fresh wget mirror of ``urlobj.url`` and record it.

    Files are stored under
    ``settings.MIRROR_ROOT/<state>/<url_sha1>/<timestamp>``, with wget
    logging to ``wget.log`` one level up.  The latest existing mirror (if
    any) is copied into the new directory before wget is started
    (presumably to let ``-N`` skip unchanged files -- TODO confirm).

    Returns the ``ElectionMirror`` created for this run.  On
    ``ProcessTimeout`` an error is logged and the function falls through,
    returning ``None``.
    """
    zone = pytz.timezone(settings.TIME_ZONE)
    now = pytz.datetime.datetime.now(tz=zone)

    root_dir = os.path.abspath(os.path.join(
        settings.MIRROR_ROOT, urlobj.state, urlobj.url_sha1))
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    dest_dir = os.path.join(root_dir, abbrev_isoformat(now))
    log_path = os.path.join(root_dir, "wget.log")

    # Seed the new directory from the last mirror, or create it empty.
    previous = urlobj.latest_mirror()
    if previous:
        copy_dir(previous.dir, dest_dir)
    elif not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # NOTE(review): an empty string is passed as an argument when no user
    # agent is configured; confirm run_subprocess_safely tolerates that.
    user_agent_arg = ""
    if urlobj.user_agent:
        user_agent_arg = "--user-agent='{ua}'".format(ua=urlobj.user_agent)

    args = [
        "wget",
        "--no-verbose",
        "-p",
        "--convert-links",
        "--wait=1",
        "-N",
        "--random-wait",
        user_agent_arg,
        "-o", log_path,
        "-P", dest_dir,
        urlobj.url,
    ]
    try:
        (stdout, stderr) = run_subprocess_safely(args)

        new_mirror = ElectionMirror.objects.create(
            election_url=urlobj, timestamp=now, dir=dest_dir)
        return new_mirror
    except ProcessTimeout as err:
        log.error(u"wget failed to mirror url {url}: {e} {e_type}",
                  url=urlobj.url,
                  e=unicode(err),
                  e_type=type(err))