Example #1
def last_crx(archivedir, extid, date=None):
    last_crx_path = ""
    last_crx_etag = ""

    etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
                             extid + ".etag")
    if date is None and os.path.exists(etag_file):
        try:
            with open(etag_file, 'r') as f:
                d = json.load(f)
                return d["last_crx"], d["last_crx_etag"]
        except Exception:
            log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file))
            try:
                os.remove(etag_file)
            except Exception:
                log_exception("Could not remove etag file {}!".format(etag_file))

    # If no .etag file is present yet, open the tar archive and look for the
    # etag there. Once that has been done, the crawler writes the .etag file
    # so that future lookups avoid opening the tar archive again.
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            # Each member lives under <extid>/<iso-date>/..., so the parent
            # directory name of an entry is the date it was archived.
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1]) <= date))
            ])
            if old_crxs:
                last_crx_path = old_crxs[-1]
                # The .headers file stores a Python dict repr, not JSON:
                # escape embedded double quotes, then turn the single quotes
                # into double quotes so json.loads can parse it.
                headers_content = t.extractfile(
                    last_crx_path + ".headers").read().decode().replace(
                        '"', '\\"').replace("'", '"')
                headers_json = json.loads(headers_content)
                last_crx_etag = headers_json["ETag"]

                if date is None:
                    with open(etag_file, 'w') as f:
                        json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)

    return last_crx_path, last_crx_etag
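
A minimal usage sketch; the archive root and extension id below are hypothetical placeholders, and `get_local_archive_dir` plus the on-disk layout come from the surrounding module:

import datetime

# Hypothetical archive root and 32-character extension id.
archivedir = "/srv/extension-archive"
extid = "a" * 32

crx_path, crx_etag = last_crx(archivedir, extid)

# Only consider versions archived on or before this cutoff; archived
# dates are UTC ISO timestamps, so the cutoff must be timezone-aware.
cutoff = datetime.datetime(2018, 1, 1, tzinfo=datetime.timezone.utc)
crx_path, crx_etag = last_crx(archivedir, extid, date=cutoff)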
Example #2
def all_crx(archivedir, extid, date=None):
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    all_crxs = []
    if os.path.exists(tar):
        # Use a context manager so the tar file is closed even on error.
        with tarfile.open(tar, 'r') as t:
            all_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0
            ])
    return all_crxs
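
Note that the `date` parameter is accepted but unused here; every non-empty `.crx` entry is returned in sorted path order. A sketch with the same placeholders as above:

for name in all_crx(archivedir, extid):
    print(name)  # paths of the form "<extid>/<iso-date>/<file>.crx"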
Example #3
def first_crx(archivedir, extid, date=None):
    first_crx_path = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (date <= dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1])))
            ])
        if old_crxs:
            first_crx_path = old_crxs[0]

    return first_crx_path
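
The mirror image of `last_crx`: with a `date`, it returns the earliest `.crx` archived on or after that date. A sketch under the same assumptions:

path = first_crx(archivedir, extid, date=cutoff)
if path:
    print("first matching crx:", path)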
Example #4
def iter_tar_entries_from_file_ext(archivedir, extid, ext):
    tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
    with tarfile.open(tar, 'r') as tf:
        for tarentry in tf:
            if tarentry.isfile():
                yield (tarentry, tf.extractfile(tarentry))
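
A sketch that streams entries out of the extension's archive; passing ext=".tar" selects the main tar file, and each yielded handle is a file-like object:

for entry, fileobj in iter_tar_entries_from_file_ext(archivedir, extid, ".tar"):
    if entry.name.endswith(".etag"):
        print(entry.name, fileobj.read().decode())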
Example #5
def update_extension(tup):
    archivedir, con, ext_id, forums = tup
    update_db = False
    set_logger_tag(ext_id)
    log_info("Updating extension {}".format(" (including forums)" if forums else ""), 1)
    is_new = False
    tar_exception = None
    sql_exception = None
    sql_success = False
    tmptardir = ""
    start = time.time()

    date = datetime.datetime.now(datetime.timezone.utc).isoformat()

    tardir = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id)
    tar = tardir + ".tar"

    try:
        tmpdir = tempfile.mkdtemp()
        tmptardir = os.path.join(tmpdir, ext_id)
        log_info("* tmptardir = {}".format(tmptardir), 2)
        os.makedirs(
            os.path.join(archivedir, get_local_archive_dir(ext_id)),
            exist_ok=True)
    except Exception as e:
        log_exception("* FATAL: cannot create tmpdir", 3)
        tar_exception = e
        return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
                            None, sql_exception, False)

    res_overview = update_overview(tmptardir, date, ext_id)
    res_reviews = None
    res_support = None
    if forums:
        res_reviews = update_reviews(tmptardir, date, ext_id)

    res_crx = update_crx(archivedir, tmptardir, ext_id, date)

    if forums:
        res_support = update_support(tmptardir, date, ext_id)

    backup = False
    if backup:
        try:
            os.sync()
            # Rotate the previous backup to a dated name, then discard it:
            # only one rolling .bak.tar is kept on disk.
            if os.path.exists(tardir + ".bak.tar"):
                shutil.move(tardir + ".bak.tar",
                            tardir + ".bak." + date + ".tar")
                os.remove(tardir + ".bak." + date + ".tar")
        except Exception:
            pass

        try:
            if os.path.exists(tar):
                shutil.copyfile(tar, tardir + ".bak.tar")
        except Exception as e:
            log_exception("* FATAL: cannot rename old tar archive", 3)
            tar_exception = e
            try:
                write_text(tardir, date, ext_id + ".tar.rename.exception",
                           traceback.format_exc())
            except Exception:
                pass

    if not os.path.exists(tar):
        is_new = True
    try:
        # Time only the tar append; "start" keeps the overall duration.
        tar_start = time.time()
        with tarfile.open(tar, mode='a:') as ar:
            ar.add(tmptardir, arcname=ext_id)
        log_info("* Appending new data to tar took {:.2f}s".format(
            time.time() - tar_start), 2)
    except Exception as e:
        log_exception("* FATAL: cannot create tar archive", 3)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".tar.create.exception",
                       traceback.format_exc())
        except Exception:
            pass
    if update_db:
        try:
            update_db_incremental(tmptardir, ext_id, date, con)
            sql_success = True
        except Exception as e:
            log_exception("* Exception during update of db", 3)
            sql_exception = e

            try:
                write_text(tardir, date, ext_id + ".sql.exception",
                           traceback.format_exc())
            except Exception:
                pass
    else:
        log_info("* DB Update disabled")
        
    try:
        shutil.rmtree(path=tmpdir)
    except Exception as e:
        log_exception("* FATAL: cannot remove archive directory", 3)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".dir.remove.exception",
                       traceback.format_exc())
        except Exception:
            pass

    log_info("* Duration: {}".format(datetime.timedelta(seconds=int(time.time() - start))), 2)
    return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                        res_reviews, res_support, sql_exception, sql_success)
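
Because the function takes its arguments as a single tuple, it fans out naturally over a process pool. A hedged sketch: `con` is whatever database handle `update_db_incremental` expects, and passing None is only safe while `update_db` stays disabled:

from multiprocessing import Pool

ext_ids = ["a" * 32, "b" * 32]  # hypothetical extension ids
jobs = [(archivedir, None, ext_id, False) for ext_id in ext_ids]

with Pool(4) as pool:
    results = pool.map(update_extension, jobs)
for res in results:
    print(res)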
Example #6
def update_crx(archivedir, tmptardir, ext_id, date):
    res = None
    extfilename = "default_ext_archive.crx"
    last_crx_file, last_crx_etag = last_crx(archivedir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    headers = ""
    if last_crx_file is not "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        log_info("* Checking If-Modified-Since", 2)
        with request_manager.normal_request():
            res = requests.get(
                const_download_url().format(ext_id),
                stream=True,
                headers=headers,
                timeout=10)
        log_info("* crx archive (Last: {}): {}".format(value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2)
        extfilename = os.path.basename(res.url)
        if re.search('&', extfilename):
            extfilename = "default.crx"

        if res.status_code == 304:
            with request_manager.normal_request():
                etag = requests.head(
                    const_download_url().format(ext_id),
                    timeout=10,
                    allow_redirects=True).headers.get('ETag')
            write_text(tmptardir, date, extfilename + ".etag", etag)
            log_info("- checking etag, last: {}".format(last_crx_etag), 3)
            log_info("              current: {}".format(etag), 3)

            if (etag is not "") and (etag != last_crx_etag):
                log_info("- downloading due to different etags", 3)

                with request_manager.normal_request():
                    res = requests.get(
                        const_download_url().format(ext_id),
                        stream=True,
                        timeout=10)
            else:
                write_text(tmptardir, date, extfilename + ".link",
                           os.path.join("..",
                                        last_modified_utc_date(last_crx_file),
                                        extfilename) + "\n")
        store_request_metadata(tmptardir, date, extfilename, res)
        if res.status_code == 200:
            validate_crx_response(res, ext_id, extfilename)
            with open(os.path.join(tmptardir, date, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            write_text(tmptardir, date, extfilename + ".etag",
                       res.headers.get("ETag"))
            etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag")
            with open(etag_file, 'w') as f:
                json.dump({
                              "last_crx": os.path.join(ext_id, date, extfilename),
                              "last_crx_etag": res.headers.get("ETag")
                          }, f)
    except Exception as e:
        log_exception("Exception when updating crx", 3)
        write_text(tmptardir, date, extfilename + ".exception",
                   traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)
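
`update_crx` is normally driven by `update_extension`, which supplies a fresh UTC date stamp and a per-run temporary directory. A standalone sketch under the same assumptions (the module's `request_manager` and download URL must be configured):

import datetime
import os
import tempfile

date = datetime.datetime.now(datetime.timezone.utc).isoformat()
tmptardir = os.path.join(tempfile.mkdtemp(), extid)
result = update_crx(archivedir, tmptardir, extid, date)  # RequestResult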