def backfill_flagged_worker(rows):
    """Parallel worker for backfill_flagged_etags()
    Expects a list of records to work on and commit as a group
    """
    storage = IDigBioStorage()
    with apidbpool.connection(autocommit=False) as conn:
        cur = conn.cursor()
        for row in rows:
            try:
                table = make_temp_table_name(row['prefix'])
                b = storage.get_bucket(
                    row['ceph_bucket']
                )  # two-phase lookup: validate=False in the storage class, so the etag is not populated until the key is fetched
                row["etag"] = b.get_key(row["ceph_name"]).etag[1:-1]
                cur.execute(
                    """UPDATE {0}
                    SET ceph_etag=%(etag)s
                    WHERE ceph_bucket=%(ceph_bucket)s AND ceph_name=%(ceph_name)s
                    """.format(table), row)
            except:
                logger.error(
                    "Failed to update etag for {0}:{1} in {2} {3}".format(
                        row["ceph_bucket"], row["ceph_name"], row["prefix"],
                        traceback.format_exc()))
        conn.commit()  # commit the whole group of updates together
    return 0
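
# Hedged usage sketch (an assumption, not part of the original listing):
# backfill_flagged_etags() is referenced by the docstring above but its body is
# not shown here. The hypothetical driver below illustrates one way groups of
# flagged rows (dicts with 'prefix', 'ceph_bucket' and 'ceph_name' keys) could
# be handed to backfill_flagged_worker(); the chunk size and pool size are
# illustrative only.
def example_backfill_driver(flagged_rows, chunk_size=500, pool_size=8):
    from multiprocessing import Pool
    # Split the flagged rows into groups; each worker commits its group as a unit.
    chunks = [flagged_rows[i:i + chunk_size]
              for i in range(0, len(flagged_rows), chunk_size)]
    pool = Pool(pool_size)
    try:
        return pool.map(backfill_flagged_worker, chunks)
    finally:
        pool.close()
        pool.join()
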
def get_objects_from_ceph():
    local_cur.execute("SELECT etag FROM objects")
    existing_objects = set()
    for r in local_cur:
        existing_objects.add(r[0])

    print len(existing_objects)

    s = IDigBioStorage()
    buckets = ["datasets","images"]
    count = 0
    rowcount = 0
    lrc = 0
    for b_k in buckets:
        b = s.get_bucket("idigbio-" + b_k + "-prod")
        for k in b.list():
            if k.name not in existing_objects:
                try:
                    ks = k.get_contents_as_string(headers={'Range' : 'bytes=0-100'})
                    detected_mime = magic.from_buffer(ks, mime=True)
                    local_cur.execute("INSERT INTO objects (bucket,etag,detected_mime) SELECT %(bucket)s,%(etag)s,%(dm)s WHERE NOT EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)", {"bucket": b_k, "etag": k.name, "dm": detected_mime})
                    existing_objects.add(k.name)
                    rowcount += local_cur.rowcount
                except:
                    print "Ceph Error", b_k, k.name
            count += 1


            if rowcount != lrc and rowcount % 10000 == 0:
                print count, rowcount
                local_pg.commit()
                lrc = rowcount
        print count, rowcount
        local_pg.commit()
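
# Hedged setup sketch (an assumption, not part of the original listing): the
# old-style functions in this listing use module-level `local_pg` / `local_cur`
# globals that are never defined here. They are presumably created roughly like
# this; the connection DSN below is only a placeholder.
import psycopg2

local_pg = psycopg2.connect("dbname=idb_objects")  # placeholder DSN, an assumption
local_cur = local_pg.cursor()
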
def bucket_list_worker(work):
    logger.debug("Listing and inserting prefix {0} from bucket {1}".format(
        work["prefix"], work["bucket"]))

    storage = IDigBioStorage()
    # Read through bucket inserting into temp table
    with apidbpool.connection(
            autocommit=False
    ) as conn:  # use a single connection from the pool to commit groups of statements
        cur = conn.cursor()
        # inserted = 1
        # logger.info("Importing bucket listing for {0}.".format(bucket))
        b = storage.get_bucket(work["bucket"])
        for f in b.list(prefix=work["prefix"]):
            # see backfill_new_etags() for why no etag here
            cur.execute(("INSERT INTO {0} "
                         "(ceph_bucket, ceph_name, ceph_date, ceph_bytes) "
                         "VALUES (%s, %s, %s, %s)").format(
                             make_temp_table_name(work["prefix"])),
                        (work["bucket"], f.name, f.last_modified, f.size))


            # inserted += 1
            # if (inserted % 10000) == 0:
            #     logger.info("Committing {0}".format(inserted))
            #     conn.commit()
        conn.commit()
        return 1
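
# Hedged usage sketch (an assumption, not part of the original listing): one
# plausible way to fan bucket_list_worker() out over a set of key prefixes.
# The prefix list, pool size, and the assumption that the per-prefix temp
# tables already exist are illustrative only.
def example_list_bucket_in_parallel(bucket, prefixes, pool_size=8):
    from multiprocessing import Pool
    work_items = [{"bucket": bucket, "prefix": p} for p in prefixes]
    pool = Pool(pool_size)
    try:
        # Each worker lists one prefix and inserts its keys into the matching temp table.
        return sum(pool.map(bucket_list_worker, work_items))
    finally:
        pool.close()
        pool.join()
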
def set_deriv_from_ceph():
    s = IDigBioStorage()
    b = s.get_bucket("idigbio-images-prod-thumbnail")
    count = 0
    for k in b.list():
        local_cur.execute("UPDATE objects SET derivatives=true WHERE etag=%s", (k.name.split(".")[0],))
        count += 1

        if count % 10000 == 0:
            print count
            local_pg.commit()
    print count
    local_pg.commit()
def get_objects_from_ceph():
    import magic
    existing_objects = set(r[0] for r in apidbpool.fetchiter(
        "SELECT etag FROM objects", cursor_factory=cursor))

    logger.info("Found %d objects", len(existing_objects))

    s = IDigBioStorage()
    buckets = ["datasets", "images"]
    count = 0
    rowcount = 0
    lrc = 0
    with apidbpool.connection() as conn:
        with apidbpool.cursor() as cur:
            for b_k in buckets:
                b = s.get_bucket("idigbio-" + b_k + "-prod")
                for k in b.list():
                    if k.name not in existing_objects:
                        try:
                            ks = k.get_contents_as_string(
                                headers={'Range': 'bytes=0-100'})
                            detected_mime = magic.from_buffer(ks, mime=True)
                            cur.execute(
                                """INSERT INTO objects (bucket,etag,detected_mime)
                                   SELECT %(bucket)s,%(etag)s,%(dm)s
                                   WHERE NOT EXISTS(
                                      SELECT 1 FROM objects WHERE etag=%(etag)s)""",
                                {
                                    "bucket": b_k,
                                    "etag": k.name,
                                    "dm": detected_mime
                                })
                            existing_objects.add(k.name)
                            rowcount += cur.rowcount
                        except:
                            logger.exception(
                                "Ceph Error; bucket:%s keyname:%s", b_k,
                                k.name)
                    count += 1

                    if rowcount != lrc and rowcount % 10000 == 0:
                        logger.info("Count: %8d,  rowcount: %8d", count,
                                    rowcount)

                        conn.commit()
                        lrc = rowcount
                conn.commit()
                logger.info("Count: %8d,  rowcount: %8d  (Finished %s)", count,
                            rowcount, b_k)
def main():
    index_file_name = "index.txt"

    query = {
        "size": 0,
        "aggs": {
            "rs": {
                "terms": {
                    "field": "recordset",
                    "size": 1000
                },
                "aggs": {
                    "ic":{
                        "terms": {
                            "field": "institutioncode",
                            "size": 1000,
                        },
                        "aggs": {
                            "cc": {
                                "terms":{
                                    "field": "collectioncode",
                                    "size": 1000,
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    r = requests.post("http://search.idigbio.org/idigbio/records/_search",
                      data=json.dumps(query),
                      headers={"Content-Type": "application/json"})
    r.raise_for_status()
    ro = r.json()

    recordsets = {}
    for rs_b in ro["aggregations"]["rs"]["buckets"]:
        rsid = rs_b["key"]
        ic = ""
        cc = ""
        if len(rs_b["ic"]["buckets"]) == 0:
            ic = ""
            cc = ""
        elif len(rs_b["ic"]["buckets"]) == 1 or (
                float(rs_b["ic"]["buckets"][0]["doc_count"]) / float(rs_b["doc_count"]) > 0.9
            ):
            ic_b = rs_b["ic"]["buckets"][0]
            ic = get_true_ic(ic_b["key"])
            if len(ic_b["cc"]["buckets"]) == 0:
                cc = ""
            elif len(ic_b["cc"]["buckets"]) == 1:
                cc = ic_b["cc"]["buckets"][0]["key"]
            else:
                cc = "MULTIPLE"
        else:
            # print(rs_b)
            ic = "MULTIPLE"
            cc = "MULTIPLE"
        recordsets[rsid] = {
            "institutioncode": ic,
            "collectioncode": cc
        }

    s = IDigBioStorage()
    b = s.get_bucket("idigbio-static-downloads")

    headers = ["zipfile","emlfile","etag","modified","recordset_id", "institutioncode", "collectioncode"]
    files = {}

    for k in b.list():
        # Skip the index itself
        if k.name == index_file_name:
            continue

        # Skip files whose last-modified date is more than 7 days old
        lm_d = dateutil.parser.parse(k.last_modified).date()
        if lm_d < (datetime.datetime.now() - datetime.timedelta(7)).date():
            continue

        fkey = k.name.split(".")[0]
        if fkey not in files:
            files[fkey] = {k:"" for k in headers}

        if k.name.endswith(".eml"):
            files[fkey]["emlfile"] = k.name
        elif k.name.endswith(".zip"):
            files[fkey]["zipfile"] = k.name
            files[fkey]["modified"] = k.last_modified
            files[fkey]["etag"] = k.etag
            if is_uuid(fkey):
                files[fkey]["recordset_id"] = fkey
                if fkey in recordsets:
                    files[fkey]["institutioncode"] = recordsets[fkey]["institutioncode"]
                    files[fkey]["collectioncode"] = recordsets[fkey]["collectioncode"]
                else:
                    files[fkey]["institutioncode"] = ""
                    files[fkey]["collectioncode"] = ""

    fil = StringIO()

    cw = csv.writer(fil,delimiter="\t")

    cw.writerow(headers)
    for k in files:
        if files[k]["zipfile"] != "":
            cw.writerow([files[k][h].replace("\"","") for h in headers])

    fil.seek(0)

    ik = b.get_key(index_file_name,validate=False)
    ik.content_type = 'text/tsv'
    ik.set_contents_from_file(fil)
    ik.make_public()
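
# Hedged usage sketch (an assumption, not part of the original listing): reading
# back the tab-delimited index.txt that main() uploads to the
# idigbio-static-downloads bucket, using the same IDigBioStorage wrapper.
def example_read_index():
    import csv
    s = IDigBioStorage()
    b = s.get_bucket("idigbio-static-downloads")
    contents = b.get_key("index.txt").get_contents_as_string()
    # Rows come back as dicts keyed by the header row written in main().
    return list(csv.DictReader(contents.splitlines(), delimiter="\t"))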