예제 #1
0
def create_tables():
    """
    This function is out-of-sync with actual database, unmaintained.
    Commenting out all action in this function, it will do nothing until modified again.
    """

    db = PostgresDB()
    logger.error('create_tables called but has no valid code to run.')

    # db.execute("""CREATE TABLE IF NOT EXISTS publishers (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     pub_type varchar(20) NOT NULL DEFAULT 'rss',
    #     portal_url text,
    #     rss_url text NOT NULL,
    #     auto_publish boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp
    # )""")

    # #pubid, rsid  Ingest, rs_record_id, eml_link, file_link, First Seen Date, Last Seen Date, Feed Date, Harvest Date, Harvest Etag
    # db.execute("""CREATE TABLE IF NOT EXISTS recordsets (
    #     id BIGSERIAL NOT NULL PRIMARY KEY,
    #     uuid uuid UNIQUE,
    #     publisher_uuid uuid REFERENCES publishers(uuid),
    #     name text NOT NULL,
    #     recordids text[] NOT NULL DEFAULT '{}',
    #     eml_link text,
    #     file_link text NOT NULL,
    #     ingest boolean NOT NULL DEFAULT false,
    #     first_seen timestamp NOT NULL DEFAULT now(),
    #     last_seen timestamp NOT NULL DEFAULT now(),
    #     pub_date timestamp,
    #     harvest_date timestamp,
    #     harvest_etag varchar(41)
    # )""")
    # db.commit()
    db.close()
예제 #2
0
def process_file(fname,
                 mime,
                 rsid,
                 existing_etags,
                 existing_ids,
                 ingest=False,
                 commit_force=False,
                 ispaused=False):
    rlogger = getrslogger(rsid)
    rlogger.info("Processing %s, type: %s", fname, mime)
    counts = {}
    t = datetime.datetime.now()
    filehash = calcFileHash(fname)
    db = PostgresDB()
    commited = False

    try:
        if mime == "application/zip":
            dwcaobj = Dwca(fname, skipeml=True, logname="idb")
            for dwcrf in dwcaobj.extensions:
                rlogger.debug("Processing %r", dwcrf.name)
                counts[dwcrf.name] = process_subfile(dwcrf,
                                                     rsid,
                                                     existing_etags,
                                                     existing_ids,
                                                     ingest=ingest,
                                                     db=db)
                dwcrf.close()
            rlogger.debug("processing core %r", dwcaobj.core.name)
            counts[dwcaobj.core.name] = process_subfile(dwcaobj.core,
                                                        rsid,
                                                        existing_etags,
                                                        existing_ids,
                                                        ingest=ingest,
                                                        db=db)
            dwcaobj.core.close()
            dwcaobj.close()
        elif mime == "text/plain":
            commas = False
            with open(fname, "rb") as testf:
                commas = "," in testf.readline()

            if commas:
                csvrf = DelimitedFile(fname, logname="idigbio")
                counts[fname] = process_subfile(csvrf,
                                                rsid,
                                                existing_etags,
                                                existing_ids,
                                                ingest=ingest,
                                                db=db)
            else:
                tsvrf = DelimitedFile(fname,
                                      delimiter="\t",
                                      fieldenc=None,
                                      logname="idigbio")
                counts[fname] = process_subfile(tsvrf,
                                                rsid,
                                                existing_etags,
                                                existing_ids,
                                                ingest=ingest,
                                                db=db)

        if ingest:
            commit_ok = commit_force

            type_commits = []
            for k in counts:
                if k not in ingestion_types:
                    continue
                if (counts[k]["create"] /
                        float(counts[k]["processed_line_count"]) >= 0.5
                        and counts[k]["delete"] /
                        float(counts[k]["processed_line_count"]) >= 0.5):
                    type_commits.append(True)
                else:
                    type_commits.append(False)

            commit_ok = all(type_commits)

            if commit_ok:
                rlogger.info("Ready to Commit")
                db.commit()
                commited = True
            else:
                rlogger.error("Rollback")
                db.rollback()
        else:
            db.rollback()
        db.close()
    except Exception:
        logger.exception(
            "Unhandled Exception when processing {0}".format(fname))
        db.rollback()
        db.close()

    # Clear after processing an archive
    unconsumed_extensions.clear()
    core_siblings.clear()

    return {
        "name": fname,
        "filemd5": filehash,
        "recordset_id": rsid,
        "counts": counts,
        "processing_start_datetime": t.isoformat(),
        "total_processing_time": (datetime.datetime.now() - t).total_seconds(),
        "commited": commited,
        "paused": ispaused
    }