Example #1
def viewSingleRecord(cls, record, research_mode):
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    view_checker.regValue(0, record)
    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    if not is_ok:
        logging.error("Single record annotation failed:\n" +
                      rep_out.getvalue())
    assert is_ok
    aspects = AspectSetH.load(view_aspects.dump())
    return aspects.getViewRepr(record, research_mode)
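A minimal sketch of how this helper might be called, assuming it is a classmethod on a dataset handler class (named DataSetH here purely for illustration) and that a record in the project's annotated format is read from a file; the class name, file path and flag value are assumptions, not part of the example above.

import json

# Hypothetical usage: "DataSetH" and "record.json" are placeholders; the file
# is expected to hold one record in the project's annotated-variant format.
with open("record.json", encoding="utf-8") as inp:
    record = json.load(inp)
view_repr = DataSetH.viewSingleRecord(record, research_mode=True)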
Example #2
def createDataSet(app_config, name, kind, mongo, source, report_lines):
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print >> sys.stderr, "No vault directory:", vault_dir
        assert False
    checkDSName(name, kind)
    ds_dir = vault_dir + "/" + name
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print >> sys.stderr, "Dataset exists:", ds_dir
        assert False
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()

    if report_lines:
        print >> sys.stderr, "Processing..."

    data_rec_no = 0
    fdata_out = gzip.open(ds_dir + "/fdata.json.gz", 'wb')
    pdata_out = gzip.open(ds_dir + "/pdata.json.gz", 'wb')
    input = JsonLineReader(source)
    metadata_record = None
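    # Main pass over the input: register every record with the view checker,
    # archive its view data, and emit filter/presentation JSON lines.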
    with FormatterIndexBZ2(ds_dir + "/vdata.ixbz2") as vdata_out:
        for inp_rec_no, record in enumerate(input):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                assert inp_rec_no == 0
                metadata_record = record
                filter_set.setMeta(metadata_record)
                continue
            view_checker.regValue(data_rec_no, record)
            vdata_out.putLine(json.dumps(record, ensure_ascii=False))
            flt_data = filter_set.process(data_rec_no, record)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            print >> fdata_out, json.dumps(flt_data, ensure_ascii=False)
            print >> pdata_out, json.dumps(pre_data, ensure_ascii=False)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                print >> sys.stderr, "\r%d lines..." % data_rec_no,
    if report_lines:
        print >> sys.stderr, "\nTotal lines: %d" % data_rec_no
    input.close()
    fdata_out.close()
    pdata_out.close()

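    # Finalize validation: collect problems reported by the view checker
    # and by the filter set.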
    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

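    # Dump the filter schema; XL datasets are additionally uploaded to Druid.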
    flt_data = filter_set.dump()
    if kind == "xl":
        is_ok &= DRUID_ADM.uploadDataset(
            name, flt_data, os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

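    # On success, write the dataset descriptor, statistics and the
    # activation marker.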
    if is_ok:
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_data,
            "total": data_rec_no,
            "mongo": mongo,
            "family": filter_set.getFamilyInfo().dump(),
            "meta": metadata_record
        }
        with codecs.open(ds_dir + "/dsinfo.json", "w",
                         encoding="utf-8") as outp:
            print >> outp, json.dumps(ds_info, sort_keys=True, indent=4)

        with codecs.open(ds_dir + "/stat.json", "w", encoding="utf-8") as outp:
            print >> outp, json.dumps(view_checker.dump(),
                                      sort_keys=True,
                                      indent=4)

        with codecs.open(ds_dir + "/active", "w", encoding="utf-8") as outp:
            print >> outp, ""
        print >> rep_out, "Dataset %s kind=%s succesively created" % (name,
                                                                      kind)
    else:
        print >> rep_out, "Process terminated"

    with codecs.open(ds_dir + "/create.log", "w", encoding="utf-8") as outp:
        print >> outp, rep_out.getvalue()

    print >> sys.stdout, rep_out.getvalue()
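A hedged sketch of a possible call site for this older, Python 2 variant: only the "data-vault" key is read from app_config in this version, and the vault path, dataset name and source file below are invented for illustration.

# Hypothetical call: build a small "ws" dataset (no Druid, so DRUID_ADM stays None).
app_config = {"data-vault": "/data/vault"}
createDataSet(app_config, name="demo_ws", kind="ws", mongo=None,
              source="/data/import/demo.json.gz", report_lines=1000)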
Example #3
def createDataSet(app_config, name, kind, mongo,
        source, ds_inventory, report_lines, date_loaded):
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print("No vault directory:", vault_dir, file = sys.stderr)
        assert False
    checkDSName(name, kind)
    ds_dir = os.path.abspath(vault_dir + "/" + name)
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print("Dataset exists:", ds_dir, file = sys.stderr)
        assert False
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    anfisa_version = AnfisaConfig.getAnfisaVersion()
    prepareSolutions()
    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()

    if kind == "ws":
        trans_prep = TranscriptPreparator(
            filter_set.getTranscriptDescrSeq(), True)
    else:
        trans_prep = None

    if report_lines:
        print("Processing...", file = sys.stderr)

    data_rec_no = 0
    metadata_record = None

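    # Compress view data through the external utils.ixbz2 writer, fed via stdin.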
    vdata_out = Popen(sys.executable + " -m utils.ixbz2 --calm -o " +
        ds_dir + "/vdata.ixbz2 /dev/stdin", shell = True,
        stdin = PIPE, stderr = PIPE,
        bufsize = 1, universal_newlines = False, # line buffer
        close_fds = True)

    vdata_stdin = TextIOWrapper(vdata_out.stdin, encoding = "utf-8",
        line_buffering = True)

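    # Main pass over the input: validate every record and stream its view,
    # filter and presentation data.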
    with    gzip.open(ds_dir + "/fdata.json.gz", 'wb') as fdata_stream, \
            gzip.open(ds_dir + "/pdata.json.gz", 'wb') as pdata_stream, \
            JsonLineReader(source) as input:
        fdata_out = TextIOWrapper(fdata_stream,
            encoding = "utf-8", line_buffering = True)
        pdata_out = TextIOWrapper(pdata_stream,
            encoding = "utf-8", line_buffering = True)
        for inp_rec_no, record in enumerate(input):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                assert inp_rec_no == 0
                metadata_record = record
                if "versions" in metadata_record:
                    metadata_record["versions"]["Anfisa load"] = anfisa_version
                filter_set.setMeta(metadata_record)
                continue
            flt_data = filter_set.process(data_rec_no, record)
            view_checker.regValue(data_rec_no, record)
            print(json.dumps(record, ensure_ascii = False), file = vdata_stdin)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            if trans_prep is not None:
                trans_prep.doRec(record, flt_data)
            print(json.dumps(flt_data, ensure_ascii = False), file = fdata_out)
            print(json.dumps(pre_data, ensure_ascii = False), file = pdata_out)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                sys.stderr.write("\r%d lines..." % data_rec_no)
    if report_lines:
        print("\nTotal lines: %d" % data_rec_no, file = sys.stderr)

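    # Finish the ixbz2 subprocess and forward its report to stderr.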
    _, vreport_data = vdata_out.communicate()
    for line in str(vreport_data, encoding="utf-8").splitlines():
        print(line, file = sys.stderr)
    vdata_out.wait()

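    # Finalize validation: collect problems reported by the view checker
    # and by the filter set.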
    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

    if trans_prep is not None:
        total_item_count = trans_prep.finishUp()
    else:
        total_item_count = None

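    # Dump the filter schema; XL datasets are additionally uploaded to Druid.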
    flt_schema_data = filter_set.dump()
    if kind == "xl":
        is_ok &= DRUID_ADM.uploadDataset(name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

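    # On success, write the dataset descriptor, statistics and documentation,
    # register the dataset in Mongo and create the activation marker.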
    if is_ok:
        ds_doc_dir = ds_dir + "/doc"
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_schema_data,
            "total": data_rec_no,
            "mongo": mongo,
            "modes": [],
            "meta": metadata_record,
            "doc": prepareDocDir(ds_doc_dir, ds_inventory),
            "date_loaded": date_loaded}

        if total_item_count is not None:
            ds_info["total_items"] = total_item_count

        with open(ds_dir + "/dsinfo.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(ds_info, sort_keys = True, indent = 4),
                file = outp)

        with open(ds_dir + "/stat.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(view_checker.dump(), sort_keys = True,
                indent = 4), file = outp)

        mongo_conn = MongoConnector(app_config["mongo-db"],
            app_config.get("mongo-host"), app_config.get("mongo-port"))
        mongo_agent = mongo_conn.getDSAgent(name, kind)
        mongo_agent.checkCreationDate(date_loaded, source)

        with open(ds_dir + "/doc/info.html", "w", encoding = "utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)

        with open(ds_dir + "/active", "w", encoding = "utf-8") as outp:
            print("", file = outp)
        print("Dataset %s kind=%s succesively created" % (
            name, kind), file = rep_out)
    else:
        print("Process terminated", file = rep_out)

    with open(ds_dir + "/create.log",
            "w", encoding = "utf-8") as outp:
        print(rep_out.getvalue(), file = outp)

    print(rep_out.getvalue())
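A hedged sketch of a possible call site for this variant: the app_config keys shown ("data-vault", "mongo-db", "mongo-host", "mongo-port") are the ones the function reads, while the concrete values, paths, dataset name and the use of None for ds_inventory are assumptions made for illustration.

from datetime import datetime

# Hypothetical call: create a workspace ("ws") dataset, so DRUID_ADM stays None.
app_config = {
    "data-vault": "/data/vault",
    "mongo-db": "anfisa",
    "mongo-host": "localhost",
    "mongo-port": 27017,
}
createDataSet(app_config, name="demo_ws", kind="ws", mongo=None,
    source="/data/import/demo.json.gz", ds_inventory=None,
    report_lines=1000, date_loaded=datetime.now().isoformat())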