def viewSingleRecord(cls, record, research_mode):
    # Render a single annotated record through the standard view schema,
    # validating it with ViewDataChecker before building the representation
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    view_checker.regValue(0, record)
    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    if not is_ok:
        logging.error("Single record annotation failed:\n"
            + rep_out.getvalue())
    assert is_ok
    aspects = AspectSetH.load(view_aspects.dump())
    return aspects.getViewRepr(record, research_mode)
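# Hedged usage sketch (not part of the original module): viewSingleRecord is
# written in classmethod style but never uses "cls", so None can stand in for
# it here. The file path is hypothetical; research_mode=True requests the
# full research-level view of the record.
def _demoViewSingleRecord(record_path):
    with open(record_path, "r", encoding = "utf-8") as inp:
        record = json.load(inp)  # one annotated variant record
    return viewSingleRecord(None, record, research_mode = True)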
# Python 2 variant of createDataSet (kept alongside the Python 3 revision
# below): builds a dataset directory under the vault, writes view, filter and
# presentation data, and records the outcome in create.log.
def createDataSet(app_config, name, kind, mongo, source, report_lines):
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print >> sys.stderr, "No vault directory:", vault_dir
        assert False
    checkDSName(name, kind)
    ds_dir = vault_dir + "/" + name
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print >> sys.stderr, "Dataset exists:", ds_dir
        assert False
    # Only "xl" datasets use the Druid administration agent
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()
    if report_lines:
        print >> sys.stderr, "Processing..."

    # Single pass over the source: the leading metadata record configures the
    # filter schema; every data record is registered with the view checker and
    # written to the view archive, filter data and presentation data files.
    data_rec_no = 0
    fdata_out = gzip.open(ds_dir + "/fdata.json.gz", 'wb')
    pdata_out = gzip.open(ds_dir + "/pdata.json.gz", 'wb')
    input = JsonLineReader(source)
    metadata_record = None
    with FormatterIndexBZ2(ds_dir + "/vdata.ixbz2") as vdata_out:
        for inp_rec_no, record in enumerate(input):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                assert inp_rec_no == 0
                metadata_record = record
                filter_set.setMeta(metadata_record)
                continue
            view_checker.regValue(data_rec_no, record)
            vdata_out.putLine(json.dumps(record, ensure_ascii=False))
            flt_data = filter_set.process(data_rec_no, record)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            print >> fdata_out, json.dumps(flt_data, ensure_ascii=False)
            print >> pdata_out, json.dumps(pre_data, ensure_ascii=False)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                print >> sys.stderr, "\r%d lines..." % data_rec_no,
    if report_lines:
        print >> sys.stderr, "\nTotal lines: %d" % data_rec_no
    input.close()
    fdata_out.close()
    pdata_out.close()

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

    flt_data = filter_set.dump()
    if kind == "xl":
        # Extra-large datasets are additionally uploaded to Druid
        is_ok &= DRUID_ADM.uploadDataset(name, flt_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

    if is_ok:
        # Persist the dataset descriptor and view statistics, then mark the
        # dataset as active
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_data,
            "total": data_rec_no,
            "mongo": mongo,
            "family": filter_set.getFamilyInfo().dump(),
            "meta": metadata_record}
        with codecs.open(ds_dir + "/dsinfo.json", "w",
                encoding="utf-8") as outp:
            print >> outp, json.dumps(ds_info, sort_keys=True, indent=4)
        with codecs.open(ds_dir + "/stat.json", "w",
                encoding="utf-8") as outp:
            print >> outp, json.dumps(view_checker.dump(),
                sort_keys=True, indent=4)
        with codecs.open(ds_dir + "/active", "w",
                encoding="utf-8") as outp:
            print >> outp, ""
        print >> rep_out, "Dataset %s kind=%s successfully created" % (
            name, kind)
    else:
        print >> rep_out, "Process terminated"

    with codecs.open(ds_dir + "/create.log", "w",
            encoding="utf-8") as outp:
        print >> outp, rep_out.getvalue()
    print >> sys.stdout, rep_out.getvalue()
# Python 3 revision of createDataSet: adds solution preparation, transcript
# preparation for workspace ("ws") datasets, an external ixbz2 writer process,
# document directory handling, and MongoDB bookkeeping.
def createDataSet(app_config, name, kind, mongo, source, ds_inventory,
        report_lines, date_loaded):
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print("No vault directory:", vault_dir, file = sys.stderr)
        assert False
    checkDSName(name, kind)
    ds_dir = os.path.abspath(vault_dir + "/" + name)
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print("Dataset exists:", ds_dir, file = sys.stderr)
        assert False
    # Only "xl" datasets use the Druid administration agent
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    anfisa_version = AnfisaConfig.getAnfisaVersion()
    prepareSolutions()

    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()

    if kind == "ws":
        trans_prep = TranscriptPreparator(
            filter_set.getTranscriptDescrSeq(), True)
    else:
        trans_prep = None

    if report_lines:
        print("Processing...", file = sys.stderr)

    data_rec_no = 0
    metadata_record = None

    # The indexed view archive is written by a child process running
    # utils.ixbz2, fed line by line through its stdin
    vdata_out = Popen(sys.executable + " -m utils.ixbz2 --calm -o "
        + ds_dir + "/vdata.ixbz2 /dev/stdin", shell = True,
        stdin = PIPE, stderr = PIPE,
        bufsize = 1, universal_newlines = False,  # line buffer
        close_fds = True)
    vdata_stdin = TextIOWrapper(vdata_out.stdin,
        encoding = "utf-8", line_buffering = True)

    with gzip.open(ds_dir + "/fdata.json.gz", 'wb') as fdata_stream, \
            gzip.open(ds_dir + "/pdata.json.gz", 'wb') as pdata_stream, \
            JsonLineReader(source) as input:
        fdata_out = TextIOWrapper(fdata_stream,
            encoding = "utf-8", line_buffering = True)
        pdata_out = TextIOWrapper(pdata_stream,
            encoding = "utf-8", line_buffering = True)
        for inp_rec_no, record in enumerate(input):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                # The metadata record must come first; it configures the
                # filter schema and records the Anfisa version
                assert inp_rec_no == 0
                metadata_record = record
                if "versions" in metadata_record:
                    metadata_record["versions"]["Anfisa load"] = \
                        anfisa_version
                filter_set.setMeta(metadata_record)
                continue
            flt_data = filter_set.process(data_rec_no, record)
            view_checker.regValue(data_rec_no, record)
            print(json.dumps(record, ensure_ascii = False),
                file = vdata_stdin)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            if trans_prep is not None:
                trans_prep.doRec(record, flt_data)
            print(json.dumps(flt_data, ensure_ascii = False),
                file = fdata_out)
            print(json.dumps(pre_data, ensure_ascii = False),
                file = pdata_out)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                sys.stderr.write("\r%d lines..." % data_rec_no)
    if report_lines:
        print("\nTotal lines: %d" % data_rec_no, file = sys.stderr)

    # Collect the ixbz2 writer's diagnostics and wait for it to finish
    _, vreport_data = vdata_out.communicate()
    for line in str(vreport_data, encoding = "utf-8").splitlines():
        print(line, file = sys.stderr)
    vdata_out.wait()

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

    if trans_prep is not None:
        total_item_count = trans_prep.finishUp()
    else:
        total_item_count = None

    flt_schema_data = filter_set.dump()
    if kind == "xl":
        is_ok &= DRUID_ADM.uploadDataset(name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

    if is_ok:
        # Persist the dataset descriptor, view statistics and documentation,
        # register the creation date in Mongo, then mark the dataset active
        ds_doc_dir = ds_dir + "/doc"
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_schema_data,
            "total": data_rec_no,
            "mongo": mongo,
            "modes": [],
            "meta": metadata_record,
            "doc": prepareDocDir(ds_doc_dir, ds_inventory),
            "date_loaded": date_loaded}
        if total_item_count is not None:
            ds_info["total_items"] = total_item_count
        with open(ds_dir + "/dsinfo.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(ds_info, sort_keys = True, indent = 4),
                file = outp)
        with open(ds_dir + "/stat.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(view_checker.dump(),
                sort_keys = True, indent = 4), file = outp)
        mongo_conn = MongoConnector(app_config["mongo-db"],
            app_config.get("mongo-host"), app_config.get("mongo-port"))
        mongo_agent = mongo_conn.getDSAgent(name, kind)
        mongo_agent.checkCreationDate(date_loaded, source)
        with open(ds_dir + "/doc/info.html", "w", encoding = "utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)
        with open(ds_dir + "/active", "w", encoding = "utf-8") as outp:
            print("", file = outp)
        print("Dataset %s kind=%s successfully created" % (name, kind),
            file = rep_out)
    else:
        print("Process terminated", file = rep_out)

    with open(ds_dir + "/create.log", "w", encoding = "utf-8") as outp:
        print(rep_out.getvalue(), file = outp)
    print(rep_out.getvalue())
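# Hedged usage sketch (not part of the original module): how the Python 3
# createDataSet above might be driven from a small wrapper. The config path,
# dataset name and source file are hypothetical; the app_config keys
# ("data-vault", "mongo-db", ...) are the ones the function itself reads.
if __name__ == "__main__":
    from datetime import datetime

    with open("anfisa.json", "r", encoding = "utf-8") as cfg_inp:
        app_config = json.load(cfg_inp)

    createDataSet(app_config,
        name = "ws_demo",                 # hypothetical dataset name
        kind = "ws",                      # "ws" workspace; "xl" needs DRUID_ADM
        mongo = None,                     # defaults to the dataset name
        source = "demo_annotated.json",   # hypothetical annotated JSON-lines file
        ds_inventory = None,
        report_lines = 1000,              # progress message every 1000 records
        date_loaded = datetime.now().isoformat())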