def save_delete_manifest(self):
    """Save an updated manifest for deletion.

    armada delete doesn't support --values files as apply does. To handle
    proper deletion of the conditional charts/chart groups that end up in
    the overrides files, create a unified file for use when deleting.

    NOTE #1: If we want to abandon using manifest overrides files
    altogether, this generated file could probably be used on apply and
    delete.

    NOTE #2: Diffing the original manifest and this manifest provides a
    clear view of the conditional changes that were enforced by the
    system in the plugins.
    """
    if os.path.exists(self.manifest_path):

        # cleanup existing deletion manifest
        self._cleanup_deletion_manifest()

        with open(self.delete_manifest, 'w') as f:
            try:
                yaml.dump_all(self.content, f, Dumper=yaml.RoundTripDumper,
                              explicit_start=True, default_flow_style=False)
                LOG.debug("Delete manifest file %s generated" %
                          self.delete_manifest)
            except Exception as e:
                LOG.error("Failed to generate delete manifest file %s: %s" %
                          (self.delete_manifest, e))
    else:
        LOG.error("Manifest directory %s does not exist" %
                  self.manifest_path)
def main(argv):
    yaml_file = argv[1]
    yaml_output = argv[2]
    image_record_files = argv[3:]

    document_out = collections.OrderedDict()
    new_image_dict = {}
    image_records = []

    # Read all lines from all files in image_records list
    for image_record_file in image_record_files:
        with open(image_record_file) as ir_file:
            new_records = [line.rstrip() for line in ir_file.readlines()]
            image_records.extend(new_records)

    # Create a dictionary to map image name to image location/tag
    for image in image_records:
        name = get_image_name(image)
        if name != '':
            new_image_dict[name] = image

    # Load chart into dictionary(s) and then modify any image locations/tags if required
    for document in yaml.load_all(open(yaml_file),
                                  Loader=yaml.RoundTripLoader,
                                  preserve_quotes=True,
                                  version=(1, 1)):
        document_name = (document['schema'],
                         document['metadata']['schema'],
                         document['metadata']['name'])
        modify_yaml(document, '', '', new_image_dict)
        document_out[document_name] = document

    # Save modified yaml to file
    yaml.dump_all(document_out.values(),
                  open(yaml_output, 'w'),
                  Dumper=yaml.RoundTripDumper,
                  default_flow_style=False)
def round_trip_dump_all(
    data,
    stream=None,
    # *,
    indent=None,
    block_seq_indent=None,
    default_flow_style=unset,
    top_level_colon_align=None,
    prefix_colon=None,
    explicit_start=None,
    explicit_end=None,
    version=None,
    allow_unicode=None,
):
    import ruamel.yaml  # NOQA

    yaml = ruamel.yaml.YAML()
    yaml.indent(mapping=indent, sequence=indent, offset=block_seq_indent)
    if default_flow_style is not unset:
        yaml.default_flow_style = default_flow_style
    yaml.top_level_colon_align = top_level_colon_align
    yaml.prefix_colon = prefix_colon
    yaml.explicit_start = explicit_start
    yaml.explicit_end = explicit_end
    yaml.version = version
    yaml.allow_unicode = allow_unicode
    if stream is not None:
        # Use dump_all here as well, so every document in `data` is written,
        # not just the iterable wrapped up as a single document.
        yaml.dump_all(data, stream=stream)
        return
    buf = io.StringIO()
    yaml.dump_all(data, stream=buf)
    return buf.getvalue()
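# A small usage sketch for the helper above (the call below is illustrative, not
# part of the original source): with stream=None the documents come back as one
# string, and explicit_start=True puts a '---' marker before each document.
documents = [{'name': 'first'}, {'name': 'second'}]
print(round_trip_dump_all(documents, explicit_start=True))
# ---
# name: first
# ---
# name: second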
def move_annotations(f):
    """Modifies a YAML ProwJob file in-place by moving name and annotations
    to the top of the spec elements.

    :param f:
    :return:
    """
    files = list(yaml.load_all(open(f)))
    # pylint: disable=R1702
    for lvl1 in files:
        for lvl2 in lvl1.values():
            if isinstance(lvl2, ruamel.yaml.comments.CommentedSeq):
                for job in lvl2:
                    if 'annotations' not in job:
                        continue
                    job.move_to_end('annotations', last=False)
                    job.move_to_end('name', last=False)
            elif isinstance(lvl2, ruamel.yaml.comments.CommentedMap):
                for lvl3 in lvl2.values():
                    if isinstance(lvl3, bool):
                        continue
                    for job in lvl3:
                        if 'annotations' not in job:
                            continue
                        job.move_to_end('annotations', last=False)
                        job.move_to_end('name', last=False)
            else:
                print('skipping', lvl2)
    yaml.dump_all(files, open(f, 'w'))
def dump_list_yaml(data, path):
    """Dump a list of dictionaries to a single yaml file."""
    with open(path, 'w') as file:
        yaml.dump_all(data, file, default_flow_style=False,
                      Dumper=NoAliasRTDumper)
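# The snippet above relies on a NoAliasRTDumper that is not shown. A minimal
# sketch of what such a dumper could look like (an assumption, not the original
# definition): a RoundTripDumper whose ignore_aliases() always returns True, so
# repeated objects are written out in full instead of as '&anchor'/'*alias'.
import ruamel.yaml as yaml


class NoAliasRTDumper(yaml.RoundTripDumper):
    def ignore_aliases(self, data):
        # Never emit anchors/aliases, even for objects that appear more than once.
        return True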
def write_data(self, code_type: str = "utf-8") -> Union[list, dict, None]:
    # The with-statement closes the file, so no explicit close() is needed.
    with open(self.path, "w", encoding=code_type) as _yaml:
        if isinstance(self.data, list):
            ryaml.dump_all(self.data, _yaml, Dumper=ryaml.RoundTripDumper)
        else:
            ryaml.dump(self.data, _yaml, Dumper=ryaml.RoundTripDumper)
    return self.data
def _process_datasets(output_dir: Path, datasets: Iterable[Path],
                      do_checksum: bool, newer_than: datetime):
    logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s",
                        level=logging.INFO)
    for dataset_path in datasets:
        (mode, ino, dev, nlink, uid, gid, size, atime, mtime,
         ctime) = os.stat(str(dataset_path))
        create_date = datetime.utcfromtimestamp(ctime)
        if create_date <= newer_than:
            logging.info(
                "Dataset creation time %s is older than start date %s...SKIPPING",
                create_date,
                newer_than,
            )
        else:
            if dataset_path.is_dir():
                dataset_path = dataset_path.joinpath(
                    dataset_path.stem.replace("PRD_MSIL1C", "MTD_SAFL1C") + ".xml")
            if dataset_path.suffix not in [".xml", ".zip"]:
                raise RuntimeError("want xml or zipped archive")
            logging.info("Processing %s", dataset_path)
            output_path = Path(output_dir)
            yaml_path = output_path.joinpath(dataset_path.name + ".yaml")
            logging.info("Output %s", yaml_path)
            if os.path.exists(str(yaml_path)):
                logging.info("Output already exists %s", yaml_path)
                with open(str(yaml_path)) as f:
                    if do_checksum:
                        logging.info("Running checksum comparison")
                        datamap = yaml.load_all(f)
                        for data in datamap:
                            yaml_sha1 = data["checksum_sha1"]
                            checksum_sha1 = hashlib.sha1(
                                dataset_path.open("rb").read()).hexdigest()
                        if checksum_sha1 == yaml_sha1:
                            logging.info(
                                "Dataset preparation already done...SKIPPING")
                            continue
                        else:
                            logging.info(
                                "Dataset has changed...ARCHIVING out of date yaml")
                            archive_yaml(yaml_path, output_dir)
                    else:
                        logging.info(
                            "Dataset preparation already done...SKIPPING")
                        continue
            documents = prepare_dataset(dataset_path)
            if documents:
                logging.info("Writing %s dataset(s) into %s",
                             len(documents), yaml_path)
                with open(str(yaml_path), "w") as stream:
                    yaml.dump_all(documents, stream)
            else:
                logging.info("No datasets discovered. Bye!")
def operation_yaml(self, mode='r', *data):
    # operate on the YAML file
    if mode == "w":
        with open(self.__tgt_file, 'w', encoding='utf-8') as f:
            yaml.dump_all(data, f, Dumper=yaml.RoundTripDumper)
            logger.info("YAML file written successfully.")
    if mode == "r":
        warnings.simplefilter('ignore', ruamel.yaml.error.UnsafeLoaderWarning)
        with open(self.__tgt_file, 'r', encoding='utf-8') as f:
            result = yaml.load_all(f.read())
            logger.info("YAML file read successfully.")
            return result
def write_yaml_file(self, *data):
    """
    Write YAML data to the file.

    :param data: one or more documents to dump
    :return:
    """
    try:
        # The with-statement closes the file; an explicit close() in a finally
        # clause would raise NameError if open() itself failed.
        with open(self.file, 'w', encoding='utf-8') as f:
            yaml.dump_all(data, f, Dumper=yaml.RoundTripDumper)
        print("Write successful!")
    except Exception as e:
        print(f"Error while writing: {e}")
def main():
    parser = argparse.ArgumentParser()
    requiredArgs = parser.add_argument_group('required arguments')
    requiredArgs.add_argument('-i', '--input', dest="input_yaml",
                              default=False, help='YAML file to be sorted',
                              required=True, metavar='INPUT_YAML_FILE')
    parser.add_argument('-o', '--output', dest="output_yaml", default=False,
                        help='Sorted YAML output file',
                        metavar='OUTPUT_YAML_FILE')
    if len(sys.argv) < 3:
        parser.print_help()
        sys.exit(1)
    try:
        args = parser.parse_args()
    except SystemExit:
        parser.print_help()
        sys.exit(0)
    try:
        f_input = open(args.input_yaml, 'r')
        yaml_tmp = list(
            ruamel.yaml.round_trip_load_all(f_input, preserve_quotes=True))
        if not yaml_tmp[-1]:
            yaml_tmp = yaml_tmp[:-1]
        sorted_yaml = sorted(yaml_tmp, key=lambda i: i['name'])
        if args.output_yaml:
            f_output = open(args.output_yaml, 'w')
        else:
            f_output = open(args.input_yaml + "_sorted.yaml", 'w')
        yaml.dump_all(sorted_yaml, f_output)
        f_output.write("...")
        f_input.close()
        f_output.close()
        print("[INFO] - Sorting completed.")
        print("[INFO] - Number of Artifacts: " + str(len(yaml_tmp)))
    except Exception as exc:
        print(exc)
def export_yaml_file(content: Union[dict, list], path: str,
                     multiple: bool = False):
    """
    Save a YAML file.

    @param content: file content
    @param path: path of the file to save
    @param multiple: whether the file contains multiple documents
    @return:
    """
    with open(path, "w", encoding="utf-8") as f:
        if multiple:
            yaml.dump_all(content, f, Dumper=yaml.RoundTripDumper)
        else:
            yaml.dump(content, f, Dumper=yaml.RoundTripDumper)
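# Hypothetical usage of export_yaml_file above (file names and data are made up):
# a single mapping goes through yaml.dump, while a list of documents is written
# with yaml.dump_all and separated by '---' markers.
export_yaml_file({"name": "demo"}, "single_doc.yml")
export_yaml_file([{"name": "a"}, {"name": "b"}], "multi_doc.yml", multiple=True)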
def serialize(self, out):
    def nested_set(target, path, value):
        if len(path) > 1:
            if (len(path) == 2 and path[1][0] == '[' and path[1][-1] == ']'
                    and path[1][1:-1].isdigit()):
                if path[0] not in target:
                    target[path[0]] = []
                target[path[0]].append(value)
            else:
                if path[0] not in target:
                    target[path[0]] = UnsortableOrderedDict()
                nested_set(target[path[0]], path[1:], value)
        else:
            target[path[0]] = value

    units = UnsortableOrderedDict()
    for unit in self.unit_iter():
        nested_set(units, unit.getid().split(' / '), unit.target)
    out.write(
        yaml.dump_all(
            [self.get_root_node(units)],
            Dumper=YAMLDumper,
            default_flow_style=False,
            encoding='utf-8',
            allow_unicode=True,
            default_style='|',
            width=float("inf"),
        ))
def main(templated_helm_file, require_file):
    yaml = ruamel.yaml.YAML()
    yaml.indent(mapping=2)
    with open(templated_helm_file, 'r') as f:
        templated_helm = yaml.load_all(f.read())
    with open(require_file, 'r') as f:
        requirements = yaml.load(f.read())
    checker = RequirementChecker(requirements.get('resources'))
    new_doc = {}
    for yaml_doc in templated_helm:
        if yaml_doc is None:
            continue
        if checker.is_required(yaml_doc):
            new_doc[get_resource_key(yaml_doc)] = yaml_doc
    print('# GENERATED FILE: edits made by hand will not be preserved.')
    print('---')
    yaml.dump_all(same_sort(requirements, new_doc), sys.stdout)
def test_load_all_perserve_quotes(self):
    import ruamel.yaml  # NOQA

    yaml = ruamel.yaml.YAML()
    yaml.preserve_quotes = True
    s = dedent("""\
    a: 'hello'
    ---
    b: "goodbye"
    """)
    data = []
    for x in yaml.load_all(s):
        data.append(x)
    buf = ruamel.yaml.compat.StringIO()
    yaml.dump_all(data, buf)
    out = buf.getvalue()
    print(type(data[0]['a']), data[0]['a'])
    # out = ruamel.yaml.round_trip_dump_all(data)
    print(out)
    assert out == s
def load_yaml_file(path, round_tripping=False):
    with io.open(path, 'r', encoding='utf-8') as reader:
        pathdir = os.path.dirname(path)
        newfolder = pathdir + "copy"
        if not os.path.exists(newfolder):  # check whether the folder already exists
            os.mkdir(newfolder)
        if round_tripping:  # whether there are comments to preserve (round-trip)
            data = round_trip_load(reader)
            # with open(newfolder + "/" + os.path.basename(path), "w", encoding="utf-8") as w:
            # Concatenating the path with "+" needs an explicit "/"; os.path.join takes comma-separated parts.
            with open(os.path.join(newfolder, os.path.basename(path)),
                      "w", encoding="utf-8") as wr:
                round_trip_dump(data, wr, allow_unicode=True)
        else:
            data = safe_load(reader)
            with open(os.path.basename(path), "w", encoding="utf-8") as wr:
                # dump(data, w, allow_unicode=True)
                dump_all([data], wr, allow_unicode=True)
    return data
def to_yaml(self) -> str:
    """
    Serializes this object to a YAML string.

    :return: the YAML string
    """
    root_as_dict = self._root.to_dict()
    config_maps_as_dicts = [cm.to_dict() for cm in self.config_maps]
    role_bindings_as_dicts = [rb.to_dict() for rb in self.role_bindings]
    services_as_dicts = [svc.to_dict() for svc in self.services]
    return yaml.dump_all(
        itertools.chain(role_bindings_as_dicts, config_maps_as_dicts,
                        services_as_dicts, [root_as_dict]),
        Dumper=yaml.RoundTripDumper,
    )
def dump_all_round_trip(documents, stream=None, **kwargs):
    """Dumps multiple YAML documents to the stream using the RoundTripDumper.

    Args:
        documents: An iterable of YAML-serializable Python objects to dump.
        stream: The stream to write the data to, or None to return it as a
            string.
        **kwargs: Other arguments to the dump method.

    Returns:
        The string representation of the YAML data if stream is None.
    """
    return yaml.dump_all(documents,
                         stream=stream,
                         default_flow_style=False,
                         indent=2,
                         Dumper=yaml.RoundTripDumper,
                         **kwargs)
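# Hypothetical usage of dump_all_round_trip above: with stream=None the YAML text
# is returned as a string; passing a file object writes it out instead (the file
# name below is made up for illustration).
text = dump_all_round_trip([{'a': 1}, {'b': 2}])
print(text)
with open('out.yaml', 'w') as f:
    dump_all_round_trip([{'a': 1}, {'b': 2}], stream=f)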
def test_multi_doc_begin_end(self):
    from ruamel import yaml

    inp = """\
---
- a
...
---
- b
...
"""
    docs = list(round_trip_load_all(inp))
    assert docs == [['a'], ['b']]
    out = yaml.dump_all(
        docs, Dumper=yaml.RoundTripDumper, explicit_start=True, explicit_end=True
    )
    assert out == '---\n- a\n...\n---\n- b\n...\n'
def loadsend_file(self, filename):
    # Verify file can be opened.
    if not os.path.exists(filename):
        raise FileNotFoundError(filename)
    file_templ_vars = dict(
        file_name=os.path.abspath(filename),
        file_dir=os.path.abspath(os.path.dirname(filename)),
        file_uuid1=uuid.uuid1(),
        work_dir=self.opts.workdir,
    )
    log.info('Processing "%s"...', filename)
    fp = open(filename)
    yaml = ruamel.yaml.YAML()
    datas = list(yaml.load_all(fp))
    for data in datas:
        # Some sanity checks...
        if 'notes' not in data:
            log.warning("No notes in data.")
            continue
        elif not data['notes']:
            log.warning("Data has empty notes list.")
            continue
        query_results, defaults = self.query_notes(self.opts.query, data)
        if query_results.empty:
            log.warning("Query returned no results.")
        else:
            if self.opts.question:
                log.info("")
                log.info("Running in question mode.")
                log.info("Query results:\n %s", str(query_results))
                log.info("")
                log.info("Query result details below.")
                log.info("")
            else:
                if self.opts.annotations:
                    log.info("")
                    log.info("Running in annotations mode.")
                    log.info("")
                log.debug("Query results:\n %s", str(query_results))
            annotations_field = defaults['annotationsField']
            for i in query_results.index:
                rtnote = query_results.rtnote[i]
                note_id = str(rtnote['id'])
                skip = query_results.skip[i]
                deck = query_results.deckName[i]
                model = query_results.modelName[i]
                use_md = query_results.useMarkdown[i]
                md_sty = query_results.markdownStyle[i]
                md_lineno = query_results.markdownLineNums[i]
                md_tablen = query_results.markdownTabLength[i]
                md_mathext = query_results.useMarkdownMathExt[i]
                string_templ_delim = query_results.stringTemplDelim[i]
                tags = sorted(query_results.tags[i].replace(',', '\n').split())
                fields = query_results.fields[i]
                media = query_results.media[i]
                description = "{}:{}".format(note_id, fields)
                if self.opts.question:
                    log.info("*** ID: {}".format(note_id))
                    log.info("--- Should skip: {}".format(skip))
                    log.info("--- Note tags:")
                    #log.info(yaml.dump(tags))
                    log.info(tags)
                    log.info("--- Note annotations:")
                    #log.info(yaml.dump(query_results.annotations[i]))
                    log.info(query_results.annotations[i])
                    log.info("--- Note fields:")
                    #log.info(yaml.dump(fields))
                    #log.info(fields)
                    #log.info("--- Raw fields:")
                    log.info(fields)
                    log.info("--- Note media:")
                    log.info(media)
                    log.info("")
                    continue
                if skip:
                    log.info("Skipping note with ID: {}".format(note_id))
                    continue
                log.info("Processing note with ID: {}".format(note_id))
                log.debug("Note fields: {}".format(fields))
                # Check for note with given ID.
                # Get info for existing note.
                creating_new_note = True
                note_info = self.anki.notesInfo([note_id])
                if note_info.get("error", None) or not note_info['result'][0]:
                    if self.opts.annotations:
                        log.info(
                            "Can't find note with ID %s; skipping annotations for this note.",
                            note_id)
                        continue
                    else:
                        log.info(
                            "Can't find note with ID %s; a new note will be created.",
                            note_id)
                else:
                    creating_new_note = False
                note_templ_vars = dict(file_templ_vars)
                note_templ_vars['note_id'] = note_id
                note_uuid1 = uuid.uuid1()
                note_templ_vars['note_uuid1'] = note_uuid1
                file_uuid1 = note_templ_vars['file_uuid1']
                if creating_new_note:
                    # No provided ID; assume new note should be created.
                    log.debug("Creating new note...")
                    temporary_fields = {
                        k: self.format_text(str(v), False, md_sty, md_lineno,
                                            md_tablen, md_mathext,
                                            string_templ_delim,
                                            **note_templ_vars)
                        for (k, v) in fields.items()
                    }
                    # Create, obtaining returned ID
                    anki_result = self.anki.addNote(deck, model,
                                                    temporary_fields,
                                                    tags=tags)
                    if anki_result.get("error", None):
                        log.warning("Can't create note: %s", description)
                    else:
                        # Add ID to note_node
                        note_id = anki_result['result']
                        note_templ_vars['note_id'] = note_id
                        prev_id, rtnote['id'] = rtnote['id'], note_id
                        log.info("ID %s replaced with %s.", prev_id, note_id)
                log.debug("Updating note...")
                # Assume provided ID is valid for existing note to be updated.
                # Convert each field from Markdown (if `use_md` is True).

                # Special handling for the annotations field. If we can find this note
                # in Anki's flashcard deck, then we'll grab any annotations the user has
                # made for that note, and store them in the YAML file. If we are instead
                # creating a new note, we'll transfer any annotations from the YAML file
                # to the new note.
                #
                # Because of this special handling, we remove annotations from the
                # shallow copy of the note. We'll instead use the round-trip version.
                if self.opts.annotations:
                    pass
                else:

                    class StringTemplate(string.Template):
                        delimiter = string_templ_delim

                    if annotations_field in fields:
                        del fields[annotations_field]
                    converted_fields = {
                        k: self.format_text(str(v), use_md, md_sty, md_lineno,
                                            md_tablen, md_mathext,
                                            string_templ_delim, field_no,
                                            **note_templ_vars)
                        for (field_no, (k, v)) in enumerate(fields.items())
                    }
                    for media_item in media:
                        item_path = media_item['path']
                        #item_path = string.Template(item_path).safe_substitute(
                        item_path = StringTemplate(item_path).safe_substitute(
                            note_templ_vars)
                        item_name = media_item.get('name',
                                                   os.path.basename(item_path))
                        #item_name = string.Template(item_name).safe_substitute(
                        item_name = StringTemplate(item_name).safe_substitute(
                            note_templ_vars)
                        log.info("Considering sending media item...")
                        log.info("  local path: {}".format(item_path))
                        log.info("  remote name: {}".format(item_name))
                        anki_result = self.anki.statMediaFile(item_name)
                        must_send_new_media_item = False
                        item_data = None
                        try:
                            if anki_result.get("error", None):
                                log.info(
                                    "Can't get remote media file status (probably missing)...")
                                must_send_new_media_item = True
                            else:
                                if not anki_result['result']:
                                    log.info(
                                        "... Media item is not present on remote...")
                                    must_send_new_media_item = True
                                else:
                                    log.info(
                                        "... Media item is already present on remote...")
                                    log.info("... Reading local data...")
                                    item_data = open(item_path, 'rb').read()
                                    item_adler32 = zlib.adler32(item_data)
                                    remote_adler32 = anki_result['result']['adler32']
                                    log.info("  Remote checksum: {}".format(
                                        remote_adler32))
                                    log.info("  Local checksum: {}".format(
                                        item_adler32))
                                    if remote_adler32 == item_adler32:
                                        log.info(
                                            "... Remote checksum matches that of local version...")
                                    else:
                                        log.info(
                                            "... Remote checksum is not the same as local...")
                                        must_send_new_media_item = True
                            if must_send_new_media_item:
                                if item_data is None:
                                    log.info("... Reading local data...")
                                    item_data = open(item_path, 'rb').read()
                                log.info(
                                    "... Encoding {} bytes of local data...".format(
                                        len(item_data)))
                                item_base64 = base64.b64encode(
                                    item_data).decode("utf-8")
                                log.info(
                                    "... Sending {} bytes of encoded data to remote..."
                                    .format(len(item_base64)))
                                anki_result = self.anki.storeMediaFile(
                                    item_name, item_base64)
                                if anki_result.get("error", None):
                                    log.warning("Can't store media file: %s",
                                                item_name)
                        except FileNotFoundError as e:
                            log.warning('File not found: "%s"', e)
                            log.warning('*** Skipping missing media item: %s',
                                        item_name)
                        finally:
                            log.info("... Done with media item.")

                # If we found this note in Anki's flashcard deck, then we'll grab any
                # annotations the user has made for that note, and store them in the
                # YAML file. If we are instead creating a new note, we'll transfer any
                # annotations from the YAML file to the new note.
                if creating_new_note:
                    log.debug("Transferring annotations to new note...")
                    # Transfer annotations from YAML file to new note.
                    annotations = rtnote['fields'].get(annotations_field, "")
                else:
                    log.debug("Transferring annotations from existing note...")
                    upstream_fields = note_info['result'][0]['fields']
                    annotations = upstream_fields.get(annotations_field,
                                                      dict(value=''))['value']
                    # Transfer annotations from existing note to YAML file.
                    rtnote['fields'][annotations_field] = annotations
                if not self.opts.annotations:
                    converted_fields[annotations_field] = annotations
                    # Update converted note fields...
                    result = self.anki.updateNoteFields(note_id,
                                                        converted_fields)
                    if result.get("error", None):
                        log.warning("Can't update note: %s", description)
                        continue
                # Update note tags...
                ## First get existing note tags.
                note_info = self.anki.notesInfo([note_id])
                if note_info.get("error", None):
                    log.warning("Can't get tags for note: %s", description)
                    continue
                current_tags = sorted(note_info['result'][0]['tags'])
                if current_tags != tags:
                    rt_non_annot_tags = set(
                        filter(lambda s: not s.startswith('ann:'),
                               rtnote.get('tags', list())))
                    non_annot_tags = set(
                        filter(lambda s: not s.startswith('ann:'), tags))
                    cur_non_annot_tags = set(
                        filter(lambda s: not s.startswith('ann:'),
                               current_tags))
                    cur_annot_tags = set(
                        filter(lambda s: s.startswith('ann:'), current_tags))
                    tags = sorted(list(non_annot_tags.union(cur_annot_tags)))
                    rt_tags = sorted(
                        list(rt_non_annot_tags.union(cur_annot_tags)))
                    rtnote['tags'] = rt_tags
                    ## Remove existing note tags.
                    log.info("Removing tags %s...", cur_non_annot_tags)
                    result = self.anki.removeTags([note_id],
                                                  " ".join(cur_non_annot_tags))
                    if result.get("error", None):
                        log.warning("Can't remove tags for note: %s",
                                    description)
                    ## Add new note tags.
                    log.info("Replacing with tags %s...", tags)
                    result = self.anki.addTags([note_id], " ".join(tags))
                    if result.get("error", None):
                        log.warning("Can't add tags for note: %s", description)
                    note_info = self.anki.notesInfo([note_id])
    log.info('Saving "%s"...', filename)
    fp = open(filename, 'w')
    yaml.dump_all(datas, fp)
        for line in f:
            if re.match(pattern, line):
                return True
    return False


# load the nodes from the reachability graph from the Alloy/reach output
configs, old_graph = loadReachability()
new_graph = {}
num_old = 0
num_new = 0
for n in old_graph:
    # generate a new reachability graph from all the transitions whose
    # upper bounds exceed the threshold
    new_graph[n] = {x: old_graph[n][x] for x in old_graph[n]
                    if check(configs["configs"][n], configs["configs"][x])}
    # count both the old and new transitions (for reporting purposes)
    num_old += len(old_graph[n])
    num_new += len(new_graph[n])

print("Trimmed %d of %d transitions (%f%%)" %
      (num_old - num_new, num_old, ((num_old - num_new) / num_old) * 100.0))

subdir = "%s/trim%s" % (results_dir, weights)
if not os.path.exists(subdir):
    try:
        os.makedirs(subdir)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

with open("%s/rubis.yaml" % subdir, 'w') as f:
    yaml.dump_all((configs, new_graph), f)
# python3 configmap2secrets.py ../../kube/config/dev/internal-resources-cm.yml

import base64
from pathlib import Path
import sys

import ruamel.yaml
from ruamel.yaml.comments import CommentedMap as OrderedDict  # to avoid '!!omap' in yaml

input_file = Path(sys.argv[1])

yaml = ruamel.yaml.YAML()
yaml.compact(seq_seq=False, seq_map=True)
yaml.default_flow_style = False
yaml.explicit_start = True
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=2, offset=0)
yaml.version = (1, 2)

L = [D for D in yaml.load_all(input_file)]
for D in L:
    if D.get('kind') == 'ConfigMap':
        OD = OrderedDict()
        for k, v in D.get('data').items():
            OD[k] = base64.b64encode(v.encode('ascii')).decode('ascii')
        D['kind'] = 'Secret'
        D['data'] = OD

yaml.dump_all(L, stream=sys.stdout)
import sys

import ruamel.yaml as yaml

GET_QUOTE_RESPONSE = ' '.join(sys.argv[1:])

file = "../kubernetes-manifests/shippingservice.yaml"
with open(file, "r") as stream:
    d = list(yaml.safe_load_all(stream))
d[0]['spec']['template']['spec']['containers'][0]['env'][1][
    'value'] = GET_QUOTE_RESPONSE
with open(file, "w") as stream:
    yaml.dump_all(d, stream, default_flow_style=False)

file = "../kubernetes-manifests/testservice.yaml"
with open(file, "r") as stream:
    d = list(yaml.safe_load_all(stream))
d[0]['spec']['template']['spec']['containers'][0]['env'][1][
    'value'] = GET_QUOTE_RESPONSE
with open(file, "w") as stream:
    yaml.dump_all(d, stream, default_flow_style=False)
def main():
    utils.custom_global_logging_setup()
    utils.setup_logging(THIS_FILE.with_suffix('.logconfig.yaml'))
    log.info('starting...')

    datafilepath = DATA_DIR.joinpath('official-2014.combined-withalt.m2')
    log.info('input: %s', datafilepath)
    with io.open(datafilepath, 'rt', encoding='utf-8') as text_istream:
        sentence_annotations = m2format.parse(text_istream)
        sentence_annotations = list(sentence_annotations)
    # sentence_annotations = sentence_annotations[5:6]
    # sentence_annotations = sentence_annotations[9:10]

    ##########################################################################
    # making homogeneous
    ##########################################################################
    sentences, annotations = make_homogeneous_sentence_annotations(
        sentence_annotations)

    ##########################################################################
    # concatenation
    ##########################################################################
    tokens = list(itertools.chain.from_iterable(sentences))
    annotations = {
        annotator_id: list(concatenate_edits(annotator_edits))
        for annotator_id, annotator_edits in annotations.items()
    }
    assert set(
        max(e.region.end for e in edits)
        for edits in annotations.values()) == {len(tokens)}

    ##########################################################################
    # normalization to single token edits
    ##########################################################################
    annotations = {
        # stdlib sorted() is stable, so the order of token inserts is not
        # expected to break here
        annotator_id: sorted(
            itertools.chain.from_iterable(
                map(normalize_to_single_token_edits, annotator_edits)),
            key=lambda edit: (edit.region.beg,
                              edit.region.end - edit.region.beg),
        )
        for annotator_id, annotator_edits in annotations.items()
    }
    assert set(
        max(e.region.end for e in edits)
        for edits in annotations.values()) == {len(tokens)}

    ##########################################################################
    # analysis
    ##########################################################################
    def describe(text=None):
        def decorator(f):
            f.description = text if text is not None else f.__name__
            return f
        return decorator

    @describe()
    def filter_both_annotators_saw_token(edits_a, edits_b):
        def positions_saw(edits):
            return set(e.region.beg for e in edits if e.type is not None)

        positions_allowed = positions_saw(edits_a) & positions_saw(edits_b)
        edits_a = [
            e for e in edits_a
            if e.region.beg in positions_allowed and e.type is not None
        ]
        edits_b = [
            e for e in edits_b
            if e.region.beg in positions_allowed and e.type is not None
        ]
        return edits_a, edits_b

    @describe()
    def erase_all_edit_types(edits_a, edits_b):
        NON_EXISTING_EDIT_TYPE = object()
        edits_a = [
            m2format.Edit(e.region, e.tokens, NON_EXISTING_EDIT_TYPE)
            for e in edits_a
        ]
        edits_b = [
            m2format.Edit(e.region, e.tokens, NON_EXISTING_EDIT_TYPE)
            for e in edits_b
        ]
        return edits_a, edits_b

    @describe()
    def erase_all_tokens(edits_a, edits_b):
        edits_a = [m2format.Edit(e.region, tuple(), e.type) for e in edits_a]
        edits_b = [m2format.Edit(e.region, tuple(), e.type) for e in edits_b]
        return edits_a, edits_b

    def filter_by_edit_type(edit_type_predicate, edits_a, edits_b):
        edits_a = [e for e in edits_a if edit_type_predicate(e.type)]
        edits_b = [e for e in edits_b if edit_type_predicate(e.type)]
        return edits_a, edits_b

    def analyze(annotations, pair_edits_transforms=tuple()):
        results = {
            'transforms': [t.description for t in pair_edits_transforms],
            'pairs': {},
            'f1_a_mean': None,
            'f1_median': None,
        }
        f1s = []
        annotators_ids = sorted(annotations.keys())
        for a, b in itertools.combinations(annotators_ids, 2):
            edits_a, edits_b = annotations[a], annotations[b]
            # print(a, b, len(edits_a), len(edits_b))
            for t in pair_edits_transforms:
                edits_a, edits_b = t(edits_a, edits_b)
                # print(a, b, len(edits_a), len(edits_b), 'after:', t.description)
            edits_a, edits_b = set(edits_a), set(edits_b)
            # print(a, b, len(edits_a), len(edits_b))
            edits_ab = edits_a & edits_b
            na, nb, nab = len(edits_a), len(edits_b), len(edits_ab)
            f1_score = utils.f1_score(na, nb, nab)
            f1s.append(f1_score)
            results['pairs'][(a, b)] = [round(f1_score, 4), na, nb, nab]
        results['f1_a_mean'] = round(statistics.mean(f1s), 4)
        results['f1_median'] = round(statistics.median(f1s), 4)
        return results

    def generate_various_analysis(annotations):
        yield analyze(annotations)
        yield analyze(annotations, (filter_both_annotators_saw_token, ))
        yield analyze(annotations,
                      (filter_both_annotators_saw_token, erase_all_edit_types))

        error_types = set(e.type for edits in annotations.values()
                          for e in edits if e.type is not None)
        error_types = sorted(error_types, reverse=True)

        def make_error_type_filter(edit_type):
            etf = functools.partial(filter_by_edit_type,
                                    lambda et: et == edit_type)
            return describe('edit.type == {}'.format(edit_type))(etf)

        def make_error_type_neq_filter(edit_type):
            etf = functools.partial(filter_by_edit_type,
                                    lambda et: et != edit_type)
            return describe('edit.type != {}'.format(edit_type))(etf)

        etf_error = make_error_type_neq_filter(
            m2format.IDENTITY_CORRECTION_TYPE)
        yield analyze(annotations, (
            filter_both_annotators_saw_token,
            etf_error,
            erase_all_edit_types,
        ))
        yield analyze(annotations, (
            filter_both_annotators_saw_token,
            etf_error,
            erase_all_edit_types,
            erase_all_tokens,
        ))

        error_type_filters = [make_error_type_filter(et) for et in error_types]
        for etf in error_type_filters:
            yield analyze(annotations, (filter_both_annotators_saw_token, etf))

    results = generate_various_analysis(annotations)
    results = utils.log_iterator_progress('analysis', 1, results, log=log)

    outfilename = THIS_FILE.with_suffix('.results.yaml')
    outfilepath = OUT_DIR.joinpath(outfilename)
    log.info('output: %s', outfilepath)
    with io.open(outfilepath, 'wt', encoding='utf-8') as text_ostream:
        yaml = ruamel.yaml.YAML(typ='safe', pure=True)
        yaml.dump_all(results, text_ostream)
    log.info('finished')
def encode_payload(payload, decode=False):
    encoder = base64_decode if decode else base64_encode
    encoded_payload = []
    for document in payload:
        encoded_payload.append(_encode(document, encoder))
    return dump_all(encoded_payload, Dumper=RoundTripDumper)
def dump_yaml_file(data, path, round_tripping=False):
    with io.open(path, 'w', encoding='utf-8') as writer:
        if round_tripping:
            round_trip_dump(data, writer, allow_unicode=True)
        else:
            dump_all([data], writer, allow_unicode=True)
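# Hypothetical usage of dump_yaml_file above (file names and data are made up):
# the default path wraps the data in a single-document dump_all() call, while
# round_tripping=True uses round_trip_dump, which preserves comments and quoting
# on data that was loaded with round_trip_load.
data = {"name": "demo", "values": [1, 2, 3]}
dump_yaml_file(data, "plain.yaml")
dump_yaml_file(data, "round_tripped.yaml", round_tripping=True)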