def _get_fe_coretype(fe_label, vus): if fe_label == 'O': raise InvalidParameterError( 'Unspecified FE label: {}'.format(fe_label)) if not vus: raise InvalidParameterError('Input ValenceUnit list is empty') fe_name = fe_label[2:] for vu in vus: if vu.fe.name == fe_name: return vu.fe.coretype raise Exception('Could not output FE coreType: no matching FE name \'{}\' ' 'found in ValenceUnit list {}'.format(fe_name, vus))
def _convert(args):
    """Convert annotations from ``args.source_format`` to ``args.target_format``.

    ``args`` is expected to expose the attributes read below:
    ``source_format``, ``target_format``, ``source_path``, ``target_path``,
    ``splits``, ``with_exemplars``, ``sent``, ``filter``,
    ``output_sentences``, ``excluded_frames``, ``excluded_sentences`` and
    ``excluded_annosets``.

    An 'fnxml' source is unmarshalled into a {splits: annosets} dict, while
    'bios' and 'semafor' sources are unmarshalled into a single annosets
    collection. The 'bios' and 'semafor' marshallers consume the dict and the
    'semeval' marshaller consumes a single annosets collection.

    Raises
    ------
    InvalidParameterError: if source and target formats or paths are equal,
        if the requested source/target combination cannot be served, or if
        a bios/semafor source is given without the --sent parameter.
    """
    if args.source_format == args.target_format:
        raise InvalidParameterError(
            'Source and Target formats are the same! Please specify different '
            'source/target formats')
    if args.source_path == args.target_path:
        raise InvalidParameterError(
            'Source and Target paths are the same! Please specify different '
            'source/target paths')
    # The bios and semafor marshallers need the {splits: annosets} dict that
    # only the fnxml unmarshaller builds. Fail before touching the filesystem
    # instead of hitting an unbound `annosets_dict` further down.
    if (args.target_format in ('bios', 'semafor')
            and args.source_format != 'fnxml'):
        raise InvalidParameterError(
            'Unsupported conversion from {} to {}: the {} target format '
            'requires an fnxml source'.format(args.source_format,
                                              args.target_format,
                                              args.target_format))
    # Same reasoning for semeval: without a known source nothing would be
    # unmarshalled and `annosets` would be unbound.
    if (args.target_format == 'semeval'
            and args.source_format not in ('fnxml', 'bios', 'semafor')):
        raise InvalidParameterError(
            'Unsupported source format: {}'.format(args.source_format))
    if args.source_format == 'fnxml':
        annosets_dict = fnxml.get_annosets_dict(args.source_path,
                                                args.splits,
                                                args.with_exemplars)
    elif args.source_format == 'bios':
        if args.sent == '__undefined__':
            raise InvalidParameterError(
                'Unspecified sentence file. For bios unmarshalling you need '
                'to specify the --sent parameter pointing at the '
                '.sentences file absolute filepath')
        annosets = biosu.unmarshall_annosets(args.source_path, args.sent)
    elif args.source_format == 'semafor':
        if args.sent == '__undefined__':
            raise InvalidParameterError(
                'Unspecified sentence file. For semafor unmarshalling you '
                'need to specify the --sent parameter pointing at the '
                '.sentences file absolute filepath')
        annosets = semaforu.unmarshall_annosets(args.source_path, args.sent)
    # Starting marshalling
    if args.target_format == 'bios':
        os.makedirs(args.target_path, exist_ok=True)
        biosm.marshall_annosets_dict(annosets_dict, args.target_path,
                                     args.filter, args.output_sentences,
                                     args.excluded_frames,
                                     args.excluded_sentences,
                                     args.excluded_annosets)
    if args.target_format == 'semeval':
        if args.source_format == 'fnxml':
            # Only the requested splits is written, to
            # <target_path>/<splits>.gold.xml
            splits_name = args.splits
            annosets = annosets_dict[splits_name]
            output_filepath = os.path.join(args.target_path,
                                           '{}.gold.xml'.format(splits_name))
        else:
            # bios/semafor sources: target_path is used as the output file
            output_filepath = args.target_path
        semeval.marshall_annosets(annosets, output_filepath,
                                  args.excluded_frames,
                                  args.excluded_sentences,
                                  args.excluded_annosets)
    if args.target_format == 'semafor':
        os.makedirs(args.target_path, exist_ok=True)
        semaform.marshall_annosets_dict(annosets_dict, args.target_path,
                                        args.filter, args.output_sentences,
                                        args.excluded_frames,
                                        args.excluded_sentences,
                                        args.excluded_annosets)
def _get_annosets_dict_from_fn_xml(fn_splits_dirpath, splits, with_exemplars):
    """Build the {'test', 'dev', 'train'} annosets dict for a splits name.

    Extraction is cumulative: 'test' extracts the test directory only, 'dev'
    extracts test and dev, and 'train' extracts all three. Splits that are
    not extracted map to an empty list. Each extraction reads the
    sub-directory of ``fn_splits_dirpath`` named after the splits.

    Raises
    ------
    InvalidParameterError: if ``splits`` is not 'train', 'dev' or 'test'.
    """
    if splits not in ('train', 'dev', 'test'):
        raise InvalidParameterError(
            'Invalid splits name `{}`. Should be `train`, `dev` or `test`'.
            format(splits))
    fe_dict = _get_fe_dict(
        xml_utils.get_xml_filepaths(fn_splits_dirpath, 'frame'))
    # Ordered so that each splits name requires itself plus everything
    # listed before it.
    load_order = ('test', 'dev', 'train')
    requested = load_order[:load_order.index(splits) + 1]
    annosets_dict = {}
    for name in load_order:
        if name in requested:
            annosets_dict[name] = extract_annosets(
                os.path.join(fn_splits_dirpath, name),
                with_fulltexts=True,
                with_exemplars=with_exemplars,
                fe_dict=fe_dict,
                flatten=True)
        else:
            annosets_dict[name] = []
    return annosets_dict
def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                           output_sentences, excluded_frames,
                           excluded_sentences, excluded_annosets):
    """Convert a dict of {splits:pyfn.AnnotationSet} to SEMAFOR splits files.

    The train splits will be converted to a .frame.elements file containing
    both frame and frame element labels depending on filtering options.
    The dev/test splits will be converted to a .frames file containing
    frame labels only.

    Args
    ----
    annosets_dict: a splits to annosets dictionary (as generated by the
        framenet unmarshaller).
    target_dirpath: the absolute path to the target directory where to
        save the output file(s)
    filtering_options: a list of options to pass to the pyfn.utils.filter
        ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes',
        'non_breaking_spaces'). Only applied to the train splits.
    output_sentences: True or False. Whether or not to also output a
        .sentences file listing all sentences (string), one per line.
    excluded_frames: a list of frame #id to exclude from the output
    excluded_sentences: a list of sentence #id to exclude from the output
    excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raises
    ------
    InvalidParameterError: if a key of ``annosets_dict`` is not 'train',
        'dev' or 'test'.
    """
    for splits_name, annosets in annosets_dict.items():
        logger.info(
            'Marshalling {} splits to semafor format'.format(splits_name))
        semafor_filepath = files_utils.get_semafor_filepath(
            target_dirpath, splits_name)
        sent_filepath = files_utils.get_sent_filepath(target_dirpath,
                                                      splits_name)
        if splits_name not in ['train', 'dev', 'test']:
            raise InvalidParameterError(
                'Unsupported splits_name: {}'.format(splits_name))
        is_train = splits_name == 'train'
        # No special filtering on dev/test
        applied_options = filtering_options if is_train else []
        sent_dict = {}
        _marshall_semafor(annosets, applied_options, sent_dict,
                          semafor_filepath, excluded_frames,
                          excluded_sentences, excluded_annosets,
                          train_mode=is_train)
        # print out sentences file
        if output_sentences:
            marsh_utils.marshall_sent_dict(sent_dict, sent_filepath)
def _get_min_index(indexes): min_index = -1 for (start, end) in indexes: if start == -1 or end == -1: raise InvalidParameterError('Target start/end indexes are ' 'undefined') if min_index == -1: min_index = min(start, end) else: min_index = min(min_index, start, end) return min_index
def get_semafor_filepath(target_dirpath, splits_name):
    """Return the path to the SEMAFOR file of a given splits.

    If splits_name is 'train', the file name is train.frame.elements.
    If splits_name is 'dev' or 'test', the file name is dev.frames or
    test.frames respectively.
    The file name is joined to the specified target_dirpath.

    Raises
    ------
    InvalidParameterError: if splits_name is not 'train', 'dev' or 'test'.
    """
    if splits_name == 'train':
        return os.path.join(target_dirpath, 'train.frame.elements')
    if splits_name in ('dev', 'test'):
        filename = '{}.frames'.format(splits_name)
        return os.path.join(target_dirpath, filename)
    raise InvalidParameterError(
        'Unsupported splits_name: {}'.format(splits_name))
def marshall_annosets(annosets, output_filepath, excluded_frames,
                      excluded_sentences, excluded_annosets):
    """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML.

    Args
    ----
    annosets: a list of annosets to marshall.
    output_filepath: the absolute path to the output .xml file
    excluded_frames: a list of frame #id to exclude from the output
    excluded_sentences: a list of sentence #id to exclude from the output
    excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raises
    ------
    InvalidParameterError: if ``annosets`` is falsy.
    """
    logger.info('Marshalling pyfn.AnnotationSet objects to SEMEVAL XML...')
    if not annosets:
        raise InvalidParameterError('Input pyfn.AnnotationSet list is empty')
    logger.info('Saving output to {}'.format(output_filepath))
    # Forwarded positionally, in the order _marshall_annosets expects.
    exclusions = (excluded_frames, excluded_sentences, excluded_annosets)
    _marshall_annosets(annosets, output_filepath, *exclusions)
def _marshall_annosets(annosets, output_filepath, excluded_frames,
                       excluded_sentences, excluded_annosets):
    """Write annosets to ``output_filepath`` as a SEMEVAL-style XML corpus.

    The document is a ``corpus`` root holding a single
    documents/document/paragraphs/paragraph/sentences chain. Annosets are
    taken in the order returned by ``f_utils.filter_and_sort_annosets``
    (called with an empty filtering option list); a new sentence element is
    opened whenever the sentence text differs from that of the previous
    annoset.

    Raises
    ------
    InvalidParameterError: if ``annosets`` is falsy.
    """
    if not annosets:
        raise InvalidParameterError('No input annosets to marshall. Check '
                                    'input parameters and try again.')
    corpus_tag = etree.Element('corpus')
    creation_stamp = datetime.datetime.now(pytz.utc).strftime(
        '%a %b %d %H:%M:%S %Z %Y')
    corpus_tag.set('XMLCreated', creation_stamp)
    # Build the fixed wrapper chain; the innermost element hosts sentences.
    sentences_tag = corpus_tag
    for wrapper_name in ('documents', 'document', 'paragraphs', 'paragraph',
                         'sentences'):
        sentences_tag = etree.SubElement(sentences_tag, wrapper_name)
    previous_text = ''
    next_sent_id = 0  # to match the semval numbering of sentences
    next_annoset_id = 1
    next_layer_id = 1
    next_label_id = 1
    ordered_annosets = f_utils.filter_and_sort_annosets(
        annosets, [], excluded_frames, excluded_sentences, excluded_annosets)
    for annoset in ordered_annosets:
        # NOTE(review): sentences are told apart by text only; a first
        # annoset whose sentence text is '' would skip this branch and leave
        # annosets_tag unbound.
        if annoset.sentence.text != previous_text:
            sentence_tag = _get_sentence_tag(annoset, sentences_tag,
                                             next_sent_id)
            next_sent_id += 1
            previous_text = annoset.sentence.text
            annosets_tag = etree.SubElement(sentence_tag, 'annotationSets')
        annoset_tag = _get_annoset_tag(annosets_tag, annoset,
                                       next_annoset_id)
        next_annoset_id += 1
        layers_tag = etree.SubElement(annoset_tag, 'layers')
        # Label ids run across the whole document: each helper returns the
        # next id to use.
        next_label_id = _add_target_labels(layers_tag, next_layer_id,
                                           annoset, next_label_id)
        next_layer_id += 1
        if _has_fe_labels(annoset):
            next_label_id = _add_fe_labels(layers_tag, next_layer_id,
                                           annoset, next_label_id)
            next_layer_id += 1
    document_tree = etree.ElementTree(corpus_tag)
    document_tree.write(output_filepath, encoding='UTF-8',
                        xml_declaration=True, pretty_print=True)
def __init__(self, config):
    """Initialise the instance from a plain dict.

    ``config`` is passed through ``self._freeze`` and the result is handed
    to the parent class constructor.

    Raises
    ------
    InvalidParameterError: if ``config`` is not a dict instance.
    """
    if not isinstance(config, dict):
        raise InvalidParameterError(
            'ImmutableConfig requires instance of dict as input parameter')
    frozen_config = self._freeze(config)
    super().__init__(frozen_config)
def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                           output_sentences, excluded_frames,
                           excluded_sentences, excluded_annosets):
    """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files.

    For the dev and test splits two files are written: a .bios.semeval file
    marshalled without filtering options and without FE annotation, and a
    .bios file marshalled with ``filtering_options`` and FE annotation. For
    the train splits only the .bios file is written.

    Args
    ----
    annosets_dict: a splits to annosets dictionary (as generated by the
        framenet unmarshaller).
    target_dirpath: the absolute path to the target directory where to
        save the output file(s)
    filtering_options: a list of options to pass to the pyfn.utils.filter.
        ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes',
        'non_breaking_spaces')
    output_sentences: True or False. Whether or not to also output a
        .sentences file listing all sentences (string), one per line.
    excluded_frames: a list of frame #id to exclude from the output
    excluded_sentences: a list of sentence #id to exclude from the output
    excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raises
    ------
    InvalidParameterError: if a key of ``annosets_dict`` is not 'train',
        'dev' or 'test'.
    """
    for splits_name, annosets in annosets_dict.items():
        bios_filepath = files_utils.get_bios_filepath(target_dirpath,
                                                      splits_name)
        bios_semeval_filepath = files_utils.get_bios_semeval_filepath(
            target_dirpath, splits_name)
        sent_filepath = files_utils.get_sent_filepath(target_dirpath,
                                                      splits_name)
        sent_dict = {}
        if splits_name not in ['train', 'dev', 'test']:
            raise InvalidParameterError(
                'Invalid splits_name: {}'.format(splits_name))
        if splits_name in ('dev', 'test'):
            # The annosets are consumed twice (once per output file), so
            # duplicate the iterable before the first pass exhausts it.
            semeval_annosets, annosets = itertools.tee(annosets, 2)
            logger.info('Marshalling splits:pyfn.AnnotationSet dict to '
                        '.bios.semeval for {} splits with [] filtering '
                        'options...'.format(splits_name))
            _marshall_bios(
                semeval_annosets, [], sent_dict,  # No special filtering
                bios_semeval_filepath, excluded_frames, excluded_sentences,
                excluded_annosets, with_fe_anno=False)
        logger.info('Marshalling splits:pyfn.AnnotationSet dict to '
                    '.bios for {} splits with {} filtering '
                    'options...'.format(splits_name, filtering_options))
        # Exclusion lists must be passed as (frames, sentences, annosets),
        # the order used by every other _marshall_bios call; the dev/test
        # call previously swapped sentences and annosets.
        _marshall_bios(annosets, filtering_options, sent_dict,
                       bios_filepath, excluded_frames, excluded_sentences,
                       excluded_annosets, with_fe_anno=True)
        # print out sentences file
        if output_sentences:
            marsh_utils.marshall_sent_dict(sent_dict, sent_filepath)