示例#1
0
def _get_fe_coretype(fe_label, vus):
    """Return the coreType of the frame element designated by a label.

    The first two characters of fe_label are stripped (presumably a
    BIO-style prefix such as `B-` -- confirm against callers) and the
    remainder is compared to the `fe.name` of each valence unit, in order.

    Args
    ----
        fe_label: the frame element label. Must not be the 'O' label.
        vus: a non-empty list of valence units exposing `fe.name` and
        `fe.coretype`.

    Return
    ------
        The `fe.coretype` of the first valence unit whose FE name matches.

    Raise
    -----
        InvalidParameterError: if fe_label is 'O' or if vus is empty.
        Exception: if no valence unit carries the requested FE name.
    """
    if fe_label == 'O':
        raise InvalidParameterError(
            'Unspecified FE label: {}'.format(fe_label))
    if not vus:
        raise InvalidParameterError('Input ValenceUnit list is empty')
    fe_name = fe_label[2:]
    # A private sentinel distinguishes "no match" from a falsy coretype.
    not_found = object()
    coretype = next(
        (vu.fe.coretype for vu in vus if vu.fe.name == fe_name), not_found)
    if coretype is not not_found:
        return coretype
    raise Exception('Could not output FE coreType: no matching FE name \'{}\' '
                    'found in ValenceUnit list {}'.format(fe_name, vus))
示例#2
0
文件: main.py 项目: akb89/pyfn
def _convert(args):
    """Convert annotated data from a source format to a target format.

    The data found at args.source_path is unmarshalled according to
    args.source_format ('fnxml', 'bios' or 'semafor') and marshalled to
    args.target_path according to args.target_format ('bios', 'semeval' or
    'semafor').

    The 'fnxml' source produces a splits-to-annosets dict, which is what the
    'bios' and 'semafor' targets consume. The 'bios' and 'semafor' sources
    produce a single collection of annosets, which only the 'semeval' target
    can consume. An unrecognised target format is silently ignored.

    Args
    ----
        args: the parsed command-line arguments. The attributes read here
        are source_format, target_format, source_path, target_path, splits,
        with_exemplars, sent, filter, output_sentences, excluded_frames,
        excluded_sentences and excluded_annosets.

    Raise
    -----
        InvalidParameterError: if source and target formats (or paths) are
        identical, if the --sent parameter is missing for a 'bios' or
        'semafor' source, or if the requested source/target combination is
        not supported.
    """
    if args.source_format == args.target_format:
        raise InvalidParameterError(
            'Source and Target formats are the same! Please specify different '
            'source/target formats')
    if args.source_path == args.target_path:
        raise InvalidParameterError(
            'Source and Target paths are the same! Please specify different '
            'source/target paths')
    from_fnxml = args.source_format == 'fnxml'
    from_annosets = args.source_format in ('bios', 'semafor')
    # Starting unmarshalling
    if from_fnxml:
        annosets_dict = fnxml.get_annosets_dict(args.source_path, args.splits,
                                                args.with_exemplars)
    elif from_annosets:
        if args.sent == '__undefined__':
            raise InvalidParameterError(
                'Unspecified sentence file. For {} unmarshalling you need '
                'to specify the --sent parameter pointing at the '
                '.sentences file absolute filepath'.format(
                    args.source_format))
        unmarshaller = biosu if args.source_format == 'bios' else semaforu
        annosets = unmarshaller.unmarshall_annosets(args.source_path,
                                                    args.sent)
    # Starting marshalling
    if args.target_format in ('bios', 'semafor'):
        if not from_fnxml:
            # Fail before touching the filesystem: these targets need the
            # splits dict that only the fnxml unmarshaller provides.
            raise InvalidParameterError(
                'Unsupported conversion from {} to {}: the {} target format '
                'requires a fnxml source'.format(args.source_format,
                                                 args.target_format,
                                                 args.target_format))
        os.makedirs(args.target_path, exist_ok=True)
        marshaller = biosm if args.target_format == 'bios' else semaform
        marshaller.marshall_annosets_dict(annosets_dict, args.target_path,
                                          args.filter, args.output_sentences,
                                          args.excluded_frames,
                                          args.excluded_sentences,
                                          args.excluded_annosets)
    elif args.target_format == 'semeval':
        if from_fnxml:
            # Only the requested splits are exported, to <splits>.gold.xml
            annosets = annosets_dict[args.splits]
            output_filepath = os.path.join(
                args.target_path, '{}.gold.xml'.format(args.splits))
        elif from_annosets:
            output_filepath = args.target_path
        else:
            raise InvalidParameterError(
                'Unsupported source format for semeval conversion: {}'.format(
                    args.source_format))
        semeval.marshall_annosets(annosets, output_filepath,
                                  args.excluded_frames,
                                  args.excluded_sentences,
                                  args.excluded_annosets)
示例#3
0
文件: framenet.py 项目: akb89/pyfn
def _get_annosets_dict_from_fn_xml(fn_splits_dirpath, splits, with_exemplars):
    """Return a {'test', 'dev', 'train'} dict of annosets extracted from XML.

    Splits are cumulative: 'test' extracts the test splits only, 'dev'
    extracts test and dev, and 'train' extracts test, dev and train. Splits
    which are not extracted are mapped to an empty list.

    Args
    ----
        fn_splits_dirpath: the path to the directory containing the
        'test', 'dev' and 'train' sub-directories as well as the frame XML
        files used to build the frame element dict.
        splits: one of 'train', 'dev' or 'test'.
        with_exemplars: forwarded as-is to extract_annosets.

    Raise
    -----
        InvalidParameterError: if splits is not 'train', 'dev' or 'test'.
    """
    # Ordered from the narrowest to the widest request: asking for a given
    # splits also extracts every splits listed before it.
    ordered_splits = ('test', 'dev', 'train')
    if splits not in ordered_splits:
        raise InvalidParameterError(
            'Invalid splits name `{}`. Should be `train`, `dev` or `test`'.
            format(splits))
    fe_dict = _get_fe_dict(
        xml_utils.get_xml_filepaths(fn_splits_dirpath, 'frame'))
    last_extracted = ordered_splits.index(splits)
    annosets_dict = {}
    for position, name in enumerate(ordered_splits):
        if position > last_extracted:
            annosets_dict[name] = []
            continue
        annosets_dict[name] = extract_annosets(
            os.path.join(fn_splits_dirpath, name),
            with_fulltexts=True,
            with_exemplars=with_exemplars,
            fe_dict=fe_dict,
            flatten=True)
    return annosets_dict
示例#4
0
文件: semafor.py 项目: akb89/pyfn
def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                           output_sentences, excluded_frames,
                           excluded_sentences, excluded_annosets):
    """Convert a dict of {splits:pyfn.AnnotationSet} to SEMAFOR splits files.

    The train splits are marshalled in train mode with the specified
    filtering options. The dev/test splits are marshalled with train mode
    disabled and without any filtering option.

    Args
    ----
        annosets_dict: a splits to annosets dictionary (as generated by
        the framenet unmarshaller).
        target_dirpath: the absolute path to the target directory where to
        save the output file(s)
        filtering_options: a list of options to pass to the pyfn.utils.filter.
        ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes',
        'non_breaking_spaces')
        output_sentences: True or False. Whether or not to also output a
        .sentences file listing all sentences (string), one per line.
        excluded_frames: a list of frame #id to exclude from the output
        excluded_sentences: a list of sentence #id to exclude from the output
        excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raise
    -----
        InvalidParameterError: if a key of annosets_dict is not 'train',
        'dev' or 'test'.
    """
    for splits_name, annosets in annosets_dict.items():
        logger.info(
            'Marshalling {} splits to semafor format'.format(splits_name))
        semafor_filepath = files_utils.get_semafor_filepath(
            target_dirpath, splits_name)
        sent_filepath = files_utils.get_sent_filepath(target_dirpath,
                                                      splits_name)
        if splits_name not in ['train', 'dev', 'test']:
            raise InvalidParameterError(
                'Unsupported splits_name: {}'.format(splits_name))
        is_train = splits_name == 'train'
        # No special filtering on dev/test
        options = filtering_options if is_train else []
        sent_dict = {}
        _marshall_semafor(annosets,
                          options,
                          sent_dict,
                          semafor_filepath,
                          excluded_frames,
                          excluded_sentences,
                          excluded_annosets,
                          train_mode=is_train)
        if not output_sentences:
            continue
        # print out sentences file
        marsh_utils.marshall_sent_dict(sent_dict, sent_filepath)
示例#5
0
文件: semafor.py 项目: akb89/pyfn
def _get_min_index(indexes):
    """Return the smallest start/end value found in a list of index pairs.

    Args
    ----
        indexes: an iterable of (start, end) tuples.

    Return
    ------
        The minimum over all start and end values, or -1 if indexes is empty.

    Raise
    -----
        InvalidParameterError: if any start or end value is -1 (undefined).
    """
    boundaries = []
    for (start, end) in indexes:
        if -1 in (start, end):
            raise InvalidParameterError('Target start/end indexes are '
                                        'undefined')
        boundaries.extend((start, end))
    return min(boundaries, default=-1)
示例#6
0
def get_semafor_filepath(target_dirpath, splits_name):
    """Return the path to the SEMAFOR file of the specified splits.

    If splits_name is 'train', function will return train.frame.elements.
    If splits_name is 'dev' or 'test', function will return dev.frames or
    test.frames respectively.
    The filename is joined to the specified target_dirpath.

    Raise
    -----
        InvalidParameterError: if splits_name is not 'train', 'dev' or 'test'.
    """
    if splits_name == 'train':
        filename = 'train.frame.elements'
    elif splits_name in ('dev', 'test'):
        filename = '{}.frames'.format(splits_name)
    else:
        raise InvalidParameterError(
            'Unsupported splits_name: {}'.format(splits_name))
    return os.path.join(target_dirpath, filename)
示例#7
0
def marshall_annosets(annosets, output_filepath, excluded_frames,
                      excluded_sentences, excluded_annosets):
    """Marshall a list of pyfn.AnnotationSet objects to SEMEVAL XML.

    Args
    ----
        annosets: a list of annosets to marshall. Must not be empty.
        output_filepath: the absolute path to the output .xml file
        excluded_frames: a list of frame #id to exclude from the output
        excluded_sentences: a list of sentence #id to exclude from the output
        excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raise
    -----
        InvalidParameterError: if annosets is empty.
    """
    logger.info('Marshalling pyfn.AnnotationSet objects to SEMEVAL XML...')
    if annosets:
        logger.info('Saving output to {}'.format(output_filepath))
        _marshall_annosets(annosets, output_filepath, excluded_frames,
                           excluded_sentences, excluded_annosets)
    else:
        raise InvalidParameterError('Input pyfn.AnnotationSet list is empty')
示例#8
0
def _marshall_annosets(annosets, output_filepath, excluded_frames,
                       excluded_sentences, excluded_annosets):
    """Write annosets to output_filepath as a SEMEVAL XML corpus.

    All sentences are stored under a single
    corpus/documents/document/paragraphs/paragraph/sentences element chain.
    A new sentence element is opened each time the sentence text of the
    current annoset differs from that of the previous one. Each annoset
    receives a target layer and, when it has frame element labels, an
    additional frame element layer.

    Args
    ----
        annosets: the annosets to marshall. Must not be empty.
        output_filepath: the path to the output .xml file
        excluded_frames: a list of frame #id to exclude from the output
        excluded_sentences: a list of sentence #id to exclude from the output
        excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raise
    -----
        InvalidParameterError: if annosets is empty.
    """
    if not annosets:
        raise InvalidParameterError('No input annosets to marshall. Check '
                                    'input parameters and try again.')
    root = etree.Element('corpus')
    creation_time = datetime.datetime.now(pytz.utc)
    root.set('XMLCreated', creation_time.strftime('%a %b %d %H:%M:%S %Z %Y'))
    # Build the fixed chain of nested elements leading down to <sentences>
    sentences_tag = root
    for tag_name in ('documents', 'document', 'paragraphs', 'paragraph',
                     'sentences'):
        sentences_tag = etree.SubElement(sentences_tag, tag_name)
    previous_text = ''
    next_sent_id = 0  # sentence numbering starts at 0, unlike other ids
    next_layer_id = 1
    next_label_id = 1
    selected_annosets = f_utils.filter_and_sort_annosets(annosets, [],
                                                         excluded_frames,
                                                         excluded_sentences,
                                                         excluded_annosets)
    for annoset_id, annoset in enumerate(selected_annosets, start=1):
        if annoset.sentence.text != previous_text:
            # Sentence text changed: open a new sentence element
            sentence_tag = _get_sentence_tag(annoset, sentences_tag,
                                             next_sent_id)
            next_sent_id += 1
            previous_text = annoset.sentence.text
            annosets_tag = etree.SubElement(sentence_tag, 'annotationSets')
        annoset_tag = _get_annoset_tag(annosets_tag, annoset, annoset_id)
        layers_tag = etree.SubElement(annoset_tag, 'layers')
        next_label_id = _add_target_labels(layers_tag, next_layer_id, annoset,
                                           next_label_id)
        next_layer_id += 1
        if _has_fe_labels(annoset):
            next_label_id = _add_fe_labels(layers_tag, next_layer_id, annoset,
                                           next_label_id)
            next_layer_id += 1
    etree.ElementTree(root).write(output_filepath, encoding='UTF-8',
                                  xml_declaration=True, pretty_print=True)
示例#9
0
 def __init__(self, config):
     """Initialize the instance from a frozen version of a dict.

     Args
     ----
         config: the dict to freeze (via self._freeze) and pass to the
         parent constructor.

     Raise
     -----
         InvalidParameterError: if config is not an instance of dict.
     """
     if not isinstance(config, dict):
         error_message = ('ImmutableConfig requires instance of dict as '
                          'input parameter')
         raise InvalidParameterError(error_message)
     frozen_config = self._freeze(config)
     super().__init__(frozen_config)
示例#10
0
def marshall_annosets_dict(annosets_dict, target_dirpath, filtering_options,
                           output_sentences, excluded_frames,
                           excluded_sentences, excluded_annosets):
    """Convert a dict of {splits:pyfn.AnnotationSet} to BIOS splits files.

    The train splits are marshalled once, to a .bios file, with frame
    element annotation and the specified filtering options.
    The dev/test splits are marshalled twice: first to a .bios.semeval file
    without frame element annotation and without filtering options, then to
    a .bios file with frame element annotation and the specified filtering
    options. Both passes fill the same sentence dict.

    Args
    ----
        annosets_dict: a splits to annosets dictionary (as generated by
        the framenet unmarshaller).
        target_dirpath: the absolute path to the target directory where to
        save the output file(s)
        filtering_options: a list of options to pass to the pyfn.utils.filter.
        ('overlap_fes', 'disc_fes', 'disc_targets', 'no_fes',
        'non_breaking_spaces')
        output_sentences: True or False. Whether or not to also output a
        .sentences file listing all sentences (string), one per line.
        excluded_frames: a list of frame #id to exclude from the output
        excluded_sentences: a list of sentence #id to exclude from the output
        excluded_annosets: a list of annotationset #id to exclude from the
        output

    Raise
    -----
        InvalidParameterError: if a key of annosets_dict is not 'train',
        'dev' or 'test'.
    """
    for splits_name, annosets in annosets_dict.items():
        bios_filepath = files_utils.get_bios_filepath(target_dirpath,
                                                      splits_name)
        bios_semeval_filepath = files_utils.get_bios_semeval_filepath(
            target_dirpath, splits_name)
        sent_filepath = files_utils.get_sent_filepath(target_dirpath,
                                                      splits_name)
        sent_dict = {}
        if splits_name not in ['train', 'dev', 'test']:
            raise InvalidParameterError(
                'Invalid splits_name: {}'.format(splits_name))
        if splits_name in ('dev', 'test'):
            # The annosets are consumed twice below: duplicate the iterator
            # so that the second pass does not find it exhausted.
            annosets, _annosets = itertools.tee(annosets, 2)
            logger.info('Marshalling splits:pyfn.AnnotationSet dict to '
                        '.bios.semeval for {} splits with [] filtering '
                        'options...'.format(splits_name))
            _marshall_bios(
                annosets,
                [],  # No special filtering on dev/test
                sent_dict,
                bios_semeval_filepath,
                excluded_frames,
                excluded_sentences,
                excluded_annosets,
                with_fe_anno=False)
            logger.info('Marshalling splits:pyfn.AnnotationSet dict to '
                        '.bios for {} splits with {} filtering '
                        'options...'.format(splits_name, filtering_options))
            # excluded_sentences must come before excluded_annosets, as in
            # every other _marshall_bios call of this function.
            _marshall_bios(_annosets,
                           filtering_options,
                           sent_dict,
                           bios_filepath,
                           excluded_frames,
                           excluded_sentences,
                           excluded_annosets,
                           with_fe_anno=True)
        elif splits_name == 'train':
            logger.info('Marshalling splits:pyfn.AnnotationSet dict to '
                        '.bios for {} splits with {} filtering '
                        'options...'.format(splits_name, filtering_options))
            _marshall_bios(annosets,
                           filtering_options,
                           sent_dict,
                           bios_filepath,
                           excluded_frames,
                           excluded_sentences,
                           excluded_annosets,
                           with_fe_anno=True)
        # print out sentences file
        if output_sentences:
            marsh_utils.marshall_sent_dict(sent_dict, sent_filepath)