Пример #1
0
def build_lexeme(template: Template, form_data: werkzeug.datastructures.MultiDict) -> Lexeme:
    """Assemble the lexeme data described by the submitted form data.

    Each submitted ``form_representation`` value is paired, in order, with
    the template form at the same position.  Empty values are skipped;
    non-empty ones are split on ``/`` and every variant becomes its own form
    (via ``build_form``).  An empty variant inside a non-empty value
    (e.g. ``a//b``) is rejected with ``flask.abort(400)``.

    If the form data carries a non-empty ``lexeme_id``, the result refers to
    that existing lexeme and only gets the statements that the template
    match reports as missing.  Otherwise the result describes a new lexeme
    with lemmas, language, lexical category and the template's statements;
    ``flask.abort(400)`` is called if no lemmas could be built.
    """
    language_code = template['language_code']
    submitted_representations = form_data.getlist('form_representation')

    built_forms = []
    for representation, template_form in zip(submitted_representations, template['forms']):
        if not representation:
            # this form was left blank, nothing to build for it
            continue
        for variant in representation.split('/'):
            if not variant:
                flask.abort(400)
            built_forms.append(build_form(template_form, language_code, variant))

    lexeme_data = cast(Lexeme, {
        'type': 'lexeme',
        'forms': built_forms,
    })

    lexeme_id = form_data.get('lexeme_id', '')
    if not lexeme_id:
        # new lexeme: needs lemmas, language, lexical category and statements
        lemmas = build_lemmas(template, form_data)
        if lemmas is None:
            flask.abort(400)
        lexeme_data['lemmas'] = lemmas
        lexeme_data['language'] = template['language_item_id']
        lexeme_data['lexicalCategory'] = template['lexical_category_item_id']
        lexeme_data['claims'] = template.get('statements', {})
        return lexeme_data

    # existing lexeme: only add the statements it does not have yet
    lexeme_data['id'] = lexeme_id
    wiki = 'test' if 'test' in template else 'www'
    existing_lexeme_data = get_lexeme_data(lexeme_id, wiki)
    match = match_template_to_lexeme_data(template, existing_lexeme_data)
    # TODO warn if match['conflicting_statements']?
    lexeme_data['claims'] = match['missing_statements']
    return lexeme_data
Пример #2
0
def get_lemma(form_data: werkzeug.datastructures.MultiDict) -> Optional[str]:
    """Return the lemma for the lexeme described by the given form data.

    The lemma is the first nonempty form representation variant, looking at
    the submitted ``form_representation`` values in order and splitting each
    of them on ``/``.  (Usually that is the first variant of the first form,
    but in advanced mode any form may be omitted, including the first one,
    which can be useful for e.g. pluralia tantum.)

    Returns None if there is no nonempty variant at all.

    This logic is duplicated in findDuplicates.js::getLemma –
    keep the two in sync!"""
    all_variants = (
        variant
        for representation in form_data.getlist('form_representation')
        for variant in representation.split('/')
    )
    # split() only yields strings, so truthiness is the same as “not empty”
    return next((variant for variant in all_variants if variant), None)
Пример #3
0
def add_form_data_to_template(
        form_data: werkzeug.datastructures.MultiDict,
        template,  # no static type – some vague kind of Template
        overwrite: bool = True,
):  # no static return type – some vague kind of Template
    """Return a deep copy of the template with the form data filled in.

    The submitted ``form_representation`` values are paired, in order, with
    the template's forms and stored as each form's ``value``.  With
    ``overwrite=False``, a form that already has a truthy ``value`` keeps it.
    The ``lexeme_id``, ``generated_via`` and ``target_hash`` entries of the
    form data are copied onto the template when present.

    The template passed in is left unmodified.
    """
    filled_template = copy.deepcopy(template)
    representations = form_data.getlist('form_representation')
    for representation, form in zip(representations, filled_template['forms']):
        if not overwrite and form.get('value'):
            # keep the value the form already has
            continue
        form['value'] = representation
    for key in ('lexeme_id', 'generated_via', 'target_hash'):
        if key in form_data:
            filled_template[key] = form_data[key]
    return filled_template
Пример #4
0
def if_has_duplicates_redirect(
        template: Template,
        advanced: bool,
        form_data: werkzeug.datastructures.MultiDict,
) -> Optional[RRV]:
    """Render the template page with a duplicates listing, if there are any.

    No duplicate search is done (and None is returned) when the form data
    contains a ``no_duplicate`` key or a non-empty ``lexeme_id``.

    Otherwise ``find_duplicates`` is consulted: if it reports anything,
    ``template.html`` is rendered with the submitted form data filled into
    the template, together with the duplicates and the submitted form
    representations.  If it reports nothing, None is returned.
    """
    if 'no_duplicate' in form_data or ('lexeme_id' in form_data and form_data['lexeme_id']):
        return None

    duplicates = find_duplicates(template, form_data)
    if not duplicates:
        return None

    return flask.render_template(
        'template.html',
        template=add_form_data_to_template(form_data, template),
        advanced=advanced,
        duplicates=duplicates,
        submitted_form_representations=form_data.getlist('form_representation'),
    )
Пример #5
0
def update_lexeme(
        lexeme_data: Lexeme,
        template: BoundTemplate,
        form_data: werkzeug.datastructures.MultiDict,
        representation_language_code: str,
        missing_statements: Optional[Statements] = None,
) -> Lexeme:
    """Apply the submitted form data to (a deep copy of) existing lexeme data.

    The submitted ``form_representation`` values are paired, in order, with
    the template's forms.  Each value is split on ``/`` into variants, and
    for each template form the variants are reconciled with the lexeme forms
    already matched to it (``lexeme_forms``) in several passes:

    1. Variants that look like a form ID (``L…-F…``) reference an existing
       form.  That form must be listed in the template's
       ``unmatched_lexeme_forms`` or ``ambiguous_lexeme_forms`` (it is
       removed from that list); it then receives the template form's missing
       grammatical features and statements.
    2. A lexeme form whose representation in the given language equals one
       of the variants is taken as matched.
    3. Remaining lexeme forms get their representation overwritten with the
       remaining variants, pairwise in order.
    4. Variants still left over become new forms; lexeme forms still left
       over have their representation (or the whole form, if that is its
       only representation) flagged with a ``remove`` key.

    Afterwards, ``missing_statements`` are appended to the lexeme's claims,
    and the lemma in the given language is set from the first lexeme form of
    the first template form (or dropped if that form has no representation
    in that language).

    ``lexeme_data`` itself is not modified (the work happens on a deep
    copy), but ``template`` is modified in place: forms are removed from its
    ``unmatched_lexeme_forms`` / ``ambiguous_lexeme_forms`` lists and newly
    built forms are appended to the template forms' ``lexeme_forms``.

    Calls ``flask.abort(400)`` if a referenced form is neither unmatched nor
    ambiguous, or if a referenced form has statements conflicting with the
    template form.

    NOTE(review): no ``return`` statement is visible in this excerpt even
    though the signature declares ``-> Lexeme``; presumably the edited copy
    is returned right after the lemma handling – confirm against the full
    file.
    """
    lexeme_data = copy.deepcopy(lexeme_data)
    lexeme_data['base_revision_id'] = template['lexeme_revision']

    for form_data_representation, template_form in zip(form_data.getlist('form_representation'), template['forms']):
        template_form = cast(MatchedTemplateForm, template_form)
        form_data_representation_variants = form_data_representation.split('/')
        # an empty submission splits into [''], which means “no variants at all”
        if form_data_representation_variants == ['']:
            form_data_representation_variants = []
        # work on a copy of the list, so that the removals below don’t change the template form’s own list
        lexeme_forms = template_form.get('lexeme_forms', []).copy()
        # process “representations” that actually reference existing forms first
        for form_data_representation_variant in reversed(form_data_representation_variants):  # reversed so that the remove within the loop doesn’t disturb the iteration
            # only variants that are a form ID (e.g. L123-F4) are handled in this pass
            if not re.match(r'^L[1-9][0-9]*-F[1-9][0-9]*$', form_data_representation_variant):
                continue
            lexeme_form = find_form(lexeme_data, form_id=form_data_representation_variant)
            # the referenced form is now being matched, so take it off the template’s unmatched/ambiguous list
            if lexeme_form in template.get('unmatched_lexeme_forms', []):
                template['unmatched_lexeme_forms'].remove(lexeme_form)
            elif lexeme_form in template.get('ambiguous_lexeme_forms', []):
                template['ambiguous_lexeme_forms'].remove(lexeme_form)
            else:
                flask.abort(400, 'Form %s is neither unmatched nor ambiguous, refusing to re-match it to a different template form' % form_data_representation_variant)
            # add missing grammatical features
            for grammatical_feature_item_id in template_form['grammatical_features_item_ids']:
                if grammatical_feature_item_id not in lexeme_form['grammaticalFeatures']:
                    lexeme_form['grammaticalFeatures'].append(grammatical_feature_item_id)
            # add missing statements (and complain about conflicting ones)
            form_matched_statements, form_missing_statements, form_conflicting_statements = match_template_entity_to_lexeme_entity('test' in template, template_form, lexeme_form)
            if form_conflicting_statements:
                flask.abort(400, 'Conflicting statements!')  # TODO better error reporting
            for property_id, statements in form_missing_statements.items():
                lexeme_form.setdefault('claims', {}).setdefault(property_id, []).extend(statements)
            # the form ID has been dealt with, it must not be treated as a representation below
            form_data_representation_variants.remove(form_data_representation_variant)
        # find and remove matching forms (usually no modification necessary)
        # NOTE(review): the break below leaves the loop after the first match,
        # so at most one lexeme form is matched by value per template form – confirm that this is intended
        for lexeme_form in reversed(lexeme_forms):  # reversed so that the remove within the loop doesn’t disturb the iteration
            if representation_language_code not in lexeme_form['representations']:
                continue
            lexeme_form_representation = lexeme_form['representations'][representation_language_code]
            if lexeme_form_representation['value'] in form_data_representation_variants:
                lexeme_forms.remove(lexeme_form)
                form_data_representation_variants.remove(lexeme_form_representation['value'])
                if template_form.get('grammatical_features_item_ids_optional', set()):
                    # the lexeme form may be missing optional grammatical features, add them
                    lexeme_form = find_form(lexeme_data, lexeme_form['id'])
                    for grammatical_feature_item_id in template_form['grammatical_features_item_ids']:
                        if grammatical_feature_item_id not in lexeme_form['grammaticalFeatures']:
                            assert grammatical_feature_item_id in template_form['grammatical_features_item_ids_optional'], \
                                'Only optional grammatical features may be missing from a matched form'
                            lexeme_form['grammaticalFeatures'].append(grammatical_feature_item_id)
                break
        # overwrite remaining lexeme forms with form data as long as we have both
        # currently simply in order, cleverer matching via edit distance may be possible but likely not necessary
        overwritten_forms = 0
        for form_data_representation_variant, lexeme_form in zip(form_data_representation_variants, lexeme_forms):
            # look up the form inside lexeme_data, so that the edit ends up in the lexeme data being built
            lexeme_form = find_form(lexeme_data, lexeme_form['id'])
            lexeme_form_representation = lexeme_form['representations']\
                .setdefault(representation_language_code, {
                    'language': representation_language_code,
                    'value': '',  # overridden immediately below
                })
            assert form_data_representation_variant, 'Representation cannot be empty'
            lexeme_form_representation['value'] = form_data_representation_variant
            overwritten_forms += 1
        # drop the pairs that were just handled from both lists
        form_data_representation_variants = form_data_representation_variants[overwritten_forms:]
        lexeme_forms = lexeme_forms[overwritten_forms:]
        # add remaining form data as new OR delete remaining lexeme form representations or forms
        assert not (form_data_representation_variants and lexeme_forms), 'After previous loop, at least one list must be exhausted'
        for form_data_representation_variant in form_data_representation_variants:
            assert form_data_representation_variant, 'Representation cannot be empty'
            lexeme_form = build_form(template_form, representation_language_code, form_data_representation_variant)
            lexeme_data['forms'].append(lexeme_form)
            template_form.setdefault('lexeme_forms', []).append(lexeme_form)  # so it can be found as first_form below
        for lexeme_form in lexeme_forms:
            lexeme_form = find_form(lexeme_data, lexeme_form['id'])
            if representation_language_code in lexeme_form['representations']:
                if len(lexeme_form['representations']) == 1:
                    lexeme_form['remove'] = ''  # remove whole form
                else:
                    lexeme_form['representations'][representation_language_code]['remove'] = ''  # remove only this representation
            # otherwise it’s an unrelated form that wasn’t shown to begin with, leave it alone

    # add the lexeme-level statements that the caller found to be missing
    for property_id, statements in (missing_statements or {}).items():
        lexeme_data.setdefault('claims', {}).setdefault(property_id, []).extend(statements)

    # update the lemma from the first lexeme form of the first template form, if there is one
    first_form = next(iter(cast(MatchedTemplateForm, template['forms'][0]).get('lexeme_forms', [])), None)
    if first_form:
        if first_form_id := first_form.get('id'):
            first_form = find_form(lexeme_data, first_form_id)  # find edited version
            assert first_form is not None
        else:
            # it’s a new form, first_form is already the edited version
            pass
        if representation_language_code in first_form['representations']:
            lexeme_data['lemmas'][representation_language_code] = first_form['representations'][representation_language_code]
        else:
            lexeme_data['lemmas'].pop(representation_language_code, None)