Пример #1
0
def compute_perplexity(**kwargs):
    """Evaluate a morpheme LM by attempting to compute its perplexity and
    record the outcome of the attempt on the model.

    :param kwargs['settings']: settings passed to
        ``get_dbsession_from_settings`` to obtain a session factory.
    :param kwargs['morpheme_language_model_id']: primary key of the
        ``MorphemeLanguageModel`` to evaluate.
    :param kwargs['timeout']: forwarded to the model's
        ``compute_perplexity`` method.
    :param kwargs['user_id']: stored as the model's ``modifier_id``.
    :returns: ``None``; the side effect is to set ``perplexity``,
        ``perplexity_computed``, ``perplexity_attempt``, ``modifier_id`` and
        ``datetime_modified`` on the model and commit the session.
    """
    # Acquire the session before entering the ``try`` block: if acquisition
    # fails there is nothing to commit or close, and referencing an unbound
    # ``dbsession`` in ``finally`` would mask the real error.
    dbsession = get_dbsession_from_settings(kwargs['settings'])()
    try:
        langmod = dbsession.query(
            old_models.MorphemeLanguageModel).get(
                kwargs['morpheme_language_model_id'])
        timeout = kwargs['timeout']
        iterations = 5
        try:
            langmod.perplexity = langmod.compute_perplexity(timeout, iterations)
        except Exception as error:
            # Best effort: a failed computation is logged and recorded as
            # "no perplexity" rather than aborting the whole task.
            LOGGER.error('Exception when calling `compute_perplexity` on'
                         ' language model: %s %s', error.__class__.__name__,
                         error)
            langmod.perplexity = None
        langmod.perplexity_computed = langmod.perplexity is not None
        # A fresh UUID is stored on every attempt, successful or not.
        langmod.perplexity_attempt = str(uuid4())
        langmod.modifier_id = kwargs['user_id']
        langmod.datetime_modified = h.now()
    finally:
        # Always release the session, even when the commit itself fails.
        try:
            dbsession.commit()
        finally:
            dbsession.close()
Пример #2
0
def generate_and_compile_morphology(**kwargs):
    """Generate a foma script for a morphology and (optionally) compile it.

    :param kwargs['settings']: settings passed to
        ``get_dbsession_from_settings`` to obtain a session factory.
    :param int kwargs['morphology_id']: id of a morphology.
    :param bool kwargs['compile']: if True (the default when the key is
        absent), the script will be generated *and* compiled.
    :param int kwargs['user_id']: id of the user model performing the
        generation/compilation.
    :param float kwargs['timeout']: how many seconds to wait before killing the
        foma compile process.
    :returns: ``None``; the side effect is to update ``generate_attempt``,
        ``modifier_id`` and ``datetime_modified`` on the morphology and commit
        the session.
    """
    # Acquire the session before entering the ``try`` block: if acquisition
    # fails there is nothing to commit or close, and referencing an unbound
    # ``dbsession`` in ``finally`` would mask the real error.
    dbsession = get_dbsession_from_settings(kwargs['settings'])()
    try:
        morphology = dbsession.query(
            old_models.Morphology).get(kwargs['morphology_id'])
        try:
            morphology.write(oldc.UNKNOWN_CATEGORY)
        except Exception as error:
            # Best effort: a failure to write is logged, and compilation is
            # still attempted below.
            LOGGER.error('Exception when calling `write` on morphology: %s %s',
                         error.__class__.__name__, error)
        if kwargs.get('compile', True):
            try:
                morphology.compile(kwargs['timeout'])
            except Exception as error:
                LOGGER.error('Exception when calling `compile` on morphology:'
                             ' %s %s', error.__class__.__name__, error)
        # A fresh UUID is stored on every attempt, successful or not.
        morphology.generate_attempt = str(uuid4())
        morphology.modifier_id = kwargs['user_id']
        morphology.datetime_modified = h.now()
    finally:
        # Always release the session, even when the commit itself fails.
        try:
            dbsession.commit()
        finally:
            dbsession.close()
Пример #3
0
 def _get_create_data(self, data):
     """Return the values derived from ``data`` by ``_get_user_data``,
     extended with the server-generated attributes set at creation time.
     """
     create_data = self._get_user_data(data)
     timestamp = h.now()
     creator = self.logged_in_user
     parent_directory = h.get_old_directory_path(
         'morphologies', self.request.registry.settings)
     server_values = {
         'parent_directory': parent_directory,
         # TODO: the Pylons app implied that this constant could change...
         'word_boundary_symbol': oldc.WORD_BOUNDARY_SYMBOL,
         'rare_delimiter': oldc.RARE_DELIMITER,
         'morpheme_delimiters': self.db.get_morpheme_delimiters(type_='str'),
         'UUID': str(uuid4()),
         # The creating user is both the enterer and the first modifier.
         'enterer': creator,
         'modifier': creator,
         # Both timestamps share the same value on creation.
         'datetime_modified': timestamp,
         'datetime_entered': timestamp,
     }
     create_data.update(server_values)
     return create_data
Пример #4
0
 def _get_create_data(self, data):
     """Return the values derived from ``data`` by ``_get_user_data``,
     extended with the server-generated attributes set at creation time.
     """
     create_data = self._get_user_data(data)
     timestamp = h.now()
     creator = self.logged_in_user
     parent_directory = h.get_old_directory_path(
         'morphemelanguagemodels', self.request.registry.settings)
     server_values = {
         'parent_directory': parent_directory,
         'rare_delimiter': oldc.RARE_DELIMITER,
         'start_symbol': oldc.LM_START,
         'end_symbol': oldc.LM_END,
         'morpheme_delimiters': self.db.get_morpheme_delimiters(type_='str'),
         'UUID': str(uuid4()),
         # The creating user is both the enterer and the first modifier.
         'enterer': creator,
         'modifier': creator,
         # Both timestamps share the same value on creation.
         'datetime_modified': timestamp,
         'datetime_entered': timestamp,
     }
     create_data.update(server_values)
     return create_data
Пример #5
0
    def update(self):
        """Update a user's remembered forms and return them.

        - URL: ``PUT /rememberedforms/id``
        - Request body: JSON object of the form ``{"forms": [...]}`` where the
          array contains the form ``id`` values that will constitute the
          user's ``remembered_forms`` collection after update.

        :param str id: the ``id`` value of the user model whose
         ``remembered_forms`` attribute is to be updated.
        :returns: the list of remembered forms of the user on success;
            otherwise a dict with an ``error`` or ``errors`` key, with the
            response status set to 404 (no such user) or 400 (undecodable
            JSON, invalid input, or submitted data identical to the current
            collection).

        .. note:: Administrators can update any user's remembered forms;
           non-administrators can only update their own.
        """
        id_ = self.request.matchdict['id']
        # ``id_`` is taken straight from the route match dict, whose values
        # are strings; format it with ``%s`` because ``%d`` cannot render a
        # string and the log record would fail to format.
        LOGGER.info('Attempting to update the forms remembered by user %s.',
                    id_)
        user = self.request.dbsession.query(User).options(
            subqueryload(User.remembered_forms)).get(id_)
        if not user:
            self.request.response.status_int = 404
            msg = 'There is no user with id {}'.format(id_)
            LOGGER.warning(msg)
            return {'error': msg}
        try:
            values = json.loads(self.request.body.decode(self.request.charset))
        except ValueError:
            self.request.response.status_int = 400
            LOGGER.warning(JSONDecodeErrorResponse)
            return JSONDecodeErrorResponse
        schema = FormIdsSchemaNullable
        state = SchemaState(full_dict=values, db=self.db, id=id_)
        try:
            data = schema.to_python(values, state)
        except Invalid as error:
            self.request.response.status_int = 400
            errors = error.unpack_errors()
            LOGGER.warning(errors)
            return {'errors': errors}
        # Drop falsy entries from the validated list of forms.
        forms = [f for f in data['forms'] if f]
        unrestricted_users = self.db.get_unrestricted_users()
        # Keep only the forms the logged-in user is authorized to access.
        unrestricted_forms = [
            f for f in forms
            if self.logged_in_user.is_authorized_to_access_model(
                f, unrestricted_users)
        ]
        if set(user.remembered_forms) != set(unrestricted_forms):
            user.remembered_forms = unrestricted_forms
            user.datetime_modified = h.now()
            LOGGER.info('Updated the forms remembered by user %s.', id_)
            return user.remembered_forms
        # Nothing changed: report the request as a client error.
        self.request.response.status_int = 400
        msg = ('The update request failed because the submitted data were not'
               ' new.')
        LOGGER.warning(msg)
        return {'error': msg}
Пример #6
0
def generate_language_model(**kwargs):
    """Write the requisite files (corpus, vocab, ARPA, LMTrie) of a morpheme LM
    to disk.

    :param kwargs['settings']: settings passed to
        ``get_dbsession_from_settings`` to obtain a session factory.
    :param str kwargs['morpheme_language_model_id']: ``id`` value of a morpheme
        LM.
    :param int/float kwargs['timeout']: seconds to allow for ARPA file creation.
    :param str kwargs['user_id']: ``id`` value of an OLD user.
    :returns: ``None``; side-effect is to change relevant attributes of LM
        object.

    Each generation step is attempted even if an earlier one failed; a failing
    step is logged and overwrites ``generate_message``. The attempt counts as
    successful only when ``generate_trie`` raises nothing *and* the
    modification time of the trie file has changed.
    """
    # Acquire the session before entering the ``try`` block: if acquisition
    # fails there is nothing to commit or close, and referencing an unbound
    # ``dbsession`` in ``finally`` would mask the real error.
    dbsession = get_dbsession_from_settings(kwargs['settings'])()
    try:
        langmod = dbsession.query(
            old_models.MorphemeLanguageModel).get(
                kwargs['morpheme_language_model_id'])
        trie_path = langmod.get_file_path('trie')
        # Remember the trie file's modification time so that we can tell
        # afterwards whether a new trie was actually written.
        trie_mod_time = langmod.get_modification_time(trie_path)
        langmod.generate_succeeded = False
        try:
            langmod.write_corpus()
        except Exception as error:
            LOGGER.error('Exception when calling `write_corpus` on language'
                         ' model: %s %s', error.__class__.__name__, error)
            langmod.generate_message = 'Error writing the corpus file. %s' % error
        try:
            langmod.write_vocabulary()
        except Exception as error:
            LOGGER.error('Exception when calling `write_vocabulary` on language'
                         ' model: %s %s', error.__class__.__name__, error)
            langmod.generate_message = 'Error writing the vocabulary file. %s' % error
        try:
            langmod.write_arpa(kwargs['timeout'])
        except Exception as error:
            LOGGER.error('Exception when calling `write_arpa` on language'
                         ' model: %s %s', error.__class__.__name__, error)
            langmod.generate_message = 'Error writing the ARPA file. %s' % error
        try:
            langmod.generate_trie()
        except Exception as error:
            LOGGER.error('Exception when calling `generate_trie` on language'
                         ' model: %s %s', error.__class__.__name__, error)
            langmod.generate_message = 'Error generating the LMTrie instance. %s' % error
        else:
            if langmod.get_modification_time(trie_path) != trie_mod_time:
                langmod.generate_succeeded = True
                langmod.generate_message = 'Language model successfully generated.'
            else:
                langmod.generate_message = 'Error generating the LMTrie instance.'
        # A fresh UUID is stored on every attempt, successful or not.
        langmod.generate_attempt = str(uuid4())
        langmod.modifier_id = kwargs['user_id']
        langmod.datetime_modified = h.now()
    finally:
        # Always release the session, even when the commit itself fails.
        try:
            dbsession.commit()
        finally:
            dbsession.close()
Пример #7
0
def compile_phonology(**kwargs):
    """Compile the foma script of a phonology and save it to the db with values
    that indicate compilation success.

    :param kwargs['settings']: settings passed to
        ``get_dbsession_from_settings`` to obtain a session factory.
    :param kwargs['phonology_id']: primary key of the phonology to compile.
    :param kwargs['timeout']: forwarded to the phonology's ``compile`` method.
    :param kwargs['user_id']: stored as the phonology's ``modifier_id``.
    :returns: ``None``. An exception raised by ``compile`` propagates to the
        caller after the session has been committed and closed.
    """
    # Acquire the session before entering the ``try`` block: if acquisition
    # fails there is nothing to commit or close, and referencing an unbound
    # ``dbsession`` in ``finally`` would mask the real error.
    dbsession = get_dbsession_from_settings(kwargs['settings'])()
    try:
        phonology = dbsession.query(
            old_models.Phonology).get(kwargs['phonology_id'])
        phonology.compile(kwargs['timeout'])
        phonology.datetime_modified = h.now()
        phonology.modifier_id = kwargs['user_id']
    finally:
        # Always release the session, even when the commit itself fails.
        try:
            dbsession.commit()
        finally:
            dbsession.close()
Пример #8
0
 def _add_standard_metadata(self, file_, data):
     """Copy the standard metadata from ``data`` onto the file model.

     :param file_: file model object
     :param dict data: dictionary containing file attribute values.
     :returns: the same file model object, updated in place.
     """
     file_.description = h.normalize(data['description'])
     file_.utterance_type = data['utterance_type']
     file_.date_elicited = data['date_elicited']
     # ``elicitor`` and ``speaker`` are only assigned when a truthy value was
     # supplied; otherwise the model's current value is left untouched.
     for attr_name in ('elicitor', 'speaker'):
         supplied = data[attr_name]
         if supplied:
             setattr(file_, attr_name, supplied)
     # Discard falsy entries from the related collections.
     file_.tags = list(filter(None, data['tags']))
     file_.forms = list(filter(None, data['forms']))
     # Entry and modification timestamps share one value.
     timestamp = h.now()
     file_.datetime_entered = file_.datetime_modified = timestamp
     file_.enterer = self.logged_in_user
     return file_
 def _get_create_data(self, data):
     """Return the values derived from ``data`` by ``_get_user_data``,
     extended with the server-generated attributes set at creation time.
     """
     create_data = self._get_user_data(data)
     timestamp = h.now()
     creator = self.logged_in_user
     parent_directory = h.get_old_directory_path(
         'morphologicalparsers', self.request.registry.settings)
     server_values = {
         'parent_directory': parent_directory,
         'UUID': str(uuid4()),
         # The creating user is both the enterer and the first modifier.
         'enterer': creator,
         'modifier': creator,
         # Both timestamps share the same value on creation.
         'datetime_modified': timestamp,
         'datetime_entered': timestamp,
     }
     create_data.update(server_values)
     return create_data
Пример #10
0
def generate_and_compile_parser(**kwargs):
    """Write the morphological parser to disk via ``parser.write()`` and,
    unless ``kwargs['compile']`` is falsy, compile it via ``parser.compile``.

    :param kwargs['settings']: settings passed to
        ``get_dbsession_from_settings`` (and to the parser's ``Cache``).
    :param kwargs['morphological_parser_id']: primary key of the parser.
    :param bool kwargs['compile']: whether to compile after writing; defaults
        to True when the key is absent.
    :param kwargs['timeout']: forwarded to ``parser.compile``.
    :param kwargs['user_id']: stored as the parser's ``modifier_id``.
    :returns: ``None``. Exceptions raised by ``write`` or ``compile``
        propagate to the caller after the session has been committed and
        closed.
    """
    # Acquire the session before entering the ``try`` block: if acquisition
    # fails there is nothing to commit or close, and referencing an unbound
    # ``dbsession`` in ``finally`` would mask the real error.
    # NOTE(review): the return value is called here, as in the sibling worker
    # functions, which all treat it as a session factory -- confirm that this
    # matches what ``get_dbsession_from_settings`` returns.
    dbsession = get_dbsession_from_settings(kwargs['settings'])()
    try:
        parser = dbsession.query(old_models.MorphologicalParser).get(
            kwargs['morphological_parser_id'])
        cache = Cache(parser, kwargs['settings'], get_dbsession_from_settings)
        parser.cache = cache
        parser.changed = False
        parser.write()
        # Persist the result of ``write`` before the compile step.
        dbsession.commit()
        if kwargs.get('compile', True):
            parser.compile(kwargs['timeout'])
        parser.modifier_id = kwargs['user_id']
        parser.datetime_modified = h.now()
        # ``changed`` was reset to False above, so it is only truthy here if
        # something called in between set it (presumably ``write`` or
        # ``compile`` -- TODO confirm).
        if parser.changed:
            parser.cache.clear(persist=True)
        dbsession.add(parser)
    finally:
        # Always release the session, even when the commit itself fails.
        try:
            dbsession.commit()
        finally:
            dbsession.close()
Пример #11
0
 def _get_update_data(self, user_data):
     """Stamp ``user_data`` with the current time and the logged-in user as
     modifier, then return the same (mutated) mapping.
     """
     modification_stamp = {
         'datetime_modified': h.now(),
         'modifier': self.logged_in_user,
     }
     user_data.update(modification_stamp)
     return user_data
Пример #12
0
    def _write_to_file(self, corpus, format_):
        """Write the corpus to file in the specified format.

        Write the corpus to a file on disk, create or update a corpus file
        model and associate it to the corpus model (if necessary).

        :param corpus: a corpus model.
        :param str format_: the format of the file to be written; used as a
            key into ``oldc.CORPUS_FORMATS``.
        :returns: the corpus modified appropriately on success; on failure, a
            dict with an ``error`` key (the response status is set to 400).
        :side effects: may write (a) file(s) to disk and update/create a corpus
            file model.

        .. note::
            It may be desirable/necessary to perform the corpus file writing
            asynchronously using a dedicated corpus-file-worker.
        """
        def error_msg(msg):
            return {
                'error':
                'Unable to write corpus %d to file with format "%s".'
                ' (%s)' % (corpus.id, format_, msg)
            }

        def update_corpus_file(corpus, filename, modifier, datetime_modified,
                               restricted):
            """Update the corpus file model of ``corpus`` that matches
            ``filename``. Raises ``IndexError`` if there is no such model.
            """
            corpus_file = [
                cf for cf in corpus.files if cf.filename == filename
            ][0]
            corpus_file.restricted = restricted
            corpus_file.modifier = modifier
            corpus_file.datetime_modified = corpus.datetime_modified = \
                datetime_modified

        def generate_new_corpus_file(*args):
            """Create a corpus file model with ``filename`` and append it to
            ``corpus.files``.
            """
            (corpus, filename, format_, creator, datetime_created,
             restricted) = args
            corpus_file = CorpusFile()
            corpus_file.restricted = restricted
            corpus.files.append(corpus_file)
            corpus_file.filename = filename
            corpus_file.format = format_
            corpus_file.creator = corpus_file.modifier = creator
            corpus_file.datetime_created = corpus_file.datetime_modified = \
                datetime_created
            corpus.datetime_modified = datetime_created

        def destroy_file(file_path):
            """Best-effort removal of the (possibly partial) corpus file.

            The path names a regular file, so it must be removed with
            ``os.remove``; a directory-tree removal would fail on it and,
            with the error swallowed, leave the file behind.
            """
            try:
                os.remove(file_path)
            except OSError:
                # The file may never have been created; nothing to clean up.
                pass

        def fail(error):
            """Clean up the corpus file, flag the response as a 400 and return
            the error dict that is sent back to the client.
            """
            destroy_file(corpus_file_path)
            self.request.response.status_int = 400
            response = error_msg(error)
            LOGGER.warning(response['error'])
            return response

        corpus_file_path = self._get_corpus_file_path(corpus, format_)
        update = os.path.exists(corpus_file_path)  # If True, we are updating
        restricted = False
        # Create the corpus file on the filesystem
        try:
            writer = oldc.CORPUS_FORMATS[format_]['writer']
            if corpus.form_search:  # ``form_search`` value negates any content.
                forms_to_write = corpus.forms
            else:
                # Write the forms in the order in which the corpus content
                # references them. The generator is lazy, so an unknown id
                # surfaces while writing and is handled below.
                form_references = corpus.get_form_references(corpus.content)
                forms_by_id = {f.id: f for f in corpus.forms}
                forms_to_write = (forms_by_id[id_] for id_ in form_references)
            with codecs.open(corpus_file_path, 'w', 'utf8') as file_:
                for form in forms_to_write:
                    # The file is restricted as soon as one of its forms
                    # carries the "restricted" tag.
                    if not restricted and any(
                            t.name == 'restricted' for t in form.tags):
                        restricted = True
                    file_.write(writer(form))
            gzipped_corpus_file_path = h.compress_file(corpus_file_path)
            _create_tgrep2_corpus_file(gzipped_corpus_file_path, format_)
        except Exception as error:
            return fail(error)
        # Update/create the corpus_file object
        try:
            now = h.now()
            user = self.logged_in_user
            corpus_filename = os.path.split(corpus_file_path)[1]
            if update:
                try:
                    update_corpus_file(corpus, corpus_filename, user, now,
                                       restricted)
                except Exception:
                    # The file existed on disk but no usable corpus file model
                    # was found/updated: fall back to creating a new one.
                    generate_new_corpus_file(corpus, corpus_filename, format_,
                                             user, now, restricted)
            else:
                generate_new_corpus_file(corpus, corpus_filename, format_,
                                         user, now, restricted)
        except Exception as error:
            return fail(error)
        LOGGER.info('Wrote corpus %s to a file on disk', corpus.id)
        self.request.dbsession.flush()
        return corpus
Пример #13
0
    def _update_collections_that_reference_this_collection(
            self, collection, **kwargs):
        """Propagate a change in ``collection`` to the collections returned by
        ``_get_collections_referencing_this_collection``.

        :param collection: a collection model.
        :param bool kwargs['restricted']: indicates whether the input
            collection has just been tagged as restricted.
        :param bool kwargs['contents_changed']: indicates whether the input
            collection's ``contents`` value has changed.
        :param bool kwargs['deleted']: indicates whether the input collection
            has just been deleted.
        :returns: ``None``

        Nothing happens unless at least one of the three flags is truthy.
        Otherwise, for every referencing collection:

        - if ``restricted``, the restricted tag is appended to its tags;
        - if ``contents_changed``, ``_update_contents_unpacked_etc`` is called
          on it;
        - if ``deleted``, ``_update_contents_unpacked_etc`` is called on it
          with the id of the deleted collection and ``deleted=True``;
        - in all cases its ``datetime_modified`` and ``modifier`` values are
          refreshed, and its state prior to these changes is passed to
          ``_backup_resource``.

        This function is called upon successful update and delete requests.
        """
        restricted = kwargs.get('restricted', False)
        contents_changed = kwargs.get('contents_changed', False)
        deleted = kwargs.get('deleted', False)
        if not (restricted or contents_changed or deleted):
            return
        referrers = self._get_collections_referencing_this_collection(
            collection)
        # Snapshot every referrer *before* mutating it; the snapshots are what
        # gets backed up below.
        snapshots = [referrer.get_full_dict() for referrer in referrers]
        timestamp = h.now()
        if restricted:
            restricted_tag = self.db.get_restricted_tag()
            for referrer in referrers:
                referrer.tags.append(restricted_tag)
        if contents_changed:
            for referrer in referrers:
                self._update_contents_unpacked_etc(referrer)
        if deleted:
            for referrer in referrers:
                self._update_contents_unpacked_etc(
                    referrer, collection_id=collection.id, deleted=True)
        for referrer in referrers:
            referrer.datetime_modified = timestamp
            referrer.modifier = self.logged_in_user
        for snapshot in snapshots:
            self._backup_resource(snapshot)
        dbsession = self.request.dbsession
        dbsession.add_all(referrers)
        dbsession.flush()