def compile(self, id):
    """Queue the compilation of a phonology's script as a foma FST.

    :URL: ``PUT /phonologies/compile/id``
    :param str id: the ``id`` value of the phonology whose script will be
        compiled.
    :returns: the phonology model when the phonology exists and foma is
        installed; otherwise a dict with an ``error`` key, with the response
        status set to 404 (no such phonology) or 400 (foma not installed).
        ``GET /phonologies/id`` must be polled to determine when and how the
        compilation task has terminated.

    .. note::

        The script is compiled asynchronously in a worker thread.  See
        :mod:`onlinelinguisticdatabase.lib.foma_worker`.

    """
    phonology = Session.query(Phonology).get(id)

    # Guard clauses: bail out early on the two failure cases.
    if not phonology:
        response.status_int = 404
        return {'error': 'There is no phonology with id %s' % id}
    if not h.foma_installed():
        response.status_int = 400
        return {'error': 'Foma and flookup are not installed.'}

    # Hand the job to the worker queue; this request does not wait for it.
    task_id = h.generate_salt()
    task_args = {
        'phonology_id': phonology.id,
        'user_id': session['user'].id,
        'timeout': h.phonology_compile_timeout
    }
    foma_worker_q.put({
        'id': task_id,
        'func': 'compile_phonology',
        'args': task_args
    })
    return phonology
def create_new_user(data):
    """Build a new user model from validated input data.

    :param dict data: the data for the user to be created.
    :returns: an SQLAlchemy model object representing the user.

    """
    user = User()

    # The salt is generated first because the password hash depends on it.
    user.salt = h.generate_salt()
    encrypted_password = h.encrypt_password(data['password'], str(user.salt))
    user.password = unicode(encrypted_password)

    # Plain text attributes: each is normalized via ``h.normalize``.
    normalized_attrs = (
        'username',
        'first_name',
        'last_name',
        'email',
        'affiliation',
        'role',
        'markup_language',
        'page_content',
    )
    for attr in normalized_attrs:
        setattr(user, attr, h.normalize(data[attr]))
    user.html = h.get_HTML_from_contents(
        user.page_content, user.markup_language)

    # Many-to-One Data: input and output orthographies (set only if truthy)
    for attr in ('input_orthography', 'output_orthography'):
        if data[attr]:
            setattr(user, attr, data[attr])

    # OLD-generated Data
    user.datetime_modified = datetime.datetime.utcnow()

    # Create the user's directory
    h.create_user_directory(user)

    return user
def generate(self, id):
    """Queue generation of the files that constitute the morpheme language
    model, crucially the file that holds the pickled LM trie.

    :URL: ``PUT /morpheme_language_model/id/generate``
    :param str id: the ``id`` value of the morpheme language model whose
        files will be generated.
    :returns: the morpheme language model, or a dict with an ``error`` key
        (and a 404 response status) if no model has the given id.
        ``GET /morpheme_language_model/id`` must be polled to determine when
        the generation task has terminated.

    """
    language_model = Session.query(MorphemeLanguageModel).get(id)
    if language_model:
        task_args = dict(
            morpheme_language_model_id=language_model.id,
            user_id=session['user'].id,
            timeout=h.morpheme_language_model_generate_timeout)
        # The work itself is done by whatever consumes ``foma_worker_q``.
        task = dict(
            id=h.generate_salt(),
            func='generate_language_model',
            args=task_args)
        foma_worker_q.put(task)
        return language_model
    response.status_int = 404
    return {'error': 'There is no morpheme language model with id %s' % id}
def create_new_user(data):
    """Build a new user model from validated input data.

    :param dict data: the data for the user to be created.
    :returns: an SQLAlchemy model object representing the user.

    """
    user = User()

    # The salt is generated first because the password hash depends on it.
    user.salt = h.generate_salt()
    encrypted_password = h.encrypt_password(data['password'], str(user.salt))
    user.password = unicode(encrypted_password)

    # Plain text attributes: each is normalized via ``h.normalize``.
    normalized_attrs = (
        'username',
        'first_name',
        'last_name',
        'email',
        'affiliation',
        'role',
        'markup_language',
        'page_content',
    )
    for attr in normalized_attrs:
        setattr(user, attr, h.normalize(data[attr]))
    user.html = h.get_HTML_from_contents(
        user.page_content, user.markup_language)

    # Many-to-One Data: input and output orthographies (set only if truthy)
    for attr in ('input_orthography', 'output_orthography'):
        if data[attr]:
            setattr(user, attr, data[attr])

    # OLD-generated Data
    user.datetime_modified = datetime.datetime.utcnow()

    # Create the user's directory
    h.create_user_directory(user)

    return user
def generate_and_compile_morphological_parser(morphological_parser_id,
                                              compile_=True):
    """Queue the generation (and optionally compilation) of a morphological
    parser.

    :param morphological_parser_id: the ``id`` value of the morphological
        parser to generate.
    :param bool compile_: passed to the worker task as ``compile``; when
        true, foma must be installed or a 400 error is returned.
    :returns: the morphological parser model when the task was queued;
        otherwise a dict with an ``error`` key, with the response status set
        to 404 (no such parser) or 400 (foma not installed).

    """
    morphological_parser = Session.query(MorphologicalParser).get(
        morphological_parser_id)
    if not morphological_parser:
        response.status_int = 404
        # Report the id that was actually requested.  The previous code
        # interpolated the builtin ``id`` function here, because this function
        # has no ``id`` parameter.
        return {'error': 'There is no morphological parser with id %s' %
                morphological_parser_id}
    if compile_ and not h.foma_installed():
        response.status_int = 400
        return {'error': 'Foma and flookup are not installed.'}
    # The work itself is done by whatever consumes ``foma_worker_q``.
    foma_worker_q.put({
        'id': h.generate_salt(),
        'func': 'generate_and_compile_parser',
        'args': {
            'morphological_parser_id': morphological_parser.id,
            'compile': compile_,
            'user_id': session['user'].id,
            'timeout': h.morphological_parser_compile_timeout
        }
    })
    return morphological_parser
def generate_and_compile_morphological_parser(morphological_parser_id,
                                              compile_=True):
    """Queue the generation (and optionally compilation) of a morphological
    parser.

    :param morphological_parser_id: the ``id`` value of the morphological
        parser to generate.
    :param bool compile_: passed to the worker task as ``compile``; when
        true, foma must be installed or a 400 error is returned.
    :returns: the morphological parser model when the task was queued;
        otherwise a dict with an ``error`` key, with the response status set
        to 404 (no such parser) or 400 (foma not installed).

    """
    morphological_parser = Session.query(MorphologicalParser).get(
        morphological_parser_id)
    if not morphological_parser:
        response.status_int = 404
        # Report the id that was actually requested.  The previous code
        # interpolated the builtin ``id`` function here, because this function
        # has no ``id`` parameter.
        return {'error': 'There is no morphological parser with id %s' %
                morphological_parser_id}
    if compile_ and not h.foma_installed():
        response.status_int = 400
        return {'error': 'Foma and flookup are not installed.'}
    # The work itself is done by whatever consumes ``foma_worker_q``.
    foma_worker_q.put({
        'id': h.generate_salt(),
        'func': 'generate_and_compile_parser',
        'args': {
            'morphological_parser_id': morphological_parser.id,
            'compile': compile_,
            'user_id': session['user'].id,
            'timeout': h.morphological_parser_compile_timeout
        }
    })
    return morphological_parser
def compute_perplexity(self, id):
    """Queue a task that computes the perplexity of the LM's corpus
    according to the LM.

    The computation itself is not performed here: a ``compute_perplexity``
    task is placed on the worker queue.  See
    ``evaluate_morpheme_language_model`` in lib/foma_worker.py, which is
    described as randomly dividing the corpus into training and test sets
    multiple times and averaging the resulting perplexities.

    :param str id: the ``id`` value of the morpheme language model.
    :returns: the morpheme language model, or a dict with an ``error`` key
        (and a 404 response status) if no model has the given id.

    """
    language_model = Session.query(MorphemeLanguageModel).get(id)
    if language_model:
        task_args = dict(
            morpheme_language_model_id=language_model.id,
            user_id=session['user'].id,
            timeout=h.morpheme_language_model_generate_timeout)
        task = dict(
            id=h.generate_salt(),
            func='compute_perplexity',
            args=task_args)
        foma_worker_q.put(task)
        return language_model
    response.status_int = 404
    return {'error': 'There is no morpheme language model with id %s' % id}
def generate(self, id):
    """Queue generation of the files that constitute the morpheme language
    model, crucially the file that holds the pickled LM trie.

    :URL: ``PUT /morpheme_language_model/id/generate``
    :param str id: the ``id`` value of the morpheme language model whose
        files will be generated.
    :returns: the morpheme language model, or a dict with an ``error`` key
        (and a 404 response status) if no model has the given id.
        ``GET /morpheme_language_model/id`` must be polled to determine when
        the generation task has terminated.

    """
    language_model = Session.query(MorphemeLanguageModel).get(id)
    if language_model:
        task_args = dict(
            morpheme_language_model_id=language_model.id,
            user_id=session['user'].id,
            timeout=h.morpheme_language_model_generate_timeout)
        # The work itself is done by whatever consumes ``foma_worker_q``.
        task = dict(
            id=h.generate_salt(),
            func='generate_language_model',
            args=task_args)
        foma_worker_q.put(task)
        return language_model
    response.status_int = 404
    return {'error': 'There is no morpheme language model with id %s' % id}
def compute_perplexity(self, id):
    """Queue a task that computes the perplexity of the LM's corpus
    according to the LM.

    The computation itself is not performed here: a ``compute_perplexity``
    task is placed on the worker queue.  See
    ``evaluate_morpheme_language_model`` in lib/foma_worker.py, which is
    described as randomly dividing the corpus into training and test sets
    multiple times and averaging the resulting perplexities.

    :param str id: the ``id`` value of the morpheme language model.
    :returns: the morpheme language model, or a dict with an ``error`` key
        (and a 404 response status) if no model has the given id.

    """
    language_model = Session.query(MorphemeLanguageModel).get(id)
    if language_model:
        task_args = dict(
            morpheme_language_model_id=language_model.id,
            user_id=session['user'].id,
            timeout=h.morpheme_language_model_generate_timeout)
        task = dict(
            id=h.generate_salt(),
            func='compute_perplexity',
            args=task_args)
        foma_worker_q.put(task)
        return language_model
    response.status_int = 404
    return {'error': 'There is no morpheme language model with id %s' % id}
def tgrep2(self, id):
    """Search the corpus-as-treebank using TGrep2.

    :URL: ``SEARCH/POST /corpora/id/tgrep2``.
    :Request body: JSON object with obligatory 'tgrep2pattern' attribute and
        optional 'paginator' and 'order_by' attributes.
    :param str id: the ``id`` value of the corpus.
    :returns: the matching forms (paginated when a 'paginator' is supplied),
        or a dict with an ``error``/``errors`` key when the search cannot be
        performed.  The response status is set to 404 when there is no such
        corpus and to 400 for every other failure.

    """
    if not h.command_line_program_installed('tgrep2'):
        response.status_int = 400
        return {'error': 'TGrep2 is not installed.'}

    corpus = Session.query(Corpus).get(id)
    if not corpus:
        # Previously a missing corpus fell through and returned ``None``.
        response.status_int = 404
        return {'error': 'There is no corpus with id %s' % id}

    # The corpus id is now interpolated; previously the placeholder was
    # returned to the client verbatim.
    not_written_response = {
        'error': 'Corpus %s has not been written to file as a treebank.' %
                 corpus.id
    }
    try:
        # IndexError here means the corpus has no treebank-format file.
        treebank_corpus_file_object = [
            cf for cf in corpus.files if cf.format == u'treebank'][0]
        corpus_dir_path = get_corpus_dir_path(corpus)
        tgrep2_corpus_file_path = os.path.join(
            corpus_dir_path,
            '%s.t2c' % treebank_corpus_file_object.filename)
    except Exception:
        response.status_int = 400
        return not_written_response
    if not os.path.exists(tgrep2_corpus_file_path):
        response.status_int = 400
        return not_written_response

    # NOTE: a per-file authorization check is currently disabled here:
    #if not authorized_to_access_corpus_file(session['user'], treebank_corpus_file_object):
    #    response.status_int = 403
    #    return h.unauthorized_msg

    try:
        request_params = json.loads(unicode(request.body, request.charset))

        # Validate explicitly rather than with ``assert``, which is stripped
        # when Python runs with -O.
        try:
            tgrep2pattern = request_params['tgrep2pattern']
        except (KeyError, TypeError):
            tgrep2pattern = None
        if not isinstance(tgrep2pattern, basestring):
            response.status_int = 400
            return {'errors': {'tgrep2pattern':
                'A tgrep2pattern attribute must be supplied and must have a unicode/string value'}}

        tmp_path = os.path.join(
            corpus_dir_path,
            '%s%s.txt' % (session['user'].username, h.generate_salt()))
        try:
            with open(os.devnull, "w") as fnull:
                with open(tmp_path, 'w') as stdout:
                    # The -wu option causes TGrep2 to print only the root
                    # symbol of each matching tree
                    process = Popen(
                        ['tgrep2', '-c', tgrep2_corpus_file_path, '-wu',
                         tgrep2pattern],
                        stdout=stdout, stderr=fnull)
                    process.communicate()
            # Read the results back through a context manager so the handle
            # is closed deterministically.
            with open(tmp_path, 'r') as tgrep2_output:
                parsed_lines = [get_form_ids_from_tgrep2_output_line(line)
                                for line in tgrep2_output]
            match_ids = [form_id for form_id in parsed_lines if form_id]
        finally:
            # Always clean up the scratch file, even if the search failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        if match_ids:
            query = h.eagerload_form(Session.query(Form)).filter(
                Form.id.in_(match_ids))
            query = h.filter_restricted_models('Form', query)
            query = h.add_order_by(query, request_params.get('order_by'),
                                   self.query_builder)
            result = h.add_pagination(query, request_params.get('paginator'))
        elif request_params.get('paginator'):
            # No matches: echo the paginator back with a zero count.
            paginator = request_params['paginator']
            paginator['count'] = 0
            result = {'paginator': paginator, 'items': []}
        else:
            result = []
        return result
    except h.JSONDecodeError:
        response.status_int = 400
        return h.JSONDecodeErrorResponse
    except Invalid as e:
        response.status_int = 400
        return {'errors': e.unpack_errors()}
    except Exception as e:
        response.status_int = 400
        return {'error': 'Unable to perform TGrep2 search: %s.' % e}
def tgrep2(self, id):
    """Search the corpus-as-treebank using TGrep2.

    :URL: ``SEARCH/POST /corpora/id/tgrep2``.
    :Request body: JSON object with obligatory 'tgrep2pattern' attribute and
        optional 'paginator' and 'order_by' attributes.
    :param str id: the ``id`` value of the corpus.
    :returns: the matching forms (paginated when a 'paginator' is supplied),
        or a dict with an ``error``/``errors`` key when the search cannot be
        performed.  The response status is set to 404 when there is no such
        corpus and to 400 for every other failure.

    """
    if not h.command_line_program_installed('tgrep2'):
        response.status_int = 400
        return {'error': 'TGrep2 is not installed.'}

    corpus = Session.query(Corpus).get(id)
    if not corpus:
        # Previously a missing corpus fell through and returned ``None``.
        response.status_int = 404
        return {'error': 'There is no corpus with id %s' % id}

    # The corpus id is now interpolated; previously the placeholder was
    # returned to the client verbatim.
    not_written_response = {
        'error': 'Corpus %s has not been written to file as a treebank.' %
                 corpus.id
    }
    try:
        # IndexError here means the corpus has no treebank-format file.
        treebank_corpus_file_object = [
            cf for cf in corpus.files if cf.format == u'treebank'][0]
        corpus_dir_path = get_corpus_dir_path(corpus)
        tgrep2_corpus_file_path = os.path.join(
            corpus_dir_path,
            '%s.t2c' % treebank_corpus_file_object.filename)
    except Exception:
        response.status_int = 400
        return not_written_response
    if not os.path.exists(tgrep2_corpus_file_path):
        response.status_int = 400
        return not_written_response

    # NOTE: a per-file authorization check is currently disabled here:
    #if not authorized_to_access_corpus_file(session['user'], treebank_corpus_file_object):
    #    response.status_int = 403
    #    return h.unauthorized_msg

    try:
        request_params = json.loads(unicode(request.body, request.charset))

        # Validate explicitly rather than with ``assert``, which is stripped
        # when Python runs with -O.
        try:
            tgrep2pattern = request_params['tgrep2pattern']
        except (KeyError, TypeError):
            tgrep2pattern = None
        if not isinstance(tgrep2pattern, basestring):
            response.status_int = 400
            return {'errors': {'tgrep2pattern':
                'A tgrep2pattern attribute must be supplied and must have a unicode/string value'}}

        tmp_path = os.path.join(
            corpus_dir_path,
            '%s%s.txt' % (session['user'].username, h.generate_salt()))
        try:
            with open(os.devnull, "w") as fnull:
                with open(tmp_path, 'w') as stdout:
                    # The -wu option causes TGrep2 to print only the root
                    # symbol of each matching tree
                    process = Popen(
                        ['tgrep2', '-c', tgrep2_corpus_file_path, '-wu',
                         tgrep2pattern],
                        stdout=stdout, stderr=fnull)
                    process.communicate()
            # Read the results back through a context manager so the handle
            # is closed deterministically.
            with open(tmp_path, 'r') as tgrep2_output:
                parsed_lines = [get_form_ids_from_tgrep2_output_line(line)
                                for line in tgrep2_output]
            match_ids = [form_id for form_id in parsed_lines if form_id]
        finally:
            # Always clean up the scratch file, even if the search failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        if match_ids:
            query = h.eagerload_form(Session.query(Form)).filter(
                Form.id.in_(match_ids))
            query = h.filter_restricted_models('Form', query)
            query = h.add_order_by(query, request_params.get('order_by'),
                                   self.query_builder)
            result = h.add_pagination(query, request_params.get('paginator'))
        elif request_params.get('paginator'):
            # No matches: echo the paginator back with a zero count.
            paginator = request_params['paginator']
            paginator['count'] = 0
            result = {'paginator': paginator, 'items': []}
        else:
            result = []
        return result
    except h.JSONDecodeError:
        response.status_int = 400
        return h.JSONDecodeErrorResponse
    except Invalid as e:
        response.status_int = 400
        return {'errors': e.unpack_errors()}
    except Exception as e:
        response.status_int = 400
        return {'error': 'Unable to perform TGrep2 search: %s.' % e}