def initial(self, request):
    """
    Run before dispatching to put(), delete()...

    1) Checks current user authentication to prevent remote DB manipulation
    2) Prepares self.params, self.base_list and self.change_list

    The ngram ids to change are first read from the inline parameters (via
    ListChange._validate). When that yields an empty change list, they are
    read from the "ngrams" entry of the request payload, which must be a
    comma-separated string of integer ids.

    Raises:
        Http404: if the user is not authenticated.
        ValidationException: if the payload fallback is needed but holds no
            usable "ngrams" value.
    """
    if not request.user.is_authenticated():
        # can't use return in initial() (although 401 maybe better than 404)
        # can't use @requires_auth because of positional 'self' within class
        raise Http404()

    # get validated params
    self.params = get_parameters(request)

    (self.base_list, self.change_list) = ListChange._validate(self.params)

    if not len(self.change_list.items):
        # change_list can be in payload too
        try:
            payload_ngrams = request.data['ngrams']
            change_ngram_ids = [int(n) for n in payload_ngrams.split(',')]
        except (KeyError, AttributeError, TypeError, ValueError):
            # str.split() never returns an empty list, so a missing, empty
            # or malformed payload can only be detected through the lookup
            # or the int() conversion failing
            raise ValidationException(
                'The "ngrams" parameter requires one or more ngram_ids separated by comma'
            )

        self.change_list = UnweightedList(change_ngram_ids)
def get(self, request):
    """
    Return the ngrams of a single maplist ("glance" view).

    Inline parameters (read through get_parameters), one of:
      - "corpus": a corpus node id; its MAPLIST child is used, and its
        OCCURRENCES child too unless an explicit "scoring" id is given
      - "maplist" and "scoring": explicit node ids (converted with int())

    The JSON response contains:
      'ngraminfos'  => {ngram_id: remaining columns of the query_list row}
      'listmembers' => {'maplist': [ngram_id, ...]}
      'links'       => {} (no grouping links sent during glance, for speed)
      'nodeids'     => only 'maplist' is filled, the other entries are None

    Raises:
        ValidationException: if neither "corpus" nor the pair
            "maplist" + "scoring" is present.
    """
    parameters = get_parameters(request)

    maplist_id = None
    scores_id = None

    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        maplist_id = corpus.children('MAPLIST').first().id
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id

    elif "maplist" in parameters and "scoring" in parameters:
        # read the same key that was just tested ('maplist', not 'mainlist')
        maplist_id = int(parameters['maplist'])
        scores_id = int(parameters['scoring'])

    else:
        raise ValidationException("A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurences counts")

    ngraminfo = {}                   # ngram details sorted per ngram id
    listmembers = {'maplist': []}    # ngram ids sorted per list name

    # infos for all ngrams from maplist
    map_ngrams = query_list(maplist_id, details=True,
                            scoring_metric_id=scores_id).all()
    # ex:  [(8805, 'mean age', 4.0),
    #       (1632, 'activity', 4.0),
    #       (8423, 'present', 2.0),
    #       (2928, 'objective', 2.0)]

    for ng in map_ngrams:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]

        # maplist ngrams will already be <=> ngraminfos
        # but the client side expects a membership lookup
        # as when there are multiple lists or some groupings
        listmembers['maplist'].append(ng_id)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': {},   # no grouping links sent during glance (for speed)
        'nodeids': {
            'mainlist': None,
            'maplist': maplist_id,
            'stoplist': None,
            'groups': None,
            'scores': None,
        }
    })
def get(self, request, corpus_id):
    """
    List the documents of a corpus that contain the given ngrams.

    Inline parameter "ngram_ids" is a comma-separated string of integer
    ngram ids. Documents are ranked by the sum of their NodeNodeNgram
    scores under the corpus' TFIDF-CORPUS node, and at most
    DEFAULT_N_DOCS_HAVING_NGRAM of them are returned.

    The JSON response contains:
      'count'   => number of matching documents before the limit is applied
      'records' => one dict per returned document with 'id', 'score' and
                   those of 'title', 'publication_date', 'source',
                   'authors', 'fields' that exist in its hyperdata

    Returns a 401 response if the user is not authenticated.

    Raises:
        ValidationException: if "ngram_ids" is missing or malformed, if the
            corpus does not exist or if it has no TFIDF-CORPUS node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'score': str, 'ngram_ids': list})

    try:
        ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
    except (KeyError, AttributeError, TypeError, ValueError):
        raise ValidationException(
            '"ngram_ids" needs integers separated by comma.')

    limit = DEFAULT_N_DOCS_HAVING_NGRAM
    nodes_list = []

    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    if corpus is None:
        raise ValidationException(
            "%s is not a valid corpus node id." % corpus_id)

    tfidf_row = (session.query(Node.id)
                 .filter(Node.typename == "TFIDF-CORPUS",
                         Node.parent_id == corpus.id)
                 .first())
    if tfidf_row is None:
        raise ValidationException(
            "No TFIDF-CORPUS node is defined for corpus %s." % corpus_id)

    # .first() yields a row: the node id is its only column
    tfidf_id = tfidf_row[0]
    print(tfidf_id)

    # request data
    nodes_query = (
        session.query(Node, func.sum(NodeNodeNgram.score))
        .join(NodeNodeNgram, NodeNodeNgram.node2_id == Node.id)
        .filter(NodeNodeNgram.node1_id == tfidf_id)
        .filter(Node.typename == 'DOCUMENT', Node.parent_id == corpus.id)
        .filter(or_(*[
            NodeNodeNgram.ngram_id == ngram_id for ngram_id in ngram_ids
        ]))
        .group_by(Node)
    )

    # get the total count before applying limit
    nodes_count = nodes_query.count()

    # now the query with the limit
    nodes_results_query = (
        nodes_query
        .order_by(func.sum(NodeNodeNgram.score).desc())
        .limit(limit)
    )

    for node, score in nodes_results_query:
        print(node, score)
        print("\t corpus:", corpus_id, "\t", node.name)
        node_dict = {
            'id': node.id,
            'score': score,
        }
        # copy only the hyperdata entries that exist for this document
        for key in ('title', 'publication_date', 'source', 'authors',
                    'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
def put(self, request, corpus_id, check_each_doc=True):
    """
    Add documents to the favorites node of a corpus.

    Inline parameter "docs" is a comma-separated string of document node
    ids. When `check_each_doc` is true, every id must be a DOCUMENT node
    whose parent is `corpus_id`, otherwise the whole request is rejected.

    The JSON response contains 'count_added' (the number of ids received,
    not the number of rows really inserted), or a 'warning' with
    'count_added': 0 when the corpus has no favorites node.

    Returns a 401 response if the user is not authenticated.

    Raises:
        ValidationException: if "docs" is missing or malformed, or if some
            ids fail the per-document check.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)

    if fav_node is None:
        return JsonHttpResponse({
            'warning': 'No favorites node is defined for this corpus (\'%s\')' % self.corpus.name,
            'count_added': 0
        })

    req_params = validate(get_parameters(request), {
        'docs': list,
        'default': ""
    })
    try:
        nodeids_to_add = [int(did) for did in req_params['docs'].split(',')]
    except (KeyError, AttributeError, TypeError, ValueError):
        # same contract as the other comma-separated id parameters
        raise ValidationException(
            '"docs" needs integers separated by comma.')

    if check_each_doc:
        # check that these really are documents of the right corpus
        # (a bit slow => disable by default?)
        known_docs_q = (session.query(Node.id)
                        .filter(Node.parent_id == corpus_id)
                        .filter(Node.typename == 'DOCUMENT'))
        known_ids = {known_doc.id for known_doc in known_docs_q.all()}

        rejected_list = [doc_node_id for doc_node_id in nodeids_to_add
                         if doc_node_id not in known_ids]
        if rejected_list:
            raise ValidationException(
                "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                % (str(rejected_list), int(corpus_id)))

    # add them
    bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                ((fav_node.id, doc_node_id, 1.0)
                 for doc_node_id in nodeids_to_add))

    # todo count really added (here: counts input param not result)
    return JsonHttpResponse({'count_added': len(nodeids_to_add)})
def _validate(params):
    """
    Checks the "list" and "ngrams" parameters and builds the two lists.

    - "list" is mandatory and must be convertible to an integer id; it is
      used to build the base UnweightedList.
    - "ngrams" is optional here: a comma-separated string of integer ids
      converted to an UnweightedList for easy add/remove. When it is
      absent or malformed the change list is built from an empty list of
      ids (the caller may then look for the ids in the request payload).

    Returns:
        (base_list, change_list): two UnweightedList objects.

    Raises:
        ValidationException: if "list" is missing or is not an integer.
    """
    if 'list' not in params:
        raise ValidationException(
            'The route /api/ngramlists/change requires a "list" '
            'parameter, for instance /api/ngramlists/change?list=42')

    # 2 x retrieval => 2 x UnweightedLists
    # ------------------------------------
    try:
        base_list_id = int(params['list'])
    except (TypeError, ValueError):
        raise ValidationException(
            'The "list" parameter requires an existing list id.')

    # UnweightedList retrieved by id
    base_list = UnweightedList(base_list_id)

    try:
        change_ngram_ids = [int(n) for n in params['ngrams'].split(',')]
    except (KeyError, AttributeError, TypeError, ValueError):
        # ngrams no longer mandatory inline, see payload check afterwards
        change_ngram_ids = []

    # UnweightedList created from items
    change_list = UnweightedList(change_ngram_ids)

    return (base_list, change_list)
def get(self, request, node_id):
    """
    Count the documents of a corpus, grouped by one hyperdata subfield.

    The node is taken from the cache and must be of type CORPUS; it is
    kept as self.corpus. The inline "hyperfield" parameter selects the
    subfield and must be one of the facet subfields listed below.

    The JSON response contains 'doc_count' (the total) and
    'by' => {subfield: counts}, both obtained from self._ndocs_by_facet.

    Returns a 401 response if the user is not authenticated.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # check that the node is a corpus
    #   ? faster from cache than: corpus = session.query(Node)...
    node = cache.Node[node_id]
    if node.typename != 'CORPUS':
        type_hint = " (but this node has type %s)..." % node.typename
        raise ValidationException(
            "Only nodes of type CORPUS can accept facet queries" + type_hint)
    self.corpus = node

    # check that the hyperfield parameter makes sense
    _facet_available_subfields = [
        'source', 'publication_year', 'rubrique',
        'language_iso2', 'language_iso3', 'language_name',
        'authors',
    ]
    facet_schema = {
        'type': dict,
        'items': {
            'hyperfield': {
                'type': str,
                'range': _facet_available_subfields,
            },
        },
    }

    # validate() triggers an info message if subfield not in range
    parameters = validate(get_parameters(request), facet_schema)
    subfield = parameters['hyperfield']

    # do the aggregated sum
    xcounts, total = self._ndocs_by_facet(subfield)

    # response
    payload = {
        'doc_count': total,
        'by': {subfield: xcounts},
    }
    return JsonHttpResponse(payload)
def _get_fav_node(self, corpus_id):
    """
    Fetch the first FAVORITES child of a corpus.

    The corpus node is read from the cache, checked to be of type CORPUS
    and kept as self.corpus.

    NB: the result can be None if no favorites node is defined.

    This query could be faster if we didn't check that corpus_id is a
    CORPUS, ie:
        session.query(Node)
               .filter(Node.parent_id==corpus_id)
               .filter(Node.typename =='FAVORITES')

    Raises:
        ValidationException: if the node is not of type CORPUS.
    """
    node = cache.Node[corpus_id]

    if node.typename != 'CORPUS':
        type_hint = " (but this node has type %s)..." % node.typename
        raise ValidationException(
            "Only nodes of type CORPUS can accept favorites queries" + type_hint)

    self.corpus = node
    return self.corpus.children('FAVORITES').first()
def delete(self, request):
    """
    Removes the nodes whose ids are listed in the request.

    Inline parameter "ids" is a comma-separated string of integer node ids.
    The matching rows are deleted in one statement and the session is
    committed.

    The JSON response contains 'deleted', the rowcount reported by the
    delete statement.

    Returns a 401 response if the user is not authenticated.

    Raises:
        ValidationException: if "ids" is missing or malformed.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'ids': list})

    try:
        node_ids = [int(n) for n in parameters['ids'].split(',')]
    except (KeyError, AttributeError, TypeError, ValueError):
        raise ValidationException(
            '"ids" needs integers separated by comma.')

    # NOTE(review): the statement filters on Node.id only; no ownership
    # condition is visible here -- confirm access control is done elsewhere.
    result = session.execute(delete(Node).where(Node.id.in_(node_ids)))
    session.commit()

    return JsonHttpResponse({'deleted': result.rowcount})
def patch(self, request, corpusnode_id):
    """
    PATCH triggers recount of metrics for the specified corpus.

    ex PATCH http://localhost:8000/api/metrics/14072
    (the last path element is the corpus node id)

    The recount is started through scheduled(recount) with the corpus id.
    The JSON response contains 'corpus_id' and 'took', the wall-clock time
    spent in that call formatted as "<seconds> s.".

    Returns a 401 response if the user is not authenticated.

    Raises:
        ValidationException: if no node can be retrieved for corpusnode_id.
    """
    print("==> update metrics request on ", corpusnode_id)

    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    try:
        corpus = cache.Node[int(corpusnode_id)]
    except Exception:
        # any lookup/conversion failure means "no such corpus"; unlike a
        # bare except this lets KeyboardInterrupt/SystemExit propagate
        corpus = None

    if corpus is None:
        raise ValidationException("%s is not a valid corpus node id." %
                                  corpusnode_id)

    t_before = datetime.now()
    # =============
    scheduled(recount)(corpus.id)
    # =============
    t_after = datetime.now()

    return JsonHttpResponse({
        'corpus_id': corpusnode_id,
        'took': "%f s." % (t_after - t_before).total_seconds()
    })
def _query_nodes(request, node_id=None):
    """
    Build the node query shared by the node API views.

    The query starts from the nodes of the requesting user (user.nodes())
    and is narrowed by `node_id` and by the optional "types" and
    "parent_id" parameters. Results are ordered by the 'publication_date'
    hyperdata entry, then by id, and paginated with "pagination_offset" /
    "pagination_limit" (a limit of -1 means "no limit").

    Returns:
        (parameters, query, count): the validated parameters, the sliced
        query result and the number of matching nodes before pagination.
        The receiver does the filtering of fields and hyperdata_filter.

    Raises:
        TypeError: if the request has no authenticated user id.
        ValidationException: if the parameters are invalid, or if
            "hyperdata_filter" is used without 'hyperdata' in "fields".
    """
    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")

    # we query among the nodes that belong to this user
    user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)
    parameters = validate(
        parameters, {
            'type': dict,
            'items': {
                'formated': {
                    'type': str,
                    'required': False,
                    'default': 'json'
                },
                'pagination_limit': {
                    'type': int,
                    'default': 10
                },
                'pagination_offset': {
                    'type': int,
                    'default': 0
                },
                'fields': {
                    'type': list,
                    'default': _node_default_fields,
                    'items': {
                        'type': str,
                        'range': _node_available_fields,
                    }
                },
                # choice of hyperdata fields
                'hyperdata_filter': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _hyperdata_available_fields,
                    }
                },
                # optional filtering parameters
                'types': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _node_available_types,
                    }
                },
                'parent_id': {
                    'type': int,
                    'required': False
                },
            }
        })

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException(
            "Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)

    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))

    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])

    # count (before pagination)
    count = query.count()

    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query
    offset = parameters['pagination_offset']
    limit = parameters['pagination_limit']
    if limit == -1:
        query = query[offset:]
    else:
        # the slice stop is an absolute index, so it has to include the
        # offset; otherwise any page after the first comes back short/empty
        query = query[offset:offset + limit]

    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
def validate(value, expected, path='input'):
    """
    Check `value` against the `expected` schema, converting it when possible.

    `expected` is a dict in which the following keys are recognised:
      - 'type': the expected Python type. A value of another type is
        converted when the type is bool, int, float, str or datetime
        (datetime strings are completed from '2000-01-01T00:00:00Z' and
        parsed as '%Y-%m-%dT%H:%M:%SZ'); any other type mismatch is an
        error. Without 'type', the actual type of the value is assumed.
      - 'range': either a tuple (minimum[, maximum]) applied to the value
        (int, float) or to its length (str, list), or a list/set of
        allowed values.
      - 'translate': a callable applied to the value, or a mapping in
        which the value is looked up. On a failed lookup the outcome
        depends on 'translate_fallback_keep', 'required' and 'default'.
      - 'items': for a list, the schema of every element; for a dict, a
        mapping key => schema (unknown keys are rejected, missing keys are
        an error when 'required' is true, else filled from 'default').

    `path` names the value in error messages.

    Returns the validated value; lists and dicts are updated in place.

    Raises:
        ValidationException: when the value does not match the schema.
    """
    # Is the expected type respected?
    if 'type' in expected:
        expected_type = expected['type']
        if not isinstance(value, expected_type):
            if expected_type not in (bool, int, float, str, datetime, ):
                raise ValidationException('%s should be a JSON %s' % (path, _types_names[expected_type], ))
            try:
                if expected_type == bool:
                    value = value not in {0, 0.0, '', '0', 'false'}
                elif expected_type == datetime:
                    # complete partial dates (eg '2015') with default parts
                    value = value + '2000-01-01T00:00:00Z'[len(value):]
                    value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%SZ')
                else:
                    value = expected_type(value)
            except (TypeError, ValueError):
                # TypeError covers values that cannot even be fed to the
                # conversion (eg int(None), len() of a number)
                raise ValidationException('%s should be a JSON %s, but could not be parsed as such' % (path, _types_names[expected_type], ))
    else:
        expected_type = type(value)

    # Is the value in the expected range?
    if 'range' in expected:
        expected_range = expected['range']
        if isinstance(expected_range, tuple):
            # NOTE(review): a tuple range is only defined for int, float,
            # str and list; other types leave tested_value unset.
            if expected_type in (int, float):
                tested_value = value
                tested_name = 'value'
            elif expected_type in (str, list):
                tested_value = len(value)
                tested_name = 'length'
            if tested_value < expected_range[0]:
                raise ValidationException('%s should have a minimum %s of %d' % (path, tested_name, expected_range[0], ))
            if len(expected_range) > 1 and tested_value > expected_range[1]:
                raise ValidationException('%s should have a maximum %s of %d' % (path, tested_name, expected_range[1], ))
        elif isinstance(expected_range, (list, set, )) and value not in expected_range:
            # only advertise the allowed values of the expected type,
            # and at most 16 of them
            expected_values = [str(allowed) for allowed in expected_range if isinstance(allowed, expected_type)]
            expected_values_str = '"' + '", "'.join(expected_values[:16]) + '"'
            if len(expected_values) >= 16:
                expected_values_str += '...'
            raise ValidationException('%s should take one of the following values: %s' % (path, expected_values_str, ))

    # Do we have to translate through a dictionary?
    if 'translate' in expected:
        translate = expected['translate']
        if callable(translate):
            value = translate(value)
            if value is None and expected.get('required', False):
                raise ValidationException('%s has been given an invalid value' % (path, ))
            return value
        try:
            value = expected['translate'][value]
        except KeyError:
            if expected.get('translate_fallback_keep', False):
                return value
            if expected.get('required', False):
                raise ValidationException('%s has been given an invalid value' % (path, ))
            else:
                return expected.get('default', value)

    # Are we handling an iterable?
    if expected_type in (list, dict):
        if 'items' in expected:
            expected_items = expected['items']
            if expected_type == list:
                for i, element in enumerate(value):
                    value[i] = validate(element, expected_items, '%s[%d]' % (path, i, ))
            elif expected_type == dict:
                if expected_items:
                    for key in value:
                        if key not in expected_items:
                            raise ValidationException('%s should not have a "%s" key.' % (path, key, ))
                for expected_key, expected_value in expected_items.items():
                    if expected_key in value:
                        value[expected_key] = validate(value[expected_key], expected_value, '%s["%s"]' % (path, expected_key, ))
                    elif 'required' in expected_value and expected_value['required']:
                        raise ValidationException('%s should have a "%s" key.' % (path, expected_key, ))
                    elif 'default' in expected_value:
                        value[expected_key] = expected_value['default']

    # Let's return the proper value!
    return value
def get(self, request):
    """
    Return a family of ngram lists (main, map, stop) with ngram details.

    Inline parameters (read through get_parameters), one of:
      - "corpus": a corpus node id; its MAINLIST, GROUPLIST, STOPLIST and
        MAPLIST children are used, and its OCCURRENCES child too unless an
        explicit "scoring" id is given
      - "mainlist" and "scoring": explicit node ids, optionally completed
        by "groups", "stoplist" and "maplist"
    Optional "head" parameter: only the top `head` ngrams of the mainlist
    are returned, without list memberships nor grouping links.

    The JSON response contains:
      'ngraminfos'  => {ngram_id: remaining columns of the query row}
      'listmembers' => {'maplist': [ids], 'stoplist': [ids]}
      'links'       => the groups of Translations(groups_id), if any
      'nodeids'     => the ids of the nodes that were used

    Raises:
        ValidationException: if neither "corpus" nor the pair
            "mainlist" + "scoring" is present.
    """
    parameters = get_parameters(request)

    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]

        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id

        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        if 'groups' in parameters:
            # the groups node comes from its own parameter, not from
            # the scoring one
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
        )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}     # ngram details sorted per ngram id
    linkinfo = {}      # ngram groups sorted per ngram id
    listmembers = {}   # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        # (simplified form: just the top of the mainlist)
        # TODO add maplist membership
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)
        # infos for grouped ngrams, absent from mainlist
        hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

        # infos for stoplist terms, absent from mainlist
        stop_ngrams_query = query_list(other_list_ids['stoplist'],
                                       details=True,
                                       scoring_metric_id=scores_id)

        # and for the other lists (stop and map)
        # no details needed here, just the member ids
        for li in other_list_ids:
            li_elts = query_list(other_list_ids[li], details=False).all()
            # simple array of ngram_ids
            listmembers[li] = [ng[0] for ng in li_elts]

        # and the groupings
        if groups_id:
            links = Translations(groups_id)
            linkinfo = links.groups

        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    # ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]

    # NB the client js will sort mainlist ngs from hidden ngs after ajax
    #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def put(self, request):
    """
    Basic external access for *creating an ngram*
    ---------------------------------------------

     1 - checks user authentication before any changes

     2 - checks if the ngram already exists in the Ngram table
         if yes, uses its ngram_id (and optionally looks up its mainform_id)
         otherwise inserts it into the Ngram table

     3 - (if corpus param is present)
         indexes the ngram through the docs of the corpus
         (index_new_ngrams)

     4 - returns json with:
         'msg'   => a log of what was done
         'text'  => the initial text content
         'term'  => the normalized text content
         'id'    => the ngram_id (new or preexisting)
         'count' => the result of the indexation
                    (if corpus param is present)
         'group' => the mainform_id if applicable

    possible inline parameters
    --------------------------
    @param    text=<ngram_string>         [required]
    @param    corpus=<CORPUS_ID>          [optional]
    @param    testgroup (true if present) [optional, requires corpus]

    Returns a 401 response if the user is not authenticated or does not
    own the corpus, and a 400 JSON response carrying the error message if
    anything fails during the additions.

    Raises:
        ValidationException: if "text" is missing, or if "testgroup" is
            given without "corpus".
    """
    # 1 - check user authentication
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # the params
    params = get_parameters(request)
    print("PARAMS", [(i,v) for (i,v) in params.items()])

    if 'text' not in params:
        raise ValidationException(
            'The route PUT /api/ngrams/ is used to create a new ngram. '
            'It requires a "text" parameter, '
            'for instance /api/ngrams?text=hydrometallurgy')

    original_text = str(params.pop('text'))
    ngram_str = normalize_forms(normalize_chars(original_text))

    if ('testgroup' in params) and (not ('corpus' in params)):
        raise ValidationException("'testgroup' param requires 'corpus' param")

    # if we have a 'corpus' param (to do the indexing)...
    do_indexation = False
    if 'corpus' in params:
        # we retrieve the corpus...
        corpus_id = int(params.pop('corpus'))
        corpus_node = cache.Node[corpus_id]
        # and the user must also have rights on the corpus
        if request.user.id == corpus_node.user_id:
            do_indexation = True
        else:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

    # number of "words" in the ngram
    ngram_size = len(findall(r' +', ngram_str)) + 1

    # do the additions
    try:
        log_msg = ""
        ngram_id = None
        mainform_id = None

        preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()

        if preexisting is not None:
            ngram_id = preexisting.id
            log_msg += "ngram already existed (id %i)\n" % ngram_id

            # in the context of a corpus we can also check if has mainform
            if 'testgroup' in params:
                grouplist_row = (session.query(Node.id)
                                 .filter(Node.parent_id == corpus_id)
                                 .filter(Node.typename == 'GROUPLIST')
                                 .first()
                                 )
                had_mainform = None
                if grouplist_row is not None:
                    # .first() yields a row: compare the column with the
                    # node id it contains, not with the row itself
                    had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                    .filter(NodeNgramNgram.node_id == grouplist_row[0])
                                    .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                    .first()
                                    )
                if had_mainform:
                    mainform_id = had_mainform[0]
                    log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                else:
                    log_msg += "ngram was not in any group for this corpus"

        else:
            # 2 - insert into Ngrams
            new_ngram = Ngram(terms=ngram_str, n=ngram_size)
            session.add(new_ngram)
            session.commit()
            ngram_id = new_ngram.id
            log_msg += "ngram was added with new id %i\n" % ngram_id

        # 3 - index the term
        if do_indexation:
            n_added = index_new_ngrams([ngram_id], corpus_node)
            log_msg += 'ngram indexed in corpus %i\n' % corpus_id

        return JsonHttpResponse({
            'msg': log_msg,
            'text': original_text,
            'term': ngram_str,
            'id' : ngram_id,
            'group' : mainform_id,
            'count': n_added if do_indexation else 'no corpus provided for indexation'
            }, 200)

    # just in case
    except Exception as e:
        # leave the session usable for the next request after a failed
        # database operation
        session.rollback()
        return JsonHttpResponse({
            'msg': str(e),
            'text': original_text
            }, 400)