Пример #1
0
    def get(self, request, corpus_id):

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        parameters = get_parameters(request)
        parameters = validate(parameters, {'score': str, 'ngram_ids': list})

        try:
            ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
        except:
            raise ValidationException(
                '"ngram_ids" needs integers separated by comma.')

        limit = DEFAULT_N_DOCS_HAVING_NGRAM
        nodes_list = []

        corpus = session.query(Node).filter(Node.id == corpus_id).first()

        tfidf_id = (session.query(Node.id).filter(
            Node.typename == "TFIDF-CORPUS",
            Node.parent_id == corpus.id).first())

        tfidf_id = tfidf_id[0]
        print(tfidf_id)
        # request data
        nodes_query = (session.query(Node, func.sum(NodeNodeNgram.score)).join(
            NodeNodeNgram, NodeNodeNgram.node2_id == Node.id).filter(
                NodeNodeNgram.node1_id == tfidf_id).filter(
                    Node.typename == 'DOCUMENT',
                    Node.parent_id == corpus.id).filter(
                        or_(*[
                            NodeNodeNgram.ngram_id == ngram_id
                            for ngram_id in ngram_ids
                        ])).group_by(Node))

        # get the total count before applying limit
        nodes_count = nodes_query.count()

        # now the query with the limit
        nodes_results_query = (nodes_query.order_by(
            func.sum(NodeNodeNgram.score).desc()).limit(limit))

        for node, score in nodes_results_query:
            print(node, score)
            print("\t corpus:", corpus_id, "\t", node.name)
            node_dict = {
                'id': node.id,
                'score': score,
            }
            for key in ('title', 'publication_date', 'source', 'authors',
                        'fields'):
                if key in node.hyperdata:
                    node_dict[key] = node.hyperdata[key]
            nodes_list.append(node_dict)

        return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
Пример #2
0
    def put(self, request, corpus_id, check_each_doc=True):
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node == None:
            response = {
                'warning':
                'No favorites node is defined for this corpus (\'%s\')' %
                self.corpus.name,
                'count_added':
                0
            }
        else:
            req_params = validate(get_parameters(request), {
                'docs': list,
                'default': ""
            })
            nodeids_to_add = [
                int(did) for did in req_params['docs'].split(',')
            ]

            if check_each_doc:
                # verification que ce sont bien des documents du bon corpus
                # un peu long => désactiver par défaut ?
                known_docs_q = (session.query(
                    Node.id).filter(Node.parent_id == corpus_id).filter(
                        Node.typename == 'DOCUMENT'))
                lookup = {
                    known_doc.id: True
                    for known_doc in known_docs_q.all()
                }
                # debug
                # print("lookup hash", lookup)
                rejected_list = []
                for doc_node_id in nodeids_to_add:
                    if (doc_node_id not in lookup):
                        rejected_list.append(doc_node_id)
                if len(rejected_list):
                    raise ValidationException(
                        "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                        % (str(rejected_list), int(corpus_id)))

            # add them
            bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                        ((fav_node.id, doc_node_id, 1.0)
                         for doc_node_id in nodeids_to_add))

            # todo count really added (here: counts input param not result)
            response = {'count_added': len(nodeids_to_add)}

        return JsonHttpResponse(response)
Пример #3
0
    def get(self, request, corpus_id):
        """
        2 possibilities with/without param

        1) GET http://localhost:8000/api/nodes/2/favorites
        (returns the full list of fav docs within corpus 2)

        2) GET http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will test if docs 53 and 54 are among the favorites of corpus 2)
        (returns the intersection of fav docs with [53,54])
        """

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        fav_node = self._get_fav_node(corpus_id)

        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })

        response = {}

        if fav_node == None:
            response = {
                'warning':
                'No favorites node is defined for this corpus (\'%s\')' %
                self.corpus.name,
                'favdocs': []
            }
        elif 'docs' not in req_params:
            # each docnode associated to the favnode of this corpusnode
            q = (session.query(
                NodeNode.node2_id).filter(NodeNode.node1_id == fav_node.id))
            all_doc_ids = [row.node2_id for row in q.all()]
            response = {'favdocs': all_doc_ids}
        else:
            nodeids_to_check = [
                int(did) for did in req_params['docs'].split(',')
            ]

            # each docnode from the input list, if it is associated to the favnode
            q = (session.query(NodeNode.node2_id).filter(
                NodeNode.node1_id == fav_node.id).filter(
                    NodeNode.node2_id.in_(nodeids_to_check)))
            present_doc_ids = [row.node2_id for row in q.all()]
            absent_doc_ids = [
                did for did in nodeids_to_check if did not in present_doc_ids
            ]
            response = {'favdocs': present_doc_ids, 'missing': absent_doc_ids}

        return JsonHttpResponse(response)
Пример #4
0
    def get(self, request, node_id):
        # check that the node is a corpus
        #   ? faster from cache than: corpus = session.query(Node)...

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        corpus = cache.Node[node_id]
        if corpus.typename != 'CORPUS':
            raise ValidationException(
                "Only nodes of type CORPUS can accept facet queries" +
                " (but this node has type %s)..." % corpus.typename)
        else:
            self.corpus = corpus

        # check that the hyperfield parameter makes sense
        _facet_available_subfields = [
            'source', 'publication_year', 'rubrique', 'language_iso2',
            'language_iso3', 'language_name', 'authors'
        ]
        parameters = get_parameters(request)

        # validate() triggers an info message if subfield not in range
        parameters = validate(
            parameters, {
                'type': dict,
                'items': {
                    'hyperfield': {
                        'type': str,
                        'range': _facet_available_subfields
                    }
                }
            })

        subfield = parameters['hyperfield']

        # do the aggregated sum
        (xcounts, total) = self._ndocs_by_facet(subfield)

        # response
        return JsonHttpResponse({
            'doc_count': total,
            'by': {
                subfield: xcounts
            }
        })
Пример #5
0
    def delete(self, request):
        """Removes the list of nodes corresponding to the query.
        TODO : Should be a delete method!
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        parameters = get_parameters(request)
        parameters = validate(parameters, {'ids': list})
        try:
            node_ids = [int(n) for n in parameters['ids'].split(',')]
        except:
            raise ValidationException(
                '"ids" needs integers separated by comma.')

        result = session.execute(delete(Node).where(Node.id.in_(node_ids)))
        session.commit()

        return JsonHttpResponse({'deleted': result.rowcount})
Пример #6
0
    def delete(self, request, corpus_id):
        """
        DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will delete docs 53 and 54 from the favorites of corpus 2)
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node == None:
            response = {
                'warning':
                'No favorites node is defined for this corpus (\'%s\')' %
                self.corpus.name,
                'count_removed':
                0
            }
        else:
            req_params = validate(get_parameters(request), {
                'docs': list,
                'default': ""
            })
            nodeids_to_delete = [
                int(did) for did in req_params['docs'].split(',')
            ]
            try:
                # it deletes from favourites but not from DB
                result = session.execute(
                    delete(NodeNode).where(
                        NodeNode.node1_id == fav_node.id).where(
                            NodeNode.node2_id.in_(nodeids_to_delete)))
                session.commit()
                response = {'count_removed': result.rowcount}
            finally:
                session.close()
        return JsonHttpResponse(response)
Пример #7
0
def _query_nodes(request, node_id=None):

    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")
    else:
        # we query among the nodes that belong to this user
        user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)

    parameters = validate(
        parameters,
        {
            'type': dict,
            'items': {
                'formated': {
                    'type': str,
                    'required': False,
                    'default': 'json'
                },
                'pagination_limit': {
                    'type': int,
                    'default': 10
                },
                'pagination_offset': {
                    'type': int,
                    'default': 0
                },
                'fields': {
                    'type': list,
                    'default': _node_default_fields,
                    'items': {
                        'type': str,
                        'range': _node_available_fields,
                    }
                },
                # choice of hyperdata fields
                'hyperdata_filter': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _hyperdata_available_fields,
                    }
                },
                # optional filtering parameters
                'types': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _node_available_types,
                    }
                },
                'parent_id': {
                    'type': int,
                    'required': False
                },
            }
        })

    # debug
    # print('PARAMS', parameters)

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException(
            "Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)
    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))
    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])
    # count
    count = query.count()
    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query
    if parameters['pagination_limit'] == -1:
        query = query[parameters['pagination_offset']:]
    else:
        query = query[
            parameters['pagination_offset']:parameters['pagination_limit']]
    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
Пример #8
0
    def post(self, request, project_id):

        # example only

        input = request.data or {
            'x': {
                'with_empty': True,
                'resolution': 'decade',
                'value': 'publication_date',
            },
            'y': {
                # 'divided_by': 'total_ngrams_count',
                # 'divided_by': 'total_documents_count',
            },
            'filter': {
                # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
                # 'ngrams': ['insecticide', 'pesticide'],
                # 'corpora': [52633],
                # 'date': {'min': '1995-12-31'}
            },
            # 'format': 'csv',
        }
        print(input)
        # input validation
        input = validate(
            input,
            {
                'type': dict,
                'default': {},
                'items': {
                    'x': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # which hyperdata to choose for the date
                            'value': {
                                'type': str,
                                'default': 'publication_date',
                                'range': {
                                    'publication_date',
                                }
                            },
                            # time resolution
                            'resolution': {
                                'type': str,
                                'range': self._resolutions.keys(),
                                'default': 'month'
                            },
                            # should we add zeroes for empty values?
                            'with_empty': {
                                'type': bool,
                                'default': False
                            },
                        }
                    },
                    'y': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # mesured value
                            'value': {
                                'type': str,
                                'default': 'ngrams_count',
                                'range': {
                                    'ngrams_count', 'documents_count',
                                    'ngrams_tfidf'
                                }
                            },
                            # value by which we should normalize
                            'divided_by': {
                                'type': str,
                                'range': {
                                    'total_documents_count', 'documents_count',
                                    'total_ngrams_count'
                                }
                            },
                        }
                    },
                    # filtering
                    'filter': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # filter by metadata
                            'hyperdata': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': dict,
                                    'items': {
                                        'key': {
                                            'type': str,
                                            'range': self._operators.keys()
                                        },
                                        'operator': {
                                            'type': str
                                        },
                                        'value': {
                                            'type': str
                                        },
                                    }
                                }
                            },
                            # filter by date
                            'date': {
                                'type': dict,
                                'items': {
                                    'min': {
                                        'type': datetime.datetime
                                    },
                                    'max': {
                                        'type': datetime.datetime
                                    },
                                },
                                'default': {}
                            },
                            # filter by corpora
                            'corpora': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': int
                                }
                            },
                            # filter by ngrams
                            'ngrams': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': str
                                }
                            },
                        }
                    },
                    # output format
                    'format': {
                        'type': str,
                        'default': 'json',
                        'range': {'json', 'csv'}
                    },
                }
            })
        # build query: prepare columns
        X = aliased(NodeHyperdata)
        column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
        column_y = {
            'documents_count': func.count(Node.id.distinct()),
            'ngrams_count': func.sum(NodeNgram.weight),
            # 'ngrams_tfidf':     func.sum(NodeNodeNgram.weight),
        }[input['y']['value']]
        # build query: base
        print(input)
        query_base = (
            session.query(column_x).select_from(Node).join(
                NodeNgram, NodeNgram.node_id == Node.id).join(
                    X, X.node_id == NodeNgram.node_id)
            #.filter(X.key == input['x']['value'])
            .group_by(column_x).order_by(column_x))
        # build query: base, filter by corpora or project
        if 'corpora' in input['filter'] and input['filter']['corpora']:
            query_base = (query_base.filter(
                Node.parent_id.in_(input['filter']['corpora'])))
        else:
            ParentNode = aliased(Node)
            query_base = (query_base.join(
                ParentNode, ParentNode.id == Node.parent_id).filter(
                    ParentNode.parent_id == project_id))
        # build query: base, filter by date
        if 'date' in input['filter']:
            if 'min' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value >= input['filter']['date']['min'])
            if 'max' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value <= input['filter']['date']['max'])
        # build query: filter by ngrams
        query_result = query_base.add_columns(column_y)
        if 'ngrams' in input['filter'] and input['filter']['ngrams']:
            query_result = (query_result.join(
                Ngram, Ngram.id == NodeNgram.ngram_id).filter(
                    Ngram.terms.in_(input['filter']['ngrams'])))
        # build query: filter by metadata
        if 'hyperdata' in input['filter']:
            for h, hyperdata in enumerate(input['filter']['hyperdata']):
                print(h, hyperdata)
                # get hyperdata in database
                #if hyperdata_model is None:
                #    continue
                #hyperdata_id, hyperdata_type = hyperdata_model
                # create alias and query it
                operator = self._operators[hyperdata['operator']]
                type_string = type2string(
                    INDEXED_HYPERDATA[hyperdata['key']]['type'])
                value = self._converters[type_string](hyperdata['value'])
                query_result = (query_result.join(
                    NodeHyperdata,
                    NodeHyperdata.node_id == NodeNgram.node_id).filter(
                        NodeHyperdata.key == hyperdata['key']).filter(
                            operator(NodeHyperdata.value, value)))
        # build result: prepare data
        date_value_list = query_result.all()
        #print(date_value_list)

        if date_value_list:
            date_min = date_value_list[0][0].replace(tzinfo=None)
            date_max = date_value_list[-2][0].replace(tzinfo=None)
        # build result: prepare interval
        result = collections.OrderedDict()
        if input['x']['with_empty'] and date_value_list:
            compute_next_date = self._resolutions[input['x']['resolution']]
            date = date_min
            while date <= date_max:
                result[date] = 0.0
                date = compute_next_date(date)
        # build result: integrate
        for date, value in date_value_list[0:-1]:
            result[date.replace(tzinfo=None)] = value
        # build result: normalize
        query_normalize = None
        if date_value_list and 'divided_by' in input['y'] and input['y'][
                'divided_by']:
            if input['y']['divided_by'] == 'total_documents_count':
                query_normalize = query_base.add_column(
                    func.count(Node.id.distinct()))
            elif input['y']['divided_by'] == 'total_ngrams_count':
                query_normalize = query_base.add_column(
                    func.sum(NodeNgram.weight))
        if query_normalize is not None:
            for date, value in query_normalize[0:-1]:
                date = date.replace(tzinfo=None)
                if date in result:
                    result[date] /= value
        # return result with proper formatting
        if input['format'] == 'json':
            return JsonHttpResponse(
                {
                    'query': input,
                    'result': sorted(result.items()),
                }, 201)
        elif input['format'] == 'csv':
            return CsvHttpResponse(sorted(result.items()), ('date', 'value'),
                                   201)