Example #1
def query(request):
    """
    Pubmed year by year results

    # alist = [
    # {'string': '2011[dp] serendipity', 'queryKey': '1',
    #  'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
    # {'string': '2012[dp] serendipity', 'queryKey': '1',
    #  'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    #  ... ]

    (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N,
                                                             QUERY_SIZE_N_MAX)
            print("ERROR(scrap: pubmed stats): ", msg)
            raise ValueError(msg)

        print(
            "LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =",
            query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        instancia = Scraper()

        # serialFetcher (n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)
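A minimal usage sketch for the view above; the route and the use of the `requests` library are assumptions, not part of the snippet:

# Hypothetical client call: POST the same fields the view reads from request.POST.
import requests

resp = requests.post("http://localhost:8000/scrapers/pubmed/query",  # assumed route
                     data={"query": "serendipity", "N": "NaN"})      # "NaN" => server picks QUERY_SIZE_N_MAX
print(resp.json())  # per-year query descriptors, as shown in the docstring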
Example #2
    def delete(self, request, corpus_id):
        """
        DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will delete docs 53 and 54 from the favorites of corpus 2)
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node is None:
            response = {
                'warning': "No favorites node is defined for this corpus ('%s')"
                           % self.corpus.name,
                'count_removed': 0,
            }
        else:
            req_params = validate(get_parameters(request), {
                'docs': list,
                'default': ""
            })
            nodeids_to_delete = [
                int(did) for did in req_params['docs'].split(',')
            ]

            # it deletes from favourites but not from DB
            result = session.execute(
                delete(NodeNode).where(NodeNode.node1_id == fav_node.id).where(
                    NodeNode.node2_id.in_(nodeids_to_delete)))
            session.commit()
            response = {'count_removed': result.rowcount}

        return JsonHttpResponse(response)
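A usage sketch matching the docstring's example URL, assuming the `requests` library and an already-authenticated session:

# Hypothetical client call mirroring the DELETE example in the docstring.
import requests

resp = requests.delete("http://localhost:8000/api/nodes/2/favorites",
                       params={"docs": "53,54"},     # comma-separated doc node ids
                       cookies=session_cookies)      # assumed authenticated session cookies
print(resp.json())  # e.g. {'count_removed': 2}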
Example #3
    def patch(self, request, corpusnode_id):
        """
        PATCH triggers recount of metrics for the specified corpus.

        ex PATCH http://localhost:8000/api/metrics/14072
                                                   -----
                                                 corpus_id
        """
        print("==> update metrics request on ", corpusnode_id)

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        try:
            corpus = cache.Node[int(corpusnode_id)]
        except (KeyError, ValueError):
            corpus = None

        if corpus is None:
            raise ValidationException("%s is not a valid corpus node id." %
                                      corpusnode_id)

        else:
            t_before = datetime.now()
            # =============
            scheduled(recount)(corpus.id)
            # =============
            t_after = datetime.now()

            return JsonHttpResponse({
                'corpus_id': corpusnode_id,
                'took': "%f s." % (t_after - t_before).total_seconds(),
            })
Example #4
def save(request, project_id):
    '''save'''
    # defaults, so the fall-through response at the end stays defined
    query = None
    N = 0

    if request.method == "POST":

        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()
        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": "en"
                      })

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records

        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"],
                            # name=source["name"],
                            path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query, N]
    print(data)
    return JsonHttpResponse(data)
Example #5
    def get(self, request):

        parameters = get_parameters(request)
        glance_limit = None
        mainlist_id = None
        scores_id = None
        groups_id = None
        other_list_ids = {'maplist': None, 'stoplist': None}

        # 1) retrieve a mainlist_id and other lists
        ##########################################

        # simple request: just refers to the parent corpus
        # ------------------------------------------------
        if "corpus" in parameters:
            corpus_id = parameters['corpus']
            corpus = cache.Node[corpus_id]
            # with a corpus_id, the explicit scoring pointer is optional
            if "scoring" in parameters:
                scores_id = parameters['scoring']
            else:
                scores_id = corpus.children('OCCURRENCES').first().id
            # retrieve the family of lists that have corpus as parent
            mainlist_id = corpus.children('MAINLIST').first().id
            groups_id = corpus.children('GROUPLIST').first().id
            other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
            other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

        # custom request: refers to each list individually
        # -------------------------------------------------
        elif "mainlist" in parameters and "scoring" in parameters:
            mainlist_id = parameters['mainlist']
            scores_id = parameters['scoring']
            groups_id = None
            if 'groups' in parameters:
                groups_id = parameters['groups']
            for k in ['stoplist', 'maplist']:
                if k in parameters:
                    other_list_ids[k] = parameters[k]

        # or request has an error
        # -----------------------
        else:
            raise ValidationException(
                "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
            )

        # 2) get the infos for each list
        ################################
        ngraminfo = {}  # ngram details sorted per ngram id
        linkinfo = {}  # ngram groups sorted per ngram id
        listmembers = {}  # ngram ids sorted per list name
        if "head" in parameters:
            # head <=> only mainlist AND only k top ngrams
            glance_limit = int(parameters['head'])
            mainlist_query = query_list(mainlist_id,
                                        details=True,
                                        pagination_limit=glance_limit,
                                        scoring_metric_id=scores_id)
        else:
            # infos for all ngrams from mainlist
            mainlist_query = query_list(mainlist_id,
                                        details=True,
                                        scoring_metric_id=scores_id)
            # infos for grouped ngrams, absent from mainlist
            hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

            # infos for stoplist terms, absent from mainlist
            stop_ngrams_query = query_list(other_list_ids['stoplist'],
                                           details=True,
                                           scoring_metric_id=scores_id)

            # and for the other lists (stop and map)
            # no details needed here, just the member ids
            for li in other_list_ids:
                li_elts = query_list(other_list_ids[li], details=False).all()
                # simple array of ngram_ids
                listmembers[li] = [ng[0] for ng in li_elts]

            # and the groupings
            if groups_id:
                links = Translations(groups_id)
                linkinfo = links.groups

        # list of ngrams that will need detailed info
        ngrams_which_need_detailed_info = []
        if "head" in parameters:
            # head triggered simplified form: just the top of the mainlist
            # TODO add maplist membership
            ngrams_which_need_detailed_info = mainlist_query.all()
        else:
            ngrams_which_need_detailed_info = (mainlist_query.all()
                                               + hidden_ngrams_query.all()
                                               + stop_ngrams_query.all())

        # the output form of details is:
        # ngraminfo[id] => [term, weight]
        for ng in ngrams_which_need_detailed_info:
            ng_id = ng[0]
            ngraminfo[ng_id] = ng[1:]

            # NB the client js will sort mainlist ngs from hidden ngs after ajax
            #    using linkinfo (otherwise needs redundant listmembers for main)

        return JsonHttpResponse({
            'ngraminfos': ngraminfo,
            'listmembers': listmembers,
            'links': linkinfo,
            'nodeids': {
                'mainlist': mainlist_id,
                'maplist': other_list_ids['maplist'],
                'stoplist': other_list_ids['stoplist'],
                'groups': groups_id,
                'scores': scores_id,
            }
        })
Example #6
    def get(self, request):
        parameters = get_parameters(request)

        maplist_id = None
        scores_id = None

        if "corpus" in parameters:
            corpus_id = parameters['corpus']
            corpus = cache.Node[corpus_id]
            maplist_id = corpus.children('MAPLIST').first().id
            # with a corpus_id, the explicit scoring pointer is optional
            if "scoring" in parameters:
                scores_id = parameters['scoring']
            else:
                scores_id = corpus.children('OCCURRENCES').first().id

        elif "maplist" in parameters and "scoring" in parameters:
            maplist_id = int(parameters['maplist'])
            scores_id = int(parameters['scoring'])
        else:
            raise ValidationException(
                "A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurences counts"
            )

        ngraminfo = {}  # ngram details sorted per ngram id
        listmembers = {'maplist': []}  # ngram ids sorted per list name

        # infos for all ngrams from maplist
        map_ngrams = query_list(maplist_id,
                                details=True,
                                scoring_metric_id=scores_id).all()

        # ex:  [(8805, 'mean age', 4.0),
        #        (1632, 'activity', 4.0),
        #        (8423, 'present', 2.0),
        #        (2928, 'objective', 2.0)]

        # shortcut to useful function during loop
        add_to_members = listmembers['maplist'].append

        for ng in map_ngrams:
            ng_id = ng[0]
            ngraminfo[ng_id] = ng[1:]

            # maplist ngrams will already be <=> ngraminfos
            # but the client side expects a membership lookup
            # as when there are multiple lists or some groupings
            add_to_members(ng_id)

        return JsonHttpResponse({
            'ngraminfos': ngraminfo,
            'listmembers': listmembers,
            'links': {},  # no grouping links sent during glance (for speed)
            'nodeids': {
                'mainlist': None,
                'maplist': maplist_id,
                'stoplist': None,
                'groups': None,
                'scores': None,
            }
        })
Example #7
    def put(self, request):
        """
        Add some group elements to a group node
          => adds new couples from GroupsBuffer._to_add of terms view

        TODO see use of util.lists.Translations

        Parameters are all in the url (for symmetry with DELETE method)
           api/ngramlists/groups?node=783&1228[]=891,1639
                                     => creates 1228 - 891
                                            and 1228 - 1639

        general format is:   mainform_id[]=subform_id1,subform_id2 etc
                                     => creates mainform_id - subform_id1
                                            and mainform_id - subform_id2

        NB: also checks if the couples exist before because the ngram table
            will send the entire group (old existing links + new links)
        """
        # from the url
        params = get_parameters(request)
        # the node param is unique
        group_node = params.pop('node')
        # the others params are links to change
        couples = self.links_to_couples(params)

        # debug
        # print("==couples from url =================================++++=")
        # print(couples)

        # local version of "insert if not exists" -------------------->8--------
        # (1) check already existing elements
        check_query = (session.query(NodeNgramNgram).filter(
            NodeNgramNgram.node_id == group_node).filter(
                tuple_(NodeNgramNgram.ngram1_id,
                       NodeNgramNgram.ngram2_id).in_(couples)))

        existing = {}
        for synonyms in check_query.all():
            existing[(synonyms.ngram1_id, synonyms.ngram2_id)] = True

        # debug
        #print("==existing")
        #print(existing)

        # (2) compute difference locally
        couples_to_add = [(mform, sform) for (mform, sform) in couples
                          if (mform, sform) not in existing]

        # debug
        # print("== couples_to_add =================================++++=")
        # print(couples_to_add)

        # (3) add new groupings
        bulk_insert(NodeNgramNgram,
                    ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                    ((group_node, mainform, subform, 1.0)
                     for (mainform, subform) in couples_to_add))

        # ------------------------------------------------------------>8--------

        return JsonHttpResponse({
            'count_added': len(couples_to_add),
        }, 200)
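For context, a plausible sketch of the `links_to_couples` helper used above, reconstructed from the documented `mainform_id[]=subform_id1,subform_id2` format; the project's real implementation may differ:

    def links_to_couples(self, params):
        """Hypothetical reconstruction: turns {'1228[]': '891,1639'}
        into [(1228, 891), (1228, 1639)], following the documented format."""
        couples = []
        for mainform_key, subforms_csv in params.items():
            mainform_id = int(mainform_key.replace('[]', ''))
            for subform_id in str(subforms_csv).split(','):
                couples.append((mainform_id, int(subform_id)))
        return couples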
Example #8
    def patch(self, request):
        """
        A copy of POST (list merging), but the source is just an internal corpus_id

        params in request.GET:
            onto_corpus:  the corpus whose lists are getting patched
            from_corpus:  the corpus from which we take the source lists to merge in
            todo:         an array of the list types ("map", "main", "stop") to merge in

        """
        if not request.user.is_authenticated():
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

        params = get_parameters(request)
        print(params)

        # the corpus with the target lists to be patched
        corpus_id = int(params.pop("onto_corpus"))
        corpus_node = cache.Node[corpus_id]

        print(params)

        if request.user.id != corpus_node.user_id:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

        list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

        # internal DB retrieve source_lists
        source_corpus_id = int(params.pop("from_corpus"))
        source_node = cache.Node[source_corpus_id]

        todo_lists = params.pop("todo").split(',')  # ex: ['map', 'stop']
        source_lists = {}
        for key in todo_lists:
            source_lists[key] = UnweightedList(
                source_node.children(list_types[key]).first().id)

        # add the groupings too
        source_lists['groupings'] = Translations(
            source_node.children("GROUPLIST").first().id)

        # attempt to merge and send response
        try:
            # merge the source_lists onto those of the target corpus
            delete = todo_lists if bool(params.get('overwrite')) else []

            if len(delete) == len(list_types):
                delete.append('groupings')

            log_msg = merge_ngramlists(source_lists,
                                       onto_corpus=corpus_node,
                                       del_originals=delete)

            return JsonHttpResponse({
                'log': log_msg,
            }, 200)

        except Exception as e:
            return JsonHttpResponse({
                'err': str(e),
            }, 400)
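A usage sketch for this merge route; the URL, the corpus ids, and `session_cookies` are assumptions:

# Hypothetical client call for the list-merge PATCH described in the docstring.
import requests

resp = requests.patch("http://localhost:8000/api/ngramlists/import",  # assumed route
                      params={"onto_corpus": 14072,  # target corpus (assumed id)
                              "from_corpus": 14073,  # source corpus (assumed id)
                              "todo": "map,stop",    # list types to merge in
                              "overwrite": "1"},     # truthy => delete the original lists first
                      cookies=session_cookies)       # assumed authenticated session cookies
print(resp.json())  # {'log': ...} on success, {'err': ...} otherwise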
Example #9
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node).filter(Node.id == project_id).filter(
        Node.typename == 'PROJECT')).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"

        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": None
                      })

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
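The paging loop above shrinks the final request so that `from + size` never overshoots N; the same computation as a standalone illustration, with no project imports:

# Illustration of the Istex paging logic used above: pages of `pagesize`
# records, the last one truncated so the total never exceeds N.
def istex_pages(N, pagesize=50):
    pages = []
    for offset in range(0, N, pagesize):
        pages.append((offset, min(pagesize, N - offset)))
    return pages

assert istex_pages(120) == [(0, 50), (50, 50), (100, 20)]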
Example #10
def save( request , project_id ) :
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?

    project = session.query( Node ).filter(Node.id == project_id).first()

    if project is None:
        raise Http404()


    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()


    if request.method == "POST":
        queries = request.POST["query"]
        name    = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        #    ===> no need to repeat N parameter like in testISTEX <===

        instancia  = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        alist = ["tudo fixe" , "tudo bem"]


        # corpus node instantiation as a Django model
        corpus = project.add_child( name=name
                                  , typename = "CORPUS"
                                  )

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put( url ) #put a task in the queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0

        for filename in tasks.firstResults :
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource( type = get_resource_by_name('Pubmed [XML]')["type"]
                                   , path = filename
                                   , url  = None
                                   )
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0 :
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
Example #11
    def put(self, request):
        """
        Basic external access for *creating an ngram*
        ---------------------------------------------

         1 - checks user authentication before any changes

         2 - checks if the ngram is already in the Ngram table in DB
              if yes returns ngram_id and optionally mainform_id
              otherwise continues

         3 - adds the ngram to Ngram table in DB

         4 - (if corpus param is present)
             adds the ngram doc counts to NodeNgram table in DB
             (aka "index the ngram" throught the docs of the corpus)

         5 - returns json with:
             'msg'   => a success msg
             'text'  => the initial text content
             'term'  => the normalized text content
             'id'    => the new ngram_id
             'count' => the number of docs with the ngram in the corpus
                        (if corpus param is present)
             'group' => the mainform_id if applicable

        possible inline parameters
        --------------------------
        @param    text=<ngram_string>         [required]
        @param    corpus=<CORPUS_ID>          [optional]
        @param    testgroup (true if present) [optional, requires corpus]
        """

        # 1 - check user authentication
        if not request.user.is_authenticated():
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

        # the params
        params = get_parameters(request)

        print("PARAMS", [(i,v) for (i,v) in params.items()])

        if 'text' in params:
            original_text = str(params.pop('text'))
            ngram_str = normalize_forms(normalize_chars(original_text))
        else:
            raise ValidationException('The route PUT /api/ngrams/ is used to create a new ngram. '
                                      'It requires a "text" parameter, '
                                      'for instance /api/ngrams?text=hydrometallurgy')

        if ('testgroup' in params) and (not ('corpus' in params)):
            raise ValidationException("'testgroup' param requires 'corpus' param")

        # if we have a 'corpus' param (to do the indexing)...
        do_indexation = False
        if 'corpus' in params:
            # we retrieve the corpus...
            corpus_id = int(params.pop('corpus'))
            corpus_node = cache.Node[corpus_id]
            # and the user must also have rights on the corpus
            if request.user.id == corpus_node.user_id:
                do_indexation = True
            else:
                res = HttpResponse("Unauthorized")
                res.status_code = 401
                return res

        # number of "words" in the ngram
        ngram_size = len(findall(r' +', ngram_str)) + 1

        # do the additions
        try:
            log_msg = ""
            ngram_id = None
            mainform_id = None

            preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()

            if preexisting is not None:
                ngram_id = preexisting.id
                log_msg += "ngram already existed (id %i)\n" % ngram_id

                # in the context of a corpus we can also check if it has a mainform
                # (useful for)
                if 'testgroup' in params:
                    groupings_id = (session.query(Node.id)
                                           .filter(Node.parent_id == corpus_id)
                                           .filter(Node.typename == 'GROUPLIST')
                                           .scalar()  # the id itself, not a 1-element row
                                    )
                    had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                          .filter(NodeNgramNgram.node_id == groupings_id)
                                          .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                          .first()
                                    )
                    if had_mainform:
                        mainform_id = had_mainform[0]
                        log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                    else:
                        log_msg += "ngram was not in any group for this corpus"

            else:
                # 2 - insert into Ngrams
                new_ngram = Ngram(terms=ngram_str, n=ngram_size)
                session.add(new_ngram)
                session.commit()
                ngram_id = new_ngram.id
                log_msg += "ngram was added with new id %i\n" % ngram_id

            # 3 - index the term
            if do_indexation:
                n_added = index_new_ngrams([ngram_id], corpus_node)
                log_msg += 'ngram indexed in corpus %i\n' % corpus_id

            return JsonHttpResponse({
                'msg': log_msg,
                'text': original_text,
                'term': ngram_str,
                'id' : ngram_id,
                'group' : mainform_id,
                'count': n_added if do_indexation else 'no corpus provided for indexation'
                }, 200)

        # just in case
        except Exception as e:
            return JsonHttpResponse({
                'msg': str(e),
                'text': original_text
                }, 400)
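A usage sketch for the route documented above; the host, the corpus id, and `session_cookies` are assumptions:

# Hypothetical client call for PUT /api/ngrams.
import requests

resp = requests.put("http://localhost:8000/api/ngrams",
                    params={"text": "hydrometallurgy",
                            "corpus": 14072,         # assumed corpus id
                            "testgroup": "true"},    # also report the mainform, if any
                    cookies=session_cookies)         # assumed authenticated session cookies
print(resp.json())  # {'msg': ..., 'term': ..., 'id': ..., 'group': ..., 'count': ...}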
Example #12
    def get(self, request, project_id, corpus_id):
        '''
        Graph.get :: Get graph data as REST api.
        Get all the parameters first
        graph?field1=ngrams&field2=ngrams&
        graph?field1=ngrams&field2=ngrams&start=''&end=''

        NB  save new graph mode
          (option saveOnly=True without a cooc_id)
           can return the new cooc id in the json
           before counting + filling data in async
        '''

        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # Get the node we are working with
        corpus = session.query(Node).filter(Node.id==corpus_id).first()


        # TODO Parameters to save in hyperdata of the Node Cooc
        # WARNING: we could factorize the parameters as dict but ...
        #         ...  it causes a bug in asynchronous function !
        # Check celery upgrades before.
        # Example (for the future):
        #        parameters = dict()
        #        parameters['field1'] = field1
        #        parameters['field2'] = field2

        # Get all the parameters in the URL
        cooc_id      = request.GET.get     ('cooc_id'   , None         )
        saveOnly     = request.GET.get     ('saveOnly'  , None         )

        field1       = str(request.GET.get ('field1'    , 'ngrams'     ))
        field2       = str(request.GET.get ('field2'    , 'ngrams'     ))

        start        = request.GET.get     ('start'     , None         )
        end          = request.GET.get     ('end'       , None         )

        mapList_id   = int(request.GET.get ('mapList'   , 0            ))
        groupList_id = int(request.GET.get ('groupList' , 0            ))

        threshold    = int(request.GET.get ('threshold' , 1            ))
        bridgeness   = int(request.GET.get ('bridgeness', -1           ))
        format_      = str(request.GET.get ('format'    , 'json'       ))
        type_        = str(request.GET.get ('type'      , 'node_link'  ))
        distance     = str(request.GET.get ('distance'  , 'conditional'))

        # Get default map List of corpus
        if mapList_id == 0 :
            mapList_id = ( session.query ( Node.id )
                                    .filter( Node.typename  == "MAPLIST"
                                           , Node.parent_id == corpus.id
                                           )
                                    .first()
                          )

            if mapList_id is None :
                raise ValueError("MAPLIST node needed for cooccurrences")

            mapList_id = mapList_id[0]


        # Get default value if no group list
        if groupList_id  == 0 :
            groupList_id  = ( session.query ( Node.id )
                                     .filter( Node.typename  == "GROUPLIST"
                                            , Node.parent_id == corpus.id
                                            )
                                     .first()
                            )

            if groupList_id is None :
                raise ValueError("GROUPLIST node needed for cooccurrences")

            groupList_id  = groupList_id[0]


        # Declare accepted fields
        accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
        accepted_field2 = ['ngrams',                               ]
        options         = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]


        try:
            # Check if parameters are accepted
            if (field1 in accepted_field1) and (field2 in accepted_field2):
                data = get_graph( corpus=corpus, cooc_id = cooc_id
                               , field1=field1           , field2=field2
                               , mapList_id = mapList_id , groupList_id = groupList_id
                               , start=start             , end=end
                               , threshold =threshold
                               , distance=distance       , bridgeness=bridgeness
                               , saveOnly=saveOnly
                               )


                # data :: Either (Dic Nodes Links) (Dic State Length)

                # data_test :: Either String Bool
                data_test = data.get("state", True)

                if data_test is True:
                    # empty data case (checked here so that `data` is always bound)
                    if len(data["nodes"]) < 2 and len(data["links"]) < 2:
                        return JsonHttpResponse({
                            'msg': '''Empty graph warning
                                      No cooccurrences found in this corpus for the words of this maplist
                                      (maybe add more terms to the maplist or increase the size of your
                                      corpus ?)''',
                            }, status=400)

                    # normal case --------------------------------
                    if format_ == 'json':
                        return JsonHttpResponse(
                                 compress_graph(data),
                                 status=200
                               )
                    # --------------------------------------------

                else:
                    # All other cases (more probable are higher in the if list)

                    if data["state"] == "saveOnly":
                        # async data case
                        link = "http://%s/projects/%d/corpora/%d/myGraphs" % (request.get_host(), corpus.parent_id, corpus.id)
                        return JsonHttpResponse({
                            'id': data["target_id"],
                            'name': data["target_name"],
                            'date': data["target_date"],
                            'msg': '''Your graph is being saved:
                                      %s
                                      ''' % format_html(link)
                            }, status=200)

                    elif data["state"] == "corpusMin":
                        # async data case
                        link = "http://%s/projects/%d/" % (request.get_host(), corpus.parent_id)
                        return JsonHttpResponse({
                            'msg': '''Problem: your corpus is too small (only %d documents).

                                      Solution: Add more documents (more than %d documents)
                                      in order to get a graph.

                                      You can manage your corpus here:
                                      %s
                                      ''' % ( data["length"]
                                            , graph_constraints['corpusMin']
                                            , format_html(link)
                                            ),
                            }, status=400)

                    elif data["state"] == "mapListError":
                        # async data case
                        link = 'http://%s/projects/%d/corpora/%d/terms' % (request.get_host(), corpus.parent_id, corpus.id)
                        return JsonHttpResponse({
                            'msg': '''Problem: your map list is too small (currently %d terms).

                                      Solution: Add some terms (more than %d terms)
                                      in order to get a graph.

                                      You can manage your map terms here:
                                      %s
                                      ''' % ( data["length"]
                                            , graph_constraints['mapList']
                                            , format_html(link)
                                            ),
                            }, status=400)


                    elif data["state"] == "corpusMax":
                        # async data case
                        link = 'http://%s/projects/%d/corpora/%d/myGraphs' % (request.get_host(), corpus.parent_id, corpus.id)
                        return JsonHttpResponse({
                            'msg': '''Warning: Async graph generation since your corpus is
                                      big (about %d documents).

                                      Wait a while and discover your graph very soon.

                                      Click on the link below and see your current graph
                                      processing on top of the list:

                                      %s
                                      ''' % (data["length"], format_html(link)),
                            }, status=200)



                    else :
                       return JsonHttpResponse({
                            'msg': '''Programming error.''',
                            }, status=400)


            elif len(data["nodes"]) < 2 and len(data["links"]) < 2:
                # empty data case
                return JsonHttpResponse({
                    'msg': '''Empty graph warning
                              No cooccurences found in this corpus for the words of this maplist
                              (maybe add more terms to the maplist or increase the size of your
                              corpus ?)''',
                    }, status=400)

            else:
                # parameters error case
                return JsonHttpResponse({
                    'msg': '''Usage warning
                              Please choose only one field from each range:
                                - "field1": %s
                                - "field2": %s
                                - "options": %s''' % (accepted_field1, accepted_field2, options)
                    }, status=400)

        # for any other errors that we forgot to test
        except Exception as error:
            print(error)
            return JsonHttpResponse({
                'msg' : 'Unknown error (showing the trace):\n%s' % "\n".join(format_tb(error.__traceback__))
                }, status=400)
Example #13
    def get(self, request):
        corpus_id_list = list(map(int, request.GET['corpus_id'].split(',')))
        return JsonHttpResponse({
            'data': get_metadata(corpus_id_list),
        })
Example #14
    def post(self, request, project_id):

        # example only

        input = request.data or {
            'x': {
                'with_empty': True,
                'resolution': 'decade',
                'value': 'publication_date',
            },
            'y': {
                # 'divided_by': 'total_ngrams_count',
                # 'divided_by': 'total_documents_count',
            },
            'filter': {
                # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
                # 'ngrams': ['insecticide', 'pesticide'],
                # 'corpora': [52633],
                # 'date': {'min': '1995-12-31'}
            },
            # 'format': 'csv',
        }
        print(input)
        # input validation
        input = validate(
            input,
            {
                'type': dict,
                'default': {},
                'items': {
                    'x': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # which hyperdata to choose for the date
                            'value': {
                                'type': str,
                                'default': 'publication_date',
                                'range': {
                                    'publication_date',
                                }
                            },
                            # time resolution
                            'resolution': {
                                'type': str,
                                'range': self._resolutions.keys(),
                                'default': 'month'
                            },
                            # should we add zeroes for empty values?
                            'with_empty': {
                                'type': bool,
                                'default': False
                            },
                        }
                    },
                    'y': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # measured value
                            'value': {
                                'type': str,
                                'default': 'ngrams_count',
                                'range': {
                                    'ngrams_count', 'documents_count',
                                    'ngrams_tfidf'
                                }
                            },
                            # value by which we should normalize
                            'divided_by': {
                                'type': str,
                                'range': {
                                    'total_documents_count', 'documents_count',
                                    'total_ngrams_count'
                                }
                            },
                        }
                    },
                    # filtering
                    'filter': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # filter by metadata
                            'hyperdata': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': dict,
                                    'items': {
                                        'key': {
                                            'type': str,
                                            # hyperdata keys, as used below via INDEXED_HYPERDATA
                                            'range': INDEXED_HYPERDATA.keys()
                                        },
                                        'operator': {
                                            'type': str,
                                            'range': self._operators.keys()
                                        },
                                        'value': {
                                            'type': str
                                        },
                                    }
                                }
                            },
                            # filter by date
                            'date': {
                                'type': dict,
                                'items': {
                                    'min': {
                                        'type': datetime.datetime
                                    },
                                    'max': {
                                        'type': datetime.datetime
                                    },
                                },
                                'default': {}
                            },
                            # filter by corpora
                            'corpora': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': int
                                }
                            },
                            # filter by ngrams
                            'ngrams': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': str
                                }
                            },
                        }
                    },
                    # output format
                    'format': {
                        'type': str,
                        'default': 'json',
                        'range': {'json', 'csv'}
                    },
                }
            })
        # build query: prepare columns
        X = aliased(NodeHyperdata)
        column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
        column_y = {
            'documents_count': func.count(Node.id.distinct()),
            'ngrams_count': func.sum(NodeNgram.weight),
            # 'ngrams_tfidf':     func.sum(NodeNodeNgram.weight),
        }[input['y']['value']]
        # build query: base
        print(input)
        query_base = (
            session.query(column_x).select_from(Node).join(
                NodeNgram, NodeNgram.node_id == Node.id).join(
                    X, X.node_id == NodeNgram.node_id)
            #.filter(X.key == input['x']['value'])
            .group_by(column_x).order_by(column_x))
        # build query: base, filter by corpora or project
        if 'corpora' in input['filter'] and input['filter']['corpora']:
            query_base = (query_base.filter(
                Node.parent_id.in_(input['filter']['corpora'])))
        else:
            ParentNode = aliased(Node)
            query_base = (query_base.join(
                ParentNode, ParentNode.id == Node.parent_id).filter(
                    ParentNode.parent_id == project_id))
        # build query: base, filter by date
        if 'date' in input['filter']:
            if 'min' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value >= input['filter']['date']['min'])
            if 'max' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value <= input['filter']['date']['max'])
        # build query: filter by ngrams
        query_result = query_base.add_columns(column_y)
        if 'ngrams' in input['filter'] and input['filter']['ngrams']:
            query_result = (query_result.join(
                Ngram, Ngram.id == NodeNgram.ngram_id).filter(
                    Ngram.terms.in_(input['filter']['ngrams'])))
        # build query: filter by metadata
        if 'hyperdata' in input['filter']:
            for h, hyperdata in enumerate(input['filter']['hyperdata']):
                print(h, hyperdata)
                # get hyperdata in database
                #if hyperdata_model is None:
                #    continue
                #hyperdata_id, hyperdata_type = hyperdata_model
                # create alias and query it
                operator = self._operators[hyperdata['operator']]
                type_string = type2string(
                    INDEXED_HYPERDATA[hyperdata['key']]['type'])
                value = self._converters[type_string](hyperdata['value'])
                query_result = (query_result.join(
                    NodeHyperdata,
                    NodeHyperdata.node_id == NodeNgram.node_id).filter(
                        NodeHyperdata.key == hyperdata['key']).filter(
                            operator(NodeHyperdata.value, value)))
        # build result: prepare data
        date_value_list = query_result.all()
        #print(date_value_list)

        if date_value_list:
            date_min = date_value_list[0][0].replace(tzinfo=None)
            date_max = date_value_list[-1][0].replace(tzinfo=None)  # last row
        # build result: prepare interval
        result = collections.OrderedDict()
        if input['x']['with_empty'] and date_value_list:
            compute_next_date = self._resolutions[input['x']['resolution']]
            date = date_min
            while date <= date_max:
                result[date] = 0.0
                date = compute_next_date(date)
        # build result: integrate
        for date, value in date_value_list:
            result[date.replace(tzinfo=None)] = value
        # build result: normalize
        query_normalize = None
        if (date_value_list and 'divided_by' in input['y']
                and input['y']['divided_by']):
            if input['y']['divided_by'] == 'total_documents_count':
                query_normalize = query_base.add_column(
                    func.count(Node.id.distinct()))
            elif input['y']['divided_by'] == 'total_ngrams_count':
                query_normalize = query_base.add_column(
                    func.sum(NodeNgram.weight))
        if query_normalize is not None:
            for date, value in query_normalize:
                date = date.replace(tzinfo=None)
                if date in result:
                    result[date] /= value
        # return result with proper formatting
        if input['format'] == 'json':
            return JsonHttpResponse(
                {
                    'query': input,
                    'result': sorted(result.items()),
                }, 201)
        elif input['format'] == 'csv':
            return CsvHttpResponse(sorted(result.items()), ('date', 'value'),
                                   201)
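The view leans on two helper mappings: `self._resolutions` (the validation range for 'resolution' and the `compute_next_date` stepper) and `self._operators` (the hyperdata filters). A minimal sketch of plausible shapes for them, assuming `dateutil`; the project's real definitions may differ:

from dateutil.relativedelta import relativedelta

# Hypothetical shapes for the mappings referenced as self._resolutions and
# self._operators above (names match their usage; bodies are assumptions).
_resolutions = {
    'decade': lambda date: date + relativedelta(years=10),
    'year':   lambda date: date + relativedelta(years=1),
    'month':  lambda date: date + relativedelta(months=1),
    'day':    lambda date: date + relativedelta(days=1),
}
_operators = {
    '=':  lambda field, value: (field == value),
    '!=': lambda field, value: (field != value),
    '>':  lambda field, value: (field > value),
    '<':  lambda field, value: (field < value),
}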