示例#1
0
    def _requires_auth(request, *args, **kwargs):
        if not request.user.is_authenticated():
            url = '/auth/login/?next=%s' % urlencode(request.path)
            return redirect(url)
        try:
            # normal return the subfunction when user ok
            return func(request, *args, **kwargs)

        # user was authenticated but something made the session expire
        except DetachedInstanceError as die:
            print("===\n Detached instance error: trying to rollback session")
            print(die)
            from gargantext.util.db import session
            session.rollback()
            print("=== session rollback ok!")
            # re init the global cache (it must still have detached instances)
            from gargantext.util.db_cache import cache
            cache.clean_all()
            print("=== cache reinit ok!")
            # and relogin for safety
            url = '/auth/login/?next=%s' % urlencode(request.path)
            return redirect(url)
示例#2
0
def save(request, project_id):
    '''save'''
    if request.method == "POST":

        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()
        # corpus node instanciation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": "en"
                      })

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records

        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(
            type=source["type"]
            #,  name = source["name"]
            ,
            path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
示例#3
0
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node).filter(Node.id == project_id).filter(
        Node.typename == 'PROJECT')).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"

        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N: pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instanciation as a Django model

        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": None
                      })

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  #put a task in th queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
示例#4
0
def save( request , project_id ) :
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?

    project = session.query( Node ).filter(Node.id == project_id).first()

    if project is None:
        raise Http404()


    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()


    if request.method == "POST":
        queries = request.POST["query"]
        name    = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        #    ===> no need to repeat N parameter like in testISTEX <===

        instancia  = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        alist = ["tudo fixe" , "tudo bem"]


        # corpus node instanciation as a Django model
        corpus = project.add_child( name=name
                                  , typename = "CORPUS"
                                  )

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put( url ) #put a task in the queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0

        for filename in tasks.firstResults :
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource( type = get_resource_by_name('Pubmed [XML]')["type"]
                                   , path = filename
                                   , url  = None
                                   )
                print("Adding the resource")
                dwnldsOK+=1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0 :
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)