def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    The parameter ``query`` in the GET request is inspected first: if it
    parses as an arXiv ID, the response is a redirect to the abstract page.
    Otherwise the request is handled by :class:`.SimpleSearchForm`.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
        GET parameters. An immutable multidict is copied into a mutable one,
        because ``query`` may be rewritten (classic ``surname_f`` syntax).
    archives : list
        A list of archives within which the search should be performed.
        ``None`` means no archive restriction.

    Returns
    -------
    dict
        Search result response data (empty for redirects).
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.NotFound`
        Raised when ``archives`` is an empty list.
    :class:`.BadRequest`
        Raised when the ``order`` or ``size`` form fields are invalid, or
        when the index reports the requested range as not allowed.
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    # Imported locally so the redirect fix below does not depend on the
    # module's import block.
    from urllib.parse import urlencode

    if archives is not None and not archives:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]

    logger.debug('simple search request')
    arxiv_id: Optional[str] = None
    if 'query' in request_params:
        try:
            # First check whether the query is an arXiv ID.
            arxiv_id = identifier.parse_arxiv_id(request_params['query'])
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')
        else:
            logger.debug('got arXiv ID: %s', arxiv_id)

    # If the query was an arXiv ID, redirect to its abstract page.
    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # The user's query is URL-encoded before it is placed in the
        # Location header; otherwise characters such as ``&``, ``#`` or
        # spaces would truncate or corrupt the redirected query.

        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            location = '/help/search?' + urlencode({'q': form.query.data})
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': location}

        # Support classic "experimental" search
        if form.searchtype.data == 'full_text':
            location = 'http://search.arxiv.org:8081/?' \
                + urlencode({'in': '', 'query': form.query.data})
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': location}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        # NOTE(review): "[email protected]" in the messages below looks like
        # an obfuscated e-mail address; confirm the intended contact address.
        try:
            # Execute the search. We'll use the results directly in
            # template rendering, so they get added directly to the
            # response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            # this will clear up relatively quickly (next request), or
            # there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e
        except Exception as e:
            # Log and re-raise unchanged so upstream handlers see the
            # original exception.
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                f"It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                f" over</a>.")
        q = None

    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
def to_python(self, value: str) -> str:
    """
    Convert the URL path segment into its Python representation.

    The segment is handed to ``identifier.parse_arxiv_id`` and the parsed
    result is returned. A :class:`ValueError` from the parser is re-raised
    as a :class:`ValidationError`.
    """
    try:
        parsed = identifier.parse_arxiv_id(value)
    except ValueError as exc:
        raise ValidationError('Not a valid arXiv ID') from exc
    return parsed
def query(
    params: MultiDict,
) -> Tuple[ClassicSearchResponseData, HTTPStatus, Dict[str, Any]]:
    """
    Handle a search request from the classic API.

    The classic request parameters are read and validated as follows:

    - ``search_query``: passed through unchanged as the query string.
    - ``id_list``: comma-separated list; every entry must be accepted by
      ``parse_arxiv_id``. An empty or missing value becomes ``None``.
    - ``max_results``: result size, a non-negative integer (default 10).
    - ``start``: result offset, a non-negative integer (default 0).
    - ``sortBy`` / ``sortOrder``: must be valid :class:`SortBy` /
      :class:`SortDirection` values (defaults: relevance, descending).

    The validated values are bundled into a :class:`ClassicAPIQuery`, which
    is executed through the current index search session.

    Parameters
    ----------
    params : :class:`MultiDict`
        GET query parameters from the request.

    Returns
    -------
    ClassicSearchResponseData
        Response data (to serialize): the document set and the query.
    HTTPStatus
        HTTP status code (``HTTPStatus.OK``).
    dict
        Extra headers for the response (empty).

    Raises
    ------
    :class:`ValidationError`
        Raised when ``id_list`` contains an invalid ID, when
        ``max_results`` or ``start`` is not a non-negative integer, or when
        ``sortBy`` / ``sortOrder`` is not a recognised value.
    :class:`BadRequest`
        Raised when building the :class:`ClassicAPIQuery` fails with a
        ``ValueError``.

    """
    params = params.copy()

    # Classic search query string (may be absent).
    search_query = params.get("search_query", None)

    # Split the ID list and verify every entry is a well-formed arXiv ID.
    raw_id_list = params.get("id_list", "")
    if not raw_id_list:
        id_list = None
    else:
        id_list = raw_id_list.split(",")
        for candidate in id_list:
            try:
                parse_arxiv_id(candidate)
            except ValueError:
                raise ValidationError(
                    message="incorrect id format for {}".format(candidate),
                    link=(
                        "http://arxiv.org/api/errors#"
                        "incorrect_id_format_for_{}"
                    ).format(candidate),
                )

    # Result size.
    try:
        max_results = int(params.get("max_results", 10))
    except ValueError:
        raise ValidationError(
            message="max_results must be an integer",
            link="http://arxiv.org/api/errors#max_results_must_be_an_integer",
        )
    if max_results < 0:
        raise ValidationError(
            message="max_results must be non-negative",
            link="http://arxiv.org/api/errors#max_results_must_be_"
            "non-negative",
        )

    # Result start point.
    try:
        start = int(params.get("start", 0))
    except ValueError:
        raise ValidationError(
            message="start must be an integer",
            link="http://arxiv.org/api/errors#start_must_be_an_integer",
        )
    if start < 0:
        raise ValidationError(
            message="start must be non-negative",
            link="http://arxiv.org/api/errors#start_must_be_non-negative",
        )

    # Sort field and sort direction.
    try:
        sort_by = SortBy(params.get("sortBy", SortBy.relevance))
    except ValueError:
        raise ValidationError(
            message=f"sortBy must be in: {', '.join(SortBy)}",
            link="https://arxiv.org/help/api/user-manual#sort",
        )
    try:
        sort_direction = SortDirection(
            params.get("sortOrder", SortDirection.descending)
        )
    except ValueError:
        raise ValidationError(
            message=f"sortOrder must be in: {', '.join(SortDirection)}",
            link="https://arxiv.org/help/api/user-manual#sort",
        )

    # Both the sort order and the query object are built inside the ``try``
    # so a ValueError from either is reported as a bad request.
    try:
        sort_order = SortOrder(by=sort_by, direction=sort_direction)
        classic_query = ClassicAPIQuery(
            order=sort_order,
            search_query=search_query,
            id_list=id_list,
            size=max_results,
            page_start=start,
        )
    except ValueError:
        raise BadRequest(
            "Either a search_query or id_list must be specified"
            " for the classic API."
        )

    # Pass to the search indexer, which will handle parsing.
    session = index.SearchSession.current_session()
    document_set: DocumentSet = session.search(classic_query)
    logger.debug(
        "Got document set with %i results", len(document_set["results"])
    )

    response_data = ClassicSearchResponseData(
        results=document_set, query=classic_query
    )
    return response_data, HTTPStatus.OK, {}