Exemplo n.º 1
0
def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    The search term is read from the ``query`` parameter of the GET request.
    It may be a match value for a search query, or an arXiv ID; when it
    parses as an arXiv ID the response is a redirect to that paper.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
        Request parameters. An immutable instance is copied into a mutable
        one, because the ``query`` value may be rewritten here.
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.NotFound`
        Raised when ``archives`` is given but is empty.
    :class:`.BadRequest`
        Raised when the ``order`` or ``size`` form fields are invalid, or
        when the index reports the requested range as not allowed.
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    # Imported locally so that this function does not depend on a
    # module-level import being present.
    from urllib.parse import quote_plus

    if archives is not None and not archives:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data: Dict[str, Any] = {}

    logger.debug('simple search request')
    # First check whether the query is itself an arXiv ID.
    arxiv_id: Optional[str] = None
    if 'query' in request_params:
        try:
            arxiv_id = identifier.parse_arxiv_id(request_params['query'])
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')
        else:
            logger.debug('got arXiv ID: %s', arxiv_id)

    # If so, redirect.
    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # The query is user input that ends up inside a URL query string, so
        # it must be percent-encoded; otherwise characters such as ``&``,
        # ``#`` or spaces would corrupt the redirect target.
        encoded_query = quote_plus(form.query.data)

        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': f'/help/search?q={encoded_query}'}

        # Support classic "experimental" search
        if form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={encoded_query}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        try:
            # Execute the search. We'll use the results directly in
            #  template rendering, so they get added directly to the
            #  response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            #  this will clear up relatively quickly (next request), or
            #  there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again.  If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e

        except Exception as e:
            # Log anything unexpected, then let it propagate unchanged.
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                "It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                " over</a>.")
        # Any other validation error is rendered with the form itself, so
        # there is no query to report.
        q = None
    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
Exemplo n.º 2
0
 def to_python(self, value: str) -> str:
     """
     Convert a URL path part into its Python representation (str).

     The value is handed to ``identifier.parse_arxiv_id``; a ``ValueError``
     from that call is re-raised as a ``ValidationError``.
     """
     try:
         parsed_id = identifier.parse_arxiv_id(value)
     except ValueError as err:
         raise ValidationError('Not a valid arXiv ID') from err
     return parsed_id
Exemplo n.º 3
0
def _parse_non_negative_int(
    params: MultiDict, name: str, default: int
) -> int:
    """Read parameter ``name`` as an integer >= 0, or raise ValidationError."""
    try:
        number = int(params.get(name, default))
    except ValueError as err:
        raise ValidationError(
            message=f"{name} must be an integer",
            link=f"http://arxiv.org/api/errors#{name}_must_be_an_integer",
        ) from err
    if number < 0:
        raise ValidationError(
            message=f"{name} must be non-negative",
            link=f"http://arxiv.org/api/errors#{name}_must_be_non-negative",
        )
    return number


def query(
    params: MultiDict,
) -> Tuple[ClassicSearchResponseData, HTTPStatus, Dict[str, Any]]:
    """
    Handle a search request from the Classic API.

    The classic request parameters are validated and mapped onto a
    :class:`ClassicAPIQuery`:

    - ``search_query`` -> ``search_query``
    - ``id_list`` (comma-separated) -> ``id_list``
    - ``start`` -> ``page_start`` (default 0)
    - ``max_results`` -> ``size`` (default 10)
    - ``sortBy`` / ``sortOrder`` -> ``order``

    The query is then executed by the current index search session. How
    ``search_query`` and ``id_list`` are combined is decided there, not in
    this function.

    Parameters
    ----------
    params : :class:`MultiDict`
        GET query parameters from the request.

    Returns
    -------
    ClassicSearchResponseData
        Response data (to serialize).
    HTTPStatus
        HTTP status code.
    dict
        Extra headers for the response.

    Raises
    ------
    :class:`ValidationError`
        Raised when an entry of ``id_list`` is not a valid arXiv ID, when
        ``max_results`` or ``start`` is not a non-negative integer, or when
        ``sortBy`` or ``sortOrder`` is not a recognized value.
    :class:`BadRequest`
        Raised when the search_query and id_list are not specified.

    """
    params = params.copy()

    # Parse classic search query.
    search_query = params.get("search_query", None)

    # Parse id_list; an absent or empty value means "no ID filter".
    id_list = None
    raw_id_list = params.get("id_list", "")
    if raw_id_list:
        id_list = raw_id_list.split(",")
        # Check arxiv id validity
        for arxiv_id in id_list:
            try:
                parse_arxiv_id(arxiv_id)
            except ValueError as err:
                raise ValidationError(
                    message="incorrect id format for {}".format(arxiv_id),
                    link=(
                        "http://arxiv.org/api/errors#"
                        "incorrect_id_format_for_{}"
                    ).format(arxiv_id),
                ) from err

    # Parse result size and result start point.
    max_results = _parse_non_negative_int(params, "max_results", 10)
    start = _parse_non_negative_int(params, "start", 0)

    # sort by and sort order
    value = params.get("sortBy", SortBy.relevance)
    try:
        sort_by = SortBy(value)
    except ValueError as err:
        raise ValidationError(
            message=f"sortBy must be in: {', '.join(SortBy)}",
            link="https://arxiv.org/help/api/user-manual#sort",
        ) from err
    value = params.get("sortOrder", SortDirection.descending)
    try:
        sort_direction = SortDirection(value)
    except ValueError as err:
        raise ValidationError(
            message=f"sortOrder must be in: {', '.join(SortDirection)}",
            link="https://arxiv.org/help/api/user-manual#sort",
        ) from err

    try:
        classic_query = ClassicAPIQuery(
            order=SortOrder(by=sort_by, direction=sort_direction),
            search_query=search_query,
            id_list=id_list,
            size=max_results,
            page_start=start,
        )
    except ValueError as err:
        raise BadRequest(
            "Either a search_query or id_list must be specified"
            " for the classic API."
        ) from err

    # pass to search indexer, which will handle parsing
    document_set: DocumentSet = index.SearchSession.current_session().search(
        classic_query
    )
    logger.debug(
        "Got document set with %i results", len(document_set["results"])
    )

    return (
        ClassicSearchResponseData(results=document_set, query=classic_query),
        HTTPStatus.OK,
        {},
    )