def test_limit_results(reset_db):
    # Insert documents in the DB
    for idx, doc_text in enumerate(_DOCUMENT_TEXTS):
        DocumentStore.create_document(_LOGIN, 'Title of doc %d' % idx, 'test',
                                      doc_text)

    # an exact-match query returns a perfect score; compare the full result
    # list against per-document limits
    all_matched_results = list(
        DocumentStore.search_chunks(_LOGIN, 'one plus one makes two'))
    limited_matched_results = list(
        DocumentStore.search_chunks(_LOGIN,
                                    'one plus one makes two',
                                    limit_per_doc=1))
    assert len(all_matched_results) > len(limited_matched_results)
    assert len(limited_matched_results) == 1
    assert (all_matched_results[0].matched_content ==
            limited_matched_results[0].matched_content)
    limited_matched_results = list(
        DocumentStore.search_chunks(_LOGIN,
                                    'one plus one makes two',
                                    limit_per_doc=2))
    assert len(all_matched_results) > len(limited_matched_results)
    assert len(limited_matched_results) == 2
    assert (all_matched_results[0].matched_content ==
            limited_matched_results[0].matched_content)
    assert (all_matched_results[1].matched_content ==
            limited_matched_results[1].matched_content)

def test_documents():
    DocumentStore.create_document('fake-user',
                                  'doc1',
                                  'Test document',
                                  'This is a test',
                                  replace=True,
                                  document_id='doc1')
    DocumentStore.get_documents('fake-user')  # listing should not raise
    response = Responder.get_answers_from_documents('fake-user',
                                                    'What is this?',
                                                    document_ids=['doc1'])
    pprint(response)
    assert response[0]['sourceId'] == 'doc1'

def delete_all_user_data(user_id):
    """Delete user, depending on global settings this will be in production or info DB."""
    user = User.get('user_id', user_id)
    if user is None:
        raise UserException(ERROR_USER_DOES_NOT_EXIST % user_id)
    documents = DocumentStore.get_documents(user.token)
    for document in documents:
        DocumentStore.delete_document(user.token, document['id'])
    saved_replies = AnnotationStore.get_annotations(user.token,
                                                    saved_replies=True)
    for saved_reply in saved_replies:
        AnnotationStore.delete_annotation(user.token, saved_reply['id'])
    annotations = AnnotationStore.get_annotations(user.token,
                                                  saved_replies=False)
    for annotation in annotations:
        AnnotationStore.delete_annotation(user.token, annotation['id'])

    user.delete_instance()
    info("User " + user_id + " data deleted successfully")
    info("Looking for session data")
    del_counter = 0
    sessions = Session.all('user_id', user_id)
    for session in sessions:
        session.delete_instance()
        del_counter += 1
    info("Deleted " + str(del_counter) + " sessions")
    del_counter = 0
    events = Event.all('user_id', user_id)
    for event in events:
        event.delete_instance()
        del_counter += 1
    info("Deleted " + str(del_counter) + " events")
    del_counter = 0
    bots = Bot.all('user_id', user_id)
    for bot in bots:
        bot.delete_instance()
        del_counter += 1
    info("Deleted " + str(del_counter) + " bots")
    del_counter = 0
    coverage_entries = Coverage.all('user_id', user_id)
    for coverage in coverage_entries:
        coverage.delete_instance()
        del_counter += 1
    info("Deleted " + str(del_counter) + " coveragae entries")
    del_counter = 0
    email_events = EmailEvent.all('user_id', user_id)
    for event in email_events:
        event.delete_instance()
        del_counter += 1
    info("Deleted " + str(del_counter) + " email events")
def test_exact_match(reset_db):
    # Insert documents in the DB
    for idx, doc_text in enumerate(_DOCUMENT_TEXTS):
        DocumentStore.create_document(_LOGIN, 'Title of doc %d' % idx, 'test',
                                      doc_text)

    # exact match is expected to return a perfect score
    matched_results = list(
        DocumentStore.search_chunks(_LOGIN, 'one plus one makes two'))
    assert matched_results[0].matched_content == 'one plus one makes two'
    assert matched_results[0].matched_score == 1.0

    # a close match is expected once punctuation and surrounding spaces are stripped
    matched_results = list(
        DocumentStore.search_chunks(_LOGIN, ' one ,plus+ one makes two !!'))
    assert matched_results[0].matched_content == 'one plus one makes two'
    assert matched_results[0].matched_score == 0.99
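
The 0.99 score on the noisy query suggests the store strips punctuation and surrounding whitespace before matching. Below is a minimal sketch of that kind of normalization, as an assumption about the behaviour rather than DocumentStore's actual implementation:

import re

def normalize_query(query):
    # drop punctuation, collapse runs of whitespace, trim the ends
    # (hypothetical helper; not part of DocumentStore)
    without_punctuation = re.sub(r'[^\w\s]', ' ', query)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

assert normalize_query(' one ,plus+ one makes two !!') == 'one plus one makes two'
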
def test_chunk_search(reset_db):
    get_embeddings_function = len  # in prod this would be the embedding generation function

    # Insert documents in the DB
    for idx, doc_text in enumerate(_DOCUMENT_TEXTS):
        DocumentStore.create_document(_LOGIN,
                                      'Title of doc %d' % idx,
                                      'test',
                                      doc_text,
                                      get_embedding=get_embeddings_function)

    # Get chunk search results with a single SQL query; use this when all you
    # need is the number of results, the text content and/or confidence scores
    matched_results = list(
        DocumentStore.search_chunks(_LOGIN, 'who were the normans?'))
    assert len(matched_results) == 8
    assert " were descended from Norse " in matched_results[0].matched_content

    assert matched_results[0].matched_score > matched_results[1].matched_score
    assert matched_results[1].matched_score > 0.0

    # Reader workers can make an extra SQL query to retrieve the fields of DocumentChunk as strings
    # {'chunk_idx': '0',
    # 'document_id': 'dd5c8526091ce0e937a062da23833808b4e54d9ce41cdc101173265b6a718bbd',
    # 'embedding': '742',
    # 'number_of_words': '113',
    # 'overlap_after': '',  # empty because there is no text after this chunk
    # 'overlap_before': '', # empty because there is no text before this chunk
    # 'text_span': '[0, 742]',
    # 'unique_id': "('*****@*****.**', "
    #              "'dd5c8526091ce0e937a062da23833808b4e54d9ce41cdc101173265b6a718bbd')",
    # 'user_id': '*****@*****.**'}
    fields = matched_results[0].get_indexable_string_fields()
    # in prod this would be the string form of an embedding array
    assert fields['embedding'] == str(len(matched_results[0].matched_content))

    # Only if absolutely necessary (and never while machine reading) you can
    # retrieve the full DocumentRecord object, which includes all chunks;
    # this is "slow" because the record is unpickled from the DB
    document_record: DocumentRecord = matched_results[0].get_retrievable()
    assert document_record.unique_id == str(
        (document_record.user_id, document_record.document_id))
    assert document_record.text == _DOCUMENT_TEXTS[0]
    assert isinstance(next(iter(document_record.chunks.values())),
                      DocumentChunk)
    assert len(document_record.chunks) == 1
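
test_chunk_search above walks through three access tiers of increasing cost. Here is a condensed sketch of how a reader worker might move through them; every name is taken from the test itself:

# Tier 1: a single SQL query; enough for counts, text and confidence scores
results = list(DocumentStore.search_chunks(_LOGIN, 'who were the normans?'))
best = results[0]
print(best.matched_score, best.matched_content)

# Tier 2: one extra SQL query; DocumentChunk fields as indexable strings
fields = best.get_indexable_string_fields()

# Tier 3: the full DocumentRecord with all chunks; "slow" because it is
# unpickled from the DB, so avoid it during machine reading
record = best.get_retrievable()
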
def _upload_document(request):
    user_token = request['user'].token
    title = required_parameter(request, 'title')
    if 'text' in request['args']:
        document_content = request['args']['text']
        document_type = 'text'
    elif 'file' in request.files:
        document_file = request.files.get('file')
        document_content = document_file.body.decode()
        document_type = 'file'
    else:
        # presumably renders as "'text' or 'file'" once substituted into the
        # quoted message template
        raise UserException(ERROR_REQUIRED_PARAMETER % "text' or 'file")

    if 'documentid' in request['args'] and request['args']['documentid'] != '':
        document_id = request['args']['documentid']
    else:
        document_id = sha256(document_content.encode('utf-8')).hexdigest()

    if 'origin' in request['args'] and request['args']['origin'] != '':
        origin = request['args']['origin']
    else:
        origin = ''

    document_type = optional_parameter(request, 'type', document_type)
    replace = 'replace' in request['args'] and request['args']['replace'].lower() == 'true'

    DocumentStore.create_document(user_id=user_token,
                                  document_id=document_id,
                                  title=title,
                                  text=document_content,
                                  origin=origin,
                                  document_type=document_type,
                                  replace=replace,
                                  get_embedding=Responder.get_document_embeddings)

    return {'documentId': document_id}
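
A client-side sketch of calling this handler over HTTP. The endpoint URL and auth header are assumptions for illustration; only the field names ('title', 'text' or 'file', 'documentid', 'origin', 'type', 'replace') come from the handler above:

import requests

response = requests.post(
    'https://api.example.com/documents',               # hypothetical URL
    headers={'Authorization': 'Bearer <user-token>'},  # hypothetical auth
    data={'title': 'My doc', 'text': 'Some content', 'replace': 'true'})
print(response.json())  # e.g. {'documentId': '<sha256 of the content>'}
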
def test_simple_search(reset_db):
    for idx, doc_text in enumerate(_DOCUMENT_TEXTS):
        DocumentStore.create_document(_LOGIN, 'Title of doc %d' % idx, 'test',
                                      doc_text)

    assert len(DocumentStore.get_documents(_LOGIN)) == 9
    assert len(DocumentStore.get_documents(_LOGIN, search_term='normans')) == 1
    assert len(DocumentStore.get_documents(_LOGIN,
                                           search_term='f80q35jf98')) == 0
    assert DocumentStore.get_documents(
        _LOGIN, search_term='one')[0]['text'] == _DOCUMENT_TEXTS[3]

    def get_answers_from_documents(
        user_token: str,
        question: str,
        document_ids: Optional[List[str]] = None,
        offset: int = 0,
        number_of_items: int = 1,
        text: Optional[str] = None,
        threshold: str = 'MEDIUM',
        speed_or_accuracy: str = 'balanced',
    ) -> List[dict]:
        """
        Returns answers from a user's documents

        :param user_token:      User's ID token
        :param question:        Question in string format
        :param document_ids:    Limit search to specified document IDs
        :param text:            Search for an answer in the given text
        """
        results = []
        if text is not None:
            temp_id = 'Inline text-' + sha256(text.encode('utf-8')).hexdigest()
            DocumentStore.create_document(user_token,
                                          "Inline text",
                                          "Inline text",
                                          text,
                                          document_id=temp_id,
                                          replace=True)
            if document_ids is not None:
                document_ids.append(temp_id)
            else:
                document_ids = [temp_id]
        speed_or_accuracy_coef = SPEED_OR_ACCURACY_CHUNKS_MAP[
            speed_or_accuracy]
        if speed_or_accuracy_coef > 0:
            # scale the per-document chunk budget with the requested number
            # of answers and the worker count
            limit_per_doc = int(
                ceil(number_of_items * NUM_WORKERS_PER_REQUEST *
                     speed_or_accuracy_coef))
        else:
            # a non-positive coefficient disables the limit, favouring
            # accuracy over speed
            limit_per_doc = None
        chunk_results = list(
            DocumentStore.search_chunks(user_token,
                                        question,
                                        document_ids=document_ids,
                                        limit_per_doc=limit_per_doc))
        if len(chunk_results) == 0:
            # We don't have any matching documents
            return []
        worker_chunks = Responder.split_chunks(chunk_results,
                                               NUM_WORKERS_PER_REQUEST)
        respond = partial(Responder.machine_reader_logits, question)
        future_answers: List = connect().map(respond, worker_chunks)
        machine_reader_configuration = Responder.get_machine_reader_configuration(
            offset, number_of_items)
        reduced_answers = connect().submit(Responder.reduce_results,
                                           future_answers,
                                           machine_reader_configuration,
                                           worker_chunks)
        results.extend(reduced_answers.result())
        if text is not None:
            DocumentStore.delete_document(user_token, temp_id)

        threshold_value = THRESHOLD_MAP['document'].get(
            threshold, THRESHOLD_MAP['document']['MEDIUM'])

        results = list(
            filter(lambda reply: reply['confidence'] >= threshold_value,
                   results))

        return results
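
A minimal usage sketch mirroring test_documents above; the question and document id are illustrative, and user_token is assumed to hold a valid token:

answers = Responder.get_answers_from_documents(user_token,
                                               'What is this?',
                                               document_ids=['doc1'],
                                               number_of_items=3,
                                               threshold='MEDIUM')
for answer in answers:
    # 'sourceId' as asserted in test_documents; 'confidence' as used by the
    # threshold filter above
    print(answer['sourceId'], answer['confidence'])
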
def cleanup():
    for annotation in AnnotationStore.get_annotations('fake-user'):
        AnnotationStore.delete_annotation('fake-user', annotation['id'])
    for document in DocumentStore.get_documents('fake-user'):
        DocumentStore.delete_document('fake-user', document['id'])

def test_create_and_delete(reset_db):
    created1 = DocumentStore.create_document(_LOGIN, 'first doc', 'test',
                                             _DOCUMENT_TEXTS[0])
    assert created1 == {
        'documentId':
        'dd5c8526091ce0e937a062da23833808b4e54d9ce41cdc101173265b6a718bbd'
    }
    assert DocumentStore.get_documents(_LOGIN) == DocumentStore.get_documents(
        _LOGIN, document_ids=[created1['documentId']])

    created = DocumentStore.create_document(_LOGIN,
                                            'first doc',
                                            'test',
                                            _DOCUMENT_TEXTS[-1],
                                            document_id='bla.txt')
    assert created == {'documentId': 'bla.txt'}
    assert len(DocumentStore.get_documents(_LOGIN)) == 2
    assert DocumentStore.get_documents(
        _LOGIN,
        document_ids=[created['documentId']])[0]['text'] == _DOCUMENT_TEXTS[-1]

    with pytest.raises(UserException):  # same document_id
        DocumentStore.create_document(_LOGIN,
                                      'first docs',
                                      'test',
                                      _DOCUMENT_TEXTS[1],
                                      document_id='bla.txt')
    with pytest.raises(UserException):  # same auto generated document_id
        DocumentStore.create_document(_LOGIN, 'first docs', 'test',
                                      _DOCUMENT_TEXTS[0])
    with pytest.raises(UserException):  # does not exist
        DocumentStore.delete_document('bla', 'bla')
    deleted = DocumentStore.delete_document(_LOGIN, 'bla.txt')
    assert deleted['documentId'] == 'bla.txt'
    assert DocumentStore.get_documents(_LOGIN) == DocumentStore.get_documents(
        _LOGIN, document_ids=[created1['documentId']])
    with pytest.raises(UserException):  # already deleted
        DocumentStore.delete_document(_LOGIN, 'bla.txt')
    deleted = DocumentStore.delete_document(_LOGIN, created1['documentId'])
    assert deleted['documentId'] == created1['documentId']
    assert len(DocumentStore.get_documents(_LOGIN)) == 0
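
The auto-generated id asserted in test_create_and_delete is consistent with the fallback in _upload_document below: the SHA-256 hex digest of the document text. A quick check, assuming create_document applies the same rule:

from hashlib import sha256

doc_id = sha256(_DOCUMENT_TEXTS[0].encode('utf-8')).hexdigest()
# matches created1['documentId'] above
assert doc_id == 'dd5c8526091ce0e937a062da23833808b4e54d9ce41cdc101173265b6a718bbd'
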
def _stats(request):
    user_id = request['user'].user_id
    events = Event.select().where(Event.user_id == user_id).order_by(Event.created.desc())
    total = events.count()
    automatic = 0
    assisted = 0
    unanswered = 0
    total_duration = 0
    average_response_time = 0
    questions = []
    source_count = {}
    coverage = []

    # Find total number of saved replies
    total_saved_replies = len(AnnotationStore.get_annotations(request['user'].token, saved_replies=True))

    # Find total number of documents and index them by id for title lookups
    documents = DocumentStore.get_documents(request['user'].token)
    total_documents = len(documents)
    documents_by_id = {document['id']: document for document in documents}

    for event in events:
        question = {
            'created': event.created.isoformat(),
            'duration': event.duration,
            'question': event.question
        }
        if event.answered:
            answer = event.answers[0]
            total_duration += event.duration
            question['answer'] = answer['answerText']
            if answer['sourceType'] == 'saved_reply':
                automatic += 1
                question['status'] = 'automatic'
                question['matchedQuestion'] = answer['matchedQuestion']
            else:
                assisted += 1
                question['status'] = 'assisted'
                source_count[answer['sourceId']] = source_count.get(
                    answer['sourceId'], 0) + 1
        else:
            unanswered += 1
            question['status'] = 'unanswered'
        questions.append(question)

    source_count['saved_reply'] = automatic
    source_count['unanswered'] = unanswered
    sources = sorted(source_count.items(), key=lambda x: x[1], reverse=True)
    sources_percent = []
    if total > 0:
        for source in sources:
            if source[0] == 'saved_reply':
                document_title = 'Saved replies'
            elif source[0] == 'unanswered':
                document_title = 'Unanswered'
            else:
                document_title = source[0]
                if source[0] in documents_by_id:
                    if documents_by_id[source[0]]['title']:
                        document_title = documents_by_id[source[0]]['title']
                else:
                    document_title = 'Deleted document'
            sources_percent.append({'source': source[0], 'title': document_title, 'percent': (source[1] / total) * 100})
        average_response_time = total_duration / total

    coverage_stats = Coverage.select().where(Coverage.user_id == user_id)
    for stat in coverage_stats:
        coverage.append({'coverage': stat.coverage, 'time': stat.created})

    return {'averageResponseTime': average_response_time, 'totalSavedReplies': total_saved_replies,
            'totalDocuments': total_documents, 'totalQuestions': total, 'automatic': automatic, 'assisted': assisted,
            'unanswered': unanswered, 'sources': sources_percent, 'questions': questions, 'coverage': coverage}
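
For reference, the payload returned by _stats has the shape below; every value is an illustrative placeholder, not real data:

example_stats = {
    'averageResponseTime': 2.5,   # placeholder values throughout
    'totalSavedReplies': 4,
    'totalDocuments': 2,
    'totalQuestions': 10,
    'automatic': 3,
    'assisted': 5,
    'unanswered': 2,
    'sources': [{'source': 'doc1', 'title': 'Doc one', 'percent': 50.0}],
    'questions': [{'created': '2021-01-01T00:00:00', 'duration': 2.5,
                   'question': 'What is this?', 'status': 'assisted',
                   'answer': '...'}],
    'coverage': [{'coverage': 0.8, 'time': '2021-01-01T00:00:00'}],
}
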
def _delete_document(request):
    user_token = request['user'].token
    document_id = required_parameter(request, 'documentId')
    DocumentStore.delete_document(user_token, document_id)
    return {'documentId': document_id}

def _get_documents(request, number_of_items=30, offset=0, document_ids=None):
    user_token = request['user'].token
    search_term = optional_parameter(request, 'searchTerm', None)
    documents = DocumentStore.get_documents(user_token, document_ids=document_ids, search_term=search_term)
    return {'totalItems': len(documents), 'items': documents[offset:offset + number_of_items]}
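
A quick illustration of the pagination contract, assuming a prepared, authenticated request object and 45 matching documents (a placeholder count): totalItems reports the full match count because it is computed before slicing, while items holds at most one page.

page = _get_documents(request, number_of_items=15, offset=30)
assert page['totalItems'] == 45   # placeholder: the full match count
assert len(page['items']) <= 15   # at most number_of_items documents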