Example No. 1
def test_update_content(app):
    # Setup a content document
    input_doc = {
        "data": {
            "id": "test_id",
            "url": "some test url 123",
            "content": {
                "title": "some_test_title",
                "keywords": [{
                    "keyword": "change_from_test",
                    "confidence": 0.2010
                }],
                "texts": ["some test text"]
            }
        }
    }
    prod_col = Config.get_mongo_collection("prod")
    manual_col = Config.get_mongo_collection("manual")
    factory.post_document(input_doc["data"].copy(), prod_col)

    try:
        # Make a change
        new_title = "title has been changed"
        input_doc["data"]["content"]["title"] = new_title
        response = app.test_client().post('/v1/web/content/',
                                          data=json.dumps(input_doc))
        response = app.test_client().get('/v1/web/content/?id=test_id')
        response_doc = json.loads(response.data.decode())

        print(response_doc)
        assert response_doc["manual"]["title"] == new_title
    finally:
        # Delete test content
        factory.delete_document({"id": "test_id"}, manual_col)
        factory.delete_document({"id": "test_id"}, prod_col)
Example No. 2
    def set_db(self):
        """ Set the working database """
        url, port = Config.get_db_connection()

        user, password = Config.get_mongo_db_credentials()
        db = Config.get_mongo_db()

        self._set_database(url, db, user, password, port)
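
For context, the other examples in this listing obtain the factory singleton and call set_db() before touching any collection. A minimal usage sketch, reusing only names that appear elsewhere in this listing:

factory = ModelFactory.get_instance()
factory.set_db()

# Once the database is set, collection names are resolved via the config.
prod_col = Config.get_mongo_collection("prod")
documents = factory.get_collection(prod_col).find()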
Example No. 3
    def visit_node(self, data, model_template, models, title=None):
        """ Recursively traverse the children and create new Contents from
        paragraphs. """
        accepted_tags = Config.get_value(["model", "accepted_tags"])

        for child in data:
            if "children" in child:
                title_text = "{} - {}".format(title, child["text"]) \
                             if title else child["text"]
                self.visit_node(child["children"], model_template,
                                models, title=title_text)

            elif child["tag"] in accepted_tags:
                # Hit a leaf node in recursion tree. We extract the text here
                # and continue.
                keywords = [KeyWord(*kw)
                            for kw in get_keywords(self.__vectorizer,
                                                   self.__feature_names,
                                                   "{} {}"
                                                   .format(title,
                                                           child["text"]))]

                content = Content(title, child["text"],
                                  child["links"], keywords)
                new_model = copy.deepcopy(model_template)
                new_model["id"] = child["id"]
                new_model["content"] = content.get_content()
                models.append(new_model)

        return models
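
For orientation, a rough, hypothetical sketch of the node structure visit_node expects; the field names are simply the keys the method accesses, every concrete value is made up:

# Illustrative input only: inner nodes carry "children", leaf nodes carry a
# tag from accepted_tags plus "text", "links" and "id".
data = [
    {
        "tag": "h1",
        "id": "0a1b2c3d-1",
        "text": "Tuition fees",
        "links": [],
        "children": [
            {
                "tag": "p",
                "id": "0a1b2c3d-2",
                "text": "There is no tuition fee.",
                "links": [],
            },
        ],
    },
]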
Example No. 4
def test_get_docs_from_url(app):
    # Setup a content document
    input_doc = {
        "data": {
            "id": "test_id_for_url",
            "url": "some test url",
            "content": {
                "title": "some_test_title",
                "keywords": [{
                    "keyword": "change_from_test",
                    "confidence": 0.2010
                }],
                "texts": ["some test text"]
            }
        }
    }
    prod_col = Config.get_mongo_collection("prod")
    factory.post_document(input_doc["data"].copy(), prod_col)

    try:
        response = app.test_client().get('/v1/web/docs/?url=some test url')
        response_json = json.loads(response.data.decode())
        assert response_json[0]["id"] == "test_id_for_url"
    finally:
        factory.delete_document({"id": "test_id_for_url"}, prod_col)
Example No. 5
def test_get_all_conflicts(app):
    # Setup two conflicts
    conflicts = [{
        "conflict_id": "test_conflict_id_{}".format(i),
        "title": "test_conflict_title_{}".format(i)
    } for i in range(2)]

    conflict_col = Config.get_mongo_collection("conflicts")
    # Post both documents to the conflicts collection
    for conflict in conflicts:
        factory.post_document(conflict, conflict_col)

    try:
        response = app.test_client().get('/v1/web/conflict_ids')
        response_json = json.loads(response.data.decode())

        for conflict in conflicts:
            assert conflict["conflict_id"] in [
                resp["id"] for resp in response_json
            ]
    finally:
        # Delete test conflicts
        for conflict in conflicts:
            factory.delete_document({"conflict_id": conflict["conflict_id"]},
                                    conflict_col)
Example No. 6
def check_manually_changed(factory, document):
    if document["manually_changed"]:
        doc_id = document["id"]
        manual_col = Config.get_mongo_collection("manual")
        return next(
            factory.get_database().get_collection(manual_col).find(
                {"id": doc_id}),
            None)
    else:
        return document
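
A minimal usage sketch, assuming factory is an initialized ModelFactory (as in the other examples) and that documents in the prod collection carry the manually_changed flag; the id value is hypothetical:

prod_col = Config.get_mongo_collection("prod")
document = factory.get_database().get_collection(prod_col).find_one(
    {"id": "some_id"})  # hypothetical id
if document is not None:
    document = check_manually_changed(factory, document)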
Example No. 7
    def get_document(self,
                     query,
                     prod_col=Config.get_mongo_collection("prod"),
                     manual_col=Config.get_mongo_collection("manual"),
                     number_of_docs=30):
        """
        Searches for documents using MongoDB in a given document collection.
        Get 15 results from prod. Get 15 from Manual.
        Go through every doc in prod and delete the ones with
        manually_changed=true.  Then return every remaining document, remember
        it's not sorted now, but for what we need it for this is not necessary.
        """
        prod_col = self.get_collection(prod_col)
        cursor = prod_col.find({'$text': {
            '$search': query
        }}, {'score': {
            '$meta': 'textScore'
        }})
        # Sort and retrieve some of the top scoring documents.
        cursor.sort([('score', {'$meta': 'textScore'})]).limit(number_of_docs)

        docs = []
        for doc in cursor:
            if doc["manually_changed"] is False:
                docs.append(doc)

        manual_col = self.get_collection(manual_col)
        cursor = manual_col.find({'$text': {
            '$search': query
        }}, {'score': {
            '$meta': 'textScore'
        }})
        # Sort and retrieve some of the top scoring documents.
        cursor.sort([('score', {'$meta': 'textScore'})]).limit(number_of_docs)
        for doc in cursor:
            docs.append(doc)

        return docs
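
Note that the $text / textScore queries above require a text index on each collection; presumably this is what factory.set_index() in Example No. 10 sets up. A hedged sketch of building such an index directly with pymongo (the indexed field names are assumptions):

prod_col = Config.get_mongo_collection("prod")
collection = factory.get_collection(prod_col)
# Assumption: free-text search should cover the content title and texts.
collection.create_index([("content.title", "text"), ("content.texts", "text")])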
Example No. 8
def _handle_not_found(query_text):
    '''
    Inserts this specific query text into the unknown queries collection as
    well as returning a fallback string.
    '''
    try:
        unknown_col = Config.get_mongo_collection("unknown")
        factory.get_database().get_collection(unknown_col).insert_one(
            {"query_text": query_text})
    except pymongo.errors.DuplicateKeyError:
        # If we already have this specific query in the unknown_queries
        # collection we don't need to add it again.
        pass

    return NOT_FOUND
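
The DuplicateKeyError branch only fires because the unknown-queries collection has a unique index on query_text; Example No. 10 creates it as follows:

factory.get_collection(unknown_col).create_index([("query_text", 1)],
                                                 unique=True)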
Example No. 9
def test_get_content(app):
    # Setup a content document
    document = {
        "id": "test_content_id",
        "content": "some_test_content",
        "url": "test_url"
    }
    prod_col = Config.get_mongo_collection("prod")
    factory.post_document(document, prod_col)

    try:
        url = "/v1/web/content/?id=test_content_id"
        response = app.test_client().get(url)
        response_json = json.loads(response.data.decode())
        assert response_json["prod"] == "some_test_content"
    finally:
        # Delete test content
        factory.delete_document({"id": "test_content_id"}, prod_col)
Example No. 10
def insert_documents(data):
    """ Insert all provided documents. Checks if the document has been manually
    changed before - if it has, and the new document does not match, it is
    marked as a conflict """
    factory = ModelFactory.get_instance()
    factory.set_db()

    temp_col = Config.get_mongo_collection("temp_scraped")
    manual_col = Config.get_mongo_collection("manual")
    unknown_col = Config.get_mongo_collection("unknown")
    prod_col = Config.get_mongo_collection("prod")
    conflict_col = Config.get_mongo_collection("conflicts")

    print("Starting insertion of {} documents".format(len(data)))
    pbar = ProgressBar()
    for i, doc in enumerate(pbar(data)):
        factory.post_document(doc, temp_col)
    print("Successfully inserted {} documents".format(i + 1))

    manual_docs = factory.get_collection(manual_col).find()

    conflicts = []
    for manual_doc in manual_docs:
        if "id" in manual_doc:
            idx = manual_doc["id"]
        else:
            continue

        # Mark corresponding entry in temp collection as manually changed
        factory.get_database() \
               .get_collection(temp_col) \
               .update_one({"id": idx}, {"$set": {"manually_changed": True}})

        prod_doc = next(factory.get_collection(prod_col).find({"id": idx}),
                        None)
        temp_doc = next(factory.get_collection(temp_col).find({"id": idx}),
                        None)

        if prod_doc and temp_doc:
            if not temp_doc["content"] == prod_doc["content"]:
                title = temp_doc["content"]["title"]
                conflicts.append({"id": idx,
                                  "title": title})

    print("Conflicts: {}".format(conflicts))
    factory.get_collection(conflict_col).create_index([("title", 1)],
                                                      unique=True)
    for conflict in conflicts:
        try:
            factory.post_document(conflict, conflict_col)
        except pymongo.errors.DuplicateKeyError:
            # In case there are duplicate, unsolved conflicts
            pass

    # Update production collection
    db = factory.get_database()
    try:
        db.get_collection(prod_col).rename("old_prod")
    except pymongo.errors.OperationFailure:
        # If the prod collection does not exist
        pass

    try:
        db.get_collection(temp_col).rename(prod_col)
    except Exception as e:
        print("Failed to update production db collection")
        print(e)
        db.get_collection("old_prod").rename(prod_col)
    finally:
        db.get_collection("old_prod").drop()
        db.get_collection(temp_col).drop()

    # Update all indexes
    factory.set_index(prod_col)
    factory.set_index(manual_col)
    factory.set_index(temp_col)
    # Enforce uniqueness of query texts (prevents duplicate unknown queries)
    factory.get_collection(unknown_col).create_index([("query_text", 1)],
                                                     unique=True)

    return conflicts
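
A minimal, hypothetical driver for insert_documents, assuming the processed model documents are available as a JSON array on disk (the file name is an assumption):

import json

with open("models.json") as handle:  # hypothetical file produced upstream
    data = json.load(handle)

conflicts = insert_documents(data)
print("{} conflicts need manual resolution".format(len(conflicts)))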
Example No. 11
import os

import pymongo

from sklearn.metrics.pairwise import cosine_similarity

from spellchecker import SpellChecker

from nltk.corpus import wordnet as wn

from chatbot.model.model_factory import ModelFactory
from chatbot.nlp.keyword import get_tfidf_model, get_stopwords, lemmatize, nb
from chatbot.nlp.synset import SynsetWrapper
from chatbot.util.config_util import Config
from chatbot.util.logger_util import set_logger

if str(os.getenv("LOG")) == "TRUE":
    set_logger()

NOT_FOUND = Config.get_value(['query_system', 'not_found'])
MULTIPLE_ANSWERS = Config.get_value(['query_system', 'multiple_answers'])
CHAR_LIMIT = Config.get_value(['query_system', 'character_limit'])
MAX_ANSWERS = Config.get_value(['query_system', 'max_answers'])
URL_FROM_TEXT = Config.get_value(['query_system', 'url_from_text'])

factory = ModelFactory.get_instance()
factory.set_db()


def _handle_not_found(query_text):
    '''
    Inserts this specific query text into the unknown queries collection as
    well as returning a fallback string.
    '''
    try:
        unknown_col = Config.get_mongo_collection("unknown")
        factory.get_database().get_collection(unknown_col).insert_one(
            {"query_text": query_text})
    except pymongo.errors.DuplicateKeyError:
        # If we already have this specific query in the unknown_queries
        # collection we don't need to add it again.
        pass

    return NOT_FOUND
Example No. 12
class InfoGatheringSpider(scrapy.Spider):
    # Name of the spider. This is the name to use from the Scrapy CLI.
    name = 'info_gathering'

    config = Config.get_value(['scraper'])

    # The following few lines contain command line flags.
    # All flags default to false, so do not explicitly set them as so.
    # See the GitHub Wiki for information about how these are used.

    # Enable to display additional debugging information to output when the
    # crawler is run.  In practice, this will pretty print the exported tree
    # when a page is scraped.

    debug = 'debug' if config['debug'] else None

    # Elements which sometimes are used to indicate a header.
    alternative_headers = config['alternative_headers']

    # Root url for all web pages
    root_url = config['url']['root_url']

    # The links to start the crawling process on.
    start_urls = [root_url]

    # Paths on the site which are allowed. Only paths which match
    # these will ever be visited.
    allowed_paths = list(map(re.compile, config['url']['allowed_paths']))

    # Pages in this list will be visited and links on them will
    # be visited, however the data will not be scraped.
    scrape_blacklist = list(map(re.compile, config['blacklist']['scrape']))

    # These links will never be visited, even if the path is allowed above.
    visit_blacklist = list(map(re.compile, config['blacklist']['visit']))

    # These selectors will be removed from all pages, as they contain very
    # little actual information, and are equal on all pages.
    garbage_elements = set(config['blacklist']['elements'])

    # Elements whose text contains one of these sentences will be
    # skipped on all pages.

    garbage_text = set(config['blacklist']['texts'])

    # Elements containing an url in href that starts with the following
    # will be removed
    garbage_start_urls = set(config['blacklist']['garbage_start_urls'])

    # Elements containing an url in href that ends with the following
    # will be removed.
    garbage_resources = set(config['blacklist']['resources'])

    # The text used for the title on 404 pages. Used to detect silent 404
    # errors.
    not_found_text = config['blacklist']['not_found_text']

    # Hierarchy for sorting categories.
    # Elements with level=None will follow normal html hierarchy
    hierarchy = config['hierarchy']

    # If a tag is listed here, sequences of tags of that type will be merged
    # into one tag. For example, directly following paragraph tags will be
    # merged into one big paragraph, separated by newlines. The value for each
    # key is the word limit below which a following tag is merged into the
    # previous one.
    concatenation_tags_word_limit = config['concatenation']

    # Of the elements in the hierarchy, these tags will not be created as nodes
    # if their parent is in the set of parents.
    ignored_child_tags = config['blacklist']['ignored_child_tags_for_parents']
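
    # For orientation only: a rough, partial sketch of the shape the
    # ['scraper'] config section appears to have, inferred from the keys
    # accessed above. Every concrete value below is hypothetical.
    #
    #   {
    #       "debug": False,
    #       "alternative_headers": ["strong", "b"],
    #       "url": {
    #           "root_url": "https://example.org/",
    #           "allowed_paths": ["https://example\\.org/.*"],
    #       },
    #       "blacklist": {
    #           "scrape": [], "visit": [], "elements": [], "texts": [],
    #           "garbage_start_urls": ["#", "mailto:"],
    #           "resources": [".pdf", ".jpg"],
    #           "not_found_text": "Page not found",
    #           "ignored_child_tags_for_parents": {"li": ["p"]},
    #       },
    #       "hierarchy": {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5,
    #                     "h6": 6, "p": None, "li": None, "a": None},
    #       "concatenation": {"p": 50},
    #   }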

    def normalize(self, text):
        return unicodedata.normalize('NFKC', text)

    def extract_metadata(self, root, soup, page_id):
        ''' Extract keywords metadata from the header of the page and add them
        as children of the tree root element. '''

        # Attempt finding the keywords meta tag on the page.
        keywords = soup.find('meta', attrs={'name': 'keywords'})

        if keywords and 'content' in keywords.attrs:
            # Add the keywords beneath the title in the tree, if the meta tag
            # has the content attribute correctly specified.
            TreeElement('meta',
                        page_id,
                        keywords.attrs['content'],
                        parent=root)

    def locate_parent(self, elem_tag, current_parent, root):
        ''' Locate the parent element on which we should insert the next
        node in the tree, based on our hierarchy of tags. '''

        # Data about this elements position in the hierarchy.
        elem_level = None
        if elem_tag in self.hierarchy:
            elem_level = self.hierarchy[elem_tag]

        # The parent which will be used for the next node in the tree.
        parent = None

        # Search for the appropriate parent element.
        search_parent = current_parent

        while True:
            # If we reach the root node, use it.
            if search_parent == root:
                parent = root
                break

            # We reached a tag of the same type, so use it.
            if search_parent.tag == elem_tag:
                parent = search_parent.parent
                break

            # Whether the search parent is in the hierarchy or not.
            search_parent_level = None
            if search_parent.tag in self.hierarchy:
                search_parent_level = self.hierarchy[search_parent.tag]

            if search_parent_level:
                # If both tags are in the hierarchy, check their level.
                if elem_level:
                    if elem_level > search_parent_level:
                        parent = search_parent
                        break

                    if elem_level == search_parent_level:
                        # If elements are in same level in hierarchy.
                        parent = search_parent.parent
                        break
                else:
                    # Element where hierarchy is not defined.
                    parent = search_parent
                    break

            # Update the current parent while searching.
            search_parent = search_parent.parent

        # Return the parent element candidate.
        return parent
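
    # Worked illustration with hypothetical hierarchy values, e.g.
    # hierarchy = {'h1': 1, 'h2': 2, 'p': None}: an 'h2' element arriving
    # while current_parent is an 'h1' gets that 'h1' as its parent (2 > 1);
    # a second 'h2' in a row becomes a sibling of the first (same tag, so the
    # first h2's parent is reused); and a 'p' element, which has no level,
    # is attached directly beneath the current hierarchy element.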

    def generate_tree(self, response):
        ''' Creates a tree structure describing the given page. This structure
        is based on headers, creating a hierarchy based on text pieces which
        are positioned in between different types of headers. '''

        # Reset id to 0 when on a new page.
        TreeElement.counter = 0

        # Hash the page URL, it will be used as an ID.
        page_id = sha1(response.url.encode()).hexdigest()

        # Parse the HTML using BeautifulSoup. Make sure we use LXML for
        # parsing.
        soup = BeautifulSoup(response.text, 'lxml')

        # We only care about elements on the page which are defined in the
        # hierarchy.
        elements = soup.find_all(self.hierarchy.keys())

        # Remove the configured garbage elements (typically the site header
        # and footer) from the page to reduce bloat, as they contain little
        # useful information.
        for garbage_selector in self.garbage_elements:
            for garbage_element in soup.select(garbage_selector):
                garbage_element.decompose()

        # Locate the title element. It might be used for the tree root.
        title = self.normalize(soup.find('title').text.strip())

        # Do not continue with this page if we detect it as a silent 404.
        if self.not_found_text in title:
            return

        # Use the title as the tree root.
        root = TreeElement('title', page_id, title)

        # Attempt extracting the keywords and adding them to the tree.
        self.extract_metadata(root, soup, page_id)

        # Current position in the hierarchy.
        current_parent = root

        for elem in elements:
            # Replace BR tags with newlines.
            for br in elem.find_all('br'):
                br.replace_with('\n')

            # Remove leading and trailing spaces from the node contents.
            elem_text = self.normalize(elem.text.strip())

            # Find the type of this element.
            elem_tag = elem.name

            # Do not allow tree nodes with empty text.
            if not elem_text:
                continue

            # Prefix list item elements so they read as bullet points.
            if elem_tag == 'li':
                elem_text = '- ' + elem_text

            # Do not include elements with element text containing
            # blacklisted sentences.
            if any(sentence in elem_text for sentence in self.garbage_text):
                continue

            if self.alternative_headers:
                # If a paragraph contains for example a strong tag, we can
                # treat that combination as a header. This check avoids adding
                # the strong tag in addition to the custom header.
                if elem_tag in self.alternative_headers and \
                               current_parent.tag == 'h6' and \
                               self.normalize(current_parent.text) \
                               == elem_text:
                    continue

                if elem_tag == 'p':
                    # Find all alternative header tags inside this paragraph.
                    headers = elem.find_all(self.alternative_headers)

                    # Check if there is only 1 alternative header tag, and
                    # check if it contains all of the text inside the
                    # paragraph.
                    if len(headers) == 1 and elem_text \
                            == self.normalize(headers[0].text.strip()):
                        # Locate the parent in which a H6 tag would be
                        # inserted.
                        parent = self.locate_parent('h6', current_parent, root)

                        # Add a custom H6 element.
                        current_parent = TreeElement(
                            'h6',
                            page_id,
                            elem_text,
                            parent,
                        )
                        continue

            # Locate the parent element to use based on the hierarchy.
            parent = self.locate_parent(elem_tag, current_parent, root)

            # Concatenate tags like paragraph tags which directly follow each
            # other.
            if elem_tag in self.concatenation_tags_word_limit and \
                    parent.children:
                last_child = parent.children[-1]

                # Start a new paragraph if the last child already has children
                if last_child and last_child.tag == elem_tag and \
                        not last_child.children:
                    # Concatenate the texts until limit reached
                    if len(elem_text.split()) \
                            <= self.concatenation_tags_word_limit[elem_tag]:
                        last_child.text += '\n' + elem_text
                        continue

            # Anchor tags do not become new tree elements; instead the href
            # url is attached to the current parent's list of links.
            if elem_tag == 'a':
                # Create a valid url from the href url if any
                url = self.create_valid_url(elem.get('href'))

                # If the url from href is invalid, ignore anchor tag
                if url is None:
                    continue

                # If the URL is unequal to the elem text
                if url != elem_text:
                    # Add the element text to parent instead of creating a
                    # new element
                    if elem_text in self.normalize(parent.text):
                        current_parent.links.append([elem_text, url])
                        continue

                    current_parent.links.append([url, url])

            elif elem_tag in self.ignored_child_tags \
                    and current_parent.tag \
                    in self.ignored_child_tags[elem_tag]:
                # If the parent's text includes this element's text,
                # don't create a node for this element.
                continue
            else:
                # Create the new element.
                current_parent = TreeElement(
                    elem_tag,
                    page_id,
                    elem_text,
                    parent,
                )

        return root

    # Returns a valid url based on blacklisting and type
    def create_valid_url(self, url):
        ''' Takes in an url from an anchor tag's href.
        Returns None if the url is None, blacklisted or invalid.
        Returns an absolute url otherwise. '''

        # If the url isn't defined
        if url is None:
            return None

        # Check if the url starts with blacklisted characters
        for start_url in self.garbage_start_urls:
            if url.startswith(start_url):
                return None

        # Check if the url is a blacklisted resource or file type
        for end_url in self.garbage_resources:
            if url.endswith(end_url):
                # This url is blacklisted, ignore this element
                return None

        # If the url is relative or a valid resource link
        if not bool(urlparse(url).netloc):
            # Concatenate the root and relative url
            url = urljoin(self.root_url, url)

        return url
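
    # Illustration with hypothetical values: with root_url
    # 'https://example.org/', create_valid_url('/studies/fees') returns
    # 'https://example.org/studies/fees'; an href ending in '.pdf' (if listed
    # in garbage_resources) or starting with 'mailto:' (if listed in
    # garbage_start_urls) returns None.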

    def pretty_print_tree(self, root):
        ''' Print a scraped tree for debugging. '''

        for pre, fill, node in RenderTree(root):
            # We remove newlines from the text with spaces to preserve
            # the shape of the tree when printing in the terminal.
            print('{}{}: {}'.format(pre, node.tag,
                                    node.text.replace('\n', ' ')))

        # Also add a new line before the next tree.
        print()

    def parse(self, response):
        ''' Parses pages which have been requested from the server. '''

        # Only store HTML responses, not other attachments.
        if isinstance(response, HtmlResponse):
            if not any(
                    re.match(regex, response.url)
                    for regex in self.scrape_blacklist):
                # Generate a tree structure describing this page.
                root = self.generate_tree(response)

                # The parser might choose to ignore this page, for example when
                # we detect that the page is a 404 page. In that case, skip the
                # page.
                if root:
                    # Pretty print the node tree if the DEBUG flag is set.
                    if self.debug:
                        self.pretty_print_tree(root)

                    # Export the tree using the DictExporter. Scrapy will then
                    # convert this dictionary to a JSON structure for us,
                    # automatically.
                    exporter = DictExporter()
                    tree = exporter.export(root)

                    yield {
                        # Export the page URL and the tree structure.
                        'url': response.url,
                        'tree': tree,
                    }

            # Follow all links from allowed domains.
            for next_page in LinkExtractor().extract_links(response):
                for allowed_path in self.allowed_paths:
                    # Only follow links that are in the list of allowed paths.
                    if re.match(allowed_path, next_page.url) and not \
                            any(re.match(regex, next_page.url)
                                for regex in self.visit_blacklist):
                        yield response.follow(next_page, self.parse)
                        break
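
For completeness, a hedged sketch of running the spider programmatically and writing the yielded items to a JSON feed; the Scrapy CLI equivalent would use the spider name declared above ('info_gathering'). The feed file name is an assumption.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Assumption: dump every yielded {'url': ..., 'tree': ...} item to JSON.
    "FEEDS": {"scraped.json": {"format": "json"}},
})
process.crawl(InfoGatheringSpider)
process.start()  # blocks until the crawl has finished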
Example No. 13
import pytest
import json

from chatbot.api import server
from chatbot.model.model_factory import ModelFactory
from chatbot.util.config_util import Config

factory = ModelFactory.get_instance()
factory.set_db()

prod_col = Config.get_mongo_collection("prod")
manual_col = Config.get_mongo_collection("manual")
conflict_col = Config.get_mongo_collection("conflicts")
unknown_col = Config.get_mongo_collection("unknown")


@pytest.fixture(scope='module')
def client():
    return server.app.test_client()


def test_swagger(client):
    response = client.get('/')
    assert response.status_code == 200


def test_response(client):
    query = 'some test response'
    try:
        response = client.get('/v2/response/{}/'.format(query))
        assert response.status_code == 200
Example No. 14
import copy
import json

from chatbot.util.config_util import Config
from chatbot.nlp.keyword import lemmatize

SYNSET_FILE = Config.get_value(['query_system', 'custom_synset_file'])


class SynsetWrapper():
    ''' Wrapper for a custom synset list. Interfaces with a text file where
    each line consists of synonyms split by comma '''
    __instance = None

    @staticmethod
    def get_instance():
        ''' Static access method '''
        if SynsetWrapper.__instance is None:
            SynsetWrapper()
        return SynsetWrapper.__instance

    def __init__(self):
        ''' Virtually private constructor '''
        if SynsetWrapper.__instance is not None:
            raise Exception('This class is a singleton!')
        else:
            self.__read_synset_file()
            SynsetWrapper.__instance = self

    def get_synset(self, token):
        ''' Return a synset for a given token '''
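
The snippet above is cut off by the listing; based on the API it shows, a minimal usage sketch (the token is hypothetical, and whether it should be lemmatized first is an assumption):

wrapper = SynsetWrapper.get_instance()
synonyms = wrapper.get_synset(lemmatize("fee"))  # hypothetical token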
Example No. 15
import os

from sklearn.metrics.pairwise import cosine_similarity

from spellchecker import SpellChecker

from nltk.corpus import wordnet as wn

from chatbot.model.model_factory import ModelFactory
from chatbot.nlp.keyword import get_tfidf_model, get_stopwords, lemmatize, nb
from chatbot.nlp.synset import SynsetWrapper
from chatbot.util.config_util import Config
from chatbot.util.logger_util import set_logger

if str(os.getenv("LOG")) == "TRUE":
    set_logger()

NOT_FOUND = Config.get_value(['query_system', 'not_found'])
MULTIPLE_ANSWERS = Config.get_value(['query_system', 'multiple_answers'])
CHAR_LIMIT = Config.get_value(['query_system', 'character_limit'])
MAX_ANSWERS = Config.get_value(['query_system', 'max_answers'])
URL_FROM_TEXT = Config.get_value(['query_system', 'url_from_text'])

ANSWER_THRESHOLD = Config.get_value(['query_system', 'answer_threshold'])
SIMILARITY_THRESHOLD = Config.get_value(
    ['query_system', 'similarity_threshold'])

factory = ModelFactory.get_instance()
factory.set_db()


def _handle_not_found(query_text):
    '''