Пример #1
0
 def test_creation(self):
     """Check that a collection appears on first write and disappears on delete.

     Two ``MongoDBCache`` objects opened on the same name must agree on
     whether the collection exists.
     """
     # Drop whatever an earlier, aborted run may have left behind so the
     # initial "does not exist" assertion does not depend on previous state
     # (same reset-then-reopen pattern as test_write_read).
     MongoDBCache("test").delete_cache()
     cache = MongoDBCache("test")
     self.assertFalse(cache.exists_collection())
     cache.write_cache("test", [])
     self.assertTrue(cache.exists_collection())
     # A second handle on the same name must see the same collection.
     cache2 = MongoDBCache("test")
     self.assertTrue(cache2.exists_collection())
     cache.delete_cache()
     self.assertFalse(cache.exists_collection())
     self.assertFalse(cache2.exists_collection())
Пример #2
0
 def _test_write_read_other(self):
     """Round-trip one entry through a cache opened on an explicit location.

     NOTE(review): the leading underscore presumably keeps this out of
     normal test collection, and "YOURLOCATION" looks like a placeholder
     to be replaced by a real MongoDB location - confirm before enabling.
     NOTE(review): the expected value differs from what test_write_read
     expects for the same kind of payload; verify which one is current.
     """
     cache = MongoDBCache("testrw", mongodb_location="YOURLOCATION")
     # Nothing has been stored under the key yet.
     self.assertIsNone(cache.read_cache("elephant"))
     cache.write_cache("elephant", [("elephants are big", 0)])
     stored = cache.read_cache("elephant")
     self.assertIsNotNone(stored)
     self.assertEqual(["elephants are big"], stored)
     cache.delete_cache()
Пример #3
0
 def test_write_read(self):
     """Check that a written value is read back unchanged.

     The collection is deleted and reopened first so the test starts from
     an empty cache, and it is deleted again at the end.
     """
     MongoDBCache("testrw").delete_cache()
     cache = MongoDBCache("testrw")
     # A key that was never written reads back as None.
     self.assertIsNone(cache.read_cache("elephant"))
     cache.write_cache("elephant", [["elephants are big", 0]])
     stored = cache.read_cache("elephant")
     self.assertIsNotNone(stored)
     self.assertEqual([["elephants are big", 0]], stored)
     cache.delete_cache()
Пример #4
0
class GoogleBookSubmodule(SubmoduleInterface):
    """Score generated facts by their number of Google Books occurrences.

    Every generated fact is turned into a query; the number of matching
    volumes is looked up (cache first, then the Google Books API) and added
    to the fact's score, normalised by the largest count seen in the batch.

    Relies on module-level names defined outside this class: ``service``
    (the API client, or None), ``api_key``, ``calls_per_seconds``,
    ``match_query`` and ``_get_query_from_fact``.
    """

    def __init__(self,
                 module_reference,
                 use_cache=True,
                 cache_name="google-book-cache"):
        """Create the submodule.

        Args:
            module_reference: Passed back to ``add_score`` to identify the
                module that produced the score.
            use_cache: When False, ``read_cache`` always misses and
                ``write_cache`` does nothing.
            cache_name: Name given to the ``MongoDBCache``.
        """
        super().__init__()
        self._module_reference = module_reference
        self._name = "Google Book Submodule"
        self.use_cache = use_cache
        self.cache = MongoDBCache(cache_name,
                                  mongodb_location=DEFAULT_MONGODB_LOCATION)
        # Snapshot of the whole cache, loaded lazily by read_cache().
        self.internal_cache = None

    def clean(self):
        """Release the in-memory cache snapshot after the parent cleanup."""
        super().clean()
        # Resetting to None makes the next read_cache() reload the snapshot.
        self.internal_cache = None

    def _setup_cache(self):
        """Load a tab-separated ``query<TAB>count`` file into ``self._cache``.

        NOTE(review): nothing in this class calls this method and it depends
        on the names ``cache_dir`` and ``cache_file`` that are not defined
        here - presumably a leftover from a file-based cache; confirm before
        relying on it or removing it.
        """
        self._cache = dict()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if os.path.isfile(cache_file):
            with open(cache_file) as f:
                for line in f:
                    fields = line.strip().split("\t")
                    # Lines that are not exactly "query<TAB>count" are skipped.
                    if len(fields) == 2:
                        self._cache[fields[0]] = int(fields[1])

    def read_cache(self, query):
        """Return the cached count for ``query`` as a float, or None.

        None is returned when caching is disabled or the query is absent.
        The whole cache is fetched once with ``read_all()`` and kept in
        ``self.internal_cache``; the snapshot is not refreshed by later
        writes (assumes ``read_all()`` returns a mapping from query to
        count - TODO confirm).
        """
        if not self.use_cache:
            return None
        if self.internal_cache is None:
            self.internal_cache = self.cache.read_all()
        if query in self.internal_cache:
            return float(self.internal_cache[query])
        return None

    def write_cache(self, query, total):
        """Store ``total`` for ``query`` in the cache if caching is enabled."""
        if self.use_cache:
            self.cache.write_cache(query, total)

    def _get_occurrences_books(self, query, only_cache):
        """Return the number of Google Books occurrences of ``query``.

        Args:
            query: The query string to look up.
            only_cache: When True, never call the API.

        Returns:
            The cached value when there is one; -1 when the value is not
            cached and the API may not or cannot be used; otherwise the
            ``totalItems`` reported by the API, or 0 when ``match_query``
            rejects the response.
        """
        cache_value = self.read_cache(query)
        if cache_value is not None:
            return cache_value
        if only_cache or service is None:
            return -1
        req = service.volumes().list(q=query, maxResults=1)
        response = req.execute()
        if match_query(query, response):
            total = response["totalItems"]
        else:
            total = 0
        self.write_cache(query, total)
        # Throttle to stay under the configured request rate.
        time.sleep(1.0 / calls_per_seconds)
        return total

    def process(self, input_interface):
        """Add a normalised Google Books score to every generated fact.

        The API client is built on first use. If it cannot be built, or as
        soon as one lookup raises, the remaining lookups only use the cache.
        Facts whose count is unknown (-1) receive no score.

        Args:
            input_interface: Object exposing ``get_generated_facts()``.

        Returns:
            The same ``input_interface``, with scores added in place.
        """
        logging.info("Start the verification using google book")
        global service
        if service is None:
            try:
                service = apiclient.discovery.build('books',
                                                    'v1',
                                                    developerKey=api_key)
            except Exception as exception:
                # Best effort: fall back to cache-only mode below. The
                # client is deliberately NOT reset after a successful
                # build, otherwise the API could never be used.
                logging.warning("When initializing Google Book: " +
                                str(exception))
        only_cache = service is None
        if only_cache:
            logging.info("No service found for Google Book")
        # Look every fact up exactly once and remember the result: the
        # in-memory snapshot is not updated by writes, so a second lookup
        # would repeat the API call (and the throttling sleep).
        counted_facts = []
        maxi = 0
        for generated_fact in input_interface.get_generated_facts():
            query = _get_query_from_fact(generated_fact)
            occurrences = -1
            try:
                occurrences = self._get_occurrences_books(query, only_cache)
            except Exception as exception:
                logging.warning(str(exception))
                # Stop hitting the API after the first failure.
                only_cache = True
            counted_facts.append((generated_fact, occurrences))
            maxi = max(maxi, occurrences)
        if maxi == 0:
            # Avoid dividing by zero when no fact has any occurrence.
            maxi = 1
        for generated_fact, occurrences in counted_facts:
            if occurrences != -1:
                generated_fact.get_score().add_score(occurrences / maxi,
                                                     self._module_reference,
                                                     self)
        return input_interface
class WikipediaCooccurrenceSubmodule(ContentComparator):
    def __init__(self,
                 module_reference,
                 use_cache=True,
                 cache_name="wikipedia-cache"):
        """Create the submodule and open its MongoDB-backed page cache.

        Args:
            module_reference: Forwarded to the parent constructor.
            use_cache: When False, cache reads always miss and cache writes
                are skipped.
            cache_name: Name given to the ``MongoDBCache``.
        """
        super().__init__(module_reference)
        self._name = "Wikipedia Cooccurrence"
        # Language later handed to wikipedia.set_lang().
        self._lang = "en"
        self.use_cache = use_cache
        self.cache = MongoDBCache(
            cache_name,
            mongodb_location=DEFAULT_MONGODB_LOCATION,
        )

    def _get_wikipedia_page_content(self, name):
        """Return the text of the Wikipedia page found for ``name``.

        The cache is consulted first. Otherwise only the first search
        result is considered; if it is a disambiguation page, the first
        option is followed, at most twice. The result (possibly None) is
        written to the cache before being returned.

        Args:
            name: The text to search Wikipedia for.

        Returns:
            The page content, or None when no page could be resolved.
        """
        content = self.read_cache(name)
        if content is not None:
            return content
        search = wikipedia.search(name)
        # For now, we only consider the first result
        if search:
            title = search[0]
            # Up to three lookups: the search hit, then the first option of
            # at most two successive disambiguation pages.
            for attempt in range(3):
                last_attempt = attempt == 2
                if last_attempt:
                    # The second-level option is retried without its
                    # parentheses.
                    title = title.replace("(", "").replace(")", "")
                try:
                    content = wikipedia.page(title).content
                    break
                except wikipedia.DisambiguationError as error:
                    # Not clear how often it happens
                    if last_attempt or not error.options:
                        break
                    title = error.options[0]
                except wikipedia.exceptions.PageError:
                    logging.warning("Wikipedia page not found: " + name)
                    break
        self.write_cache(name, content)
        return content

    def write_cache(self, wikipedia_page, content):
        """Store ``content`` for ``wikipedia_page`` when caching is enabled.

        The cache key is the page name with spaces and slashes replaced by
        underscores.
        """
        if not self.use_cache:
            return
        key = wikipedia_page.replace(" ", "_")
        key = key.replace("/", "_")
        self.cache.write_cache(key, content)

    def read_cache(self, wikipedia_page):
        """Return the cached content for ``wikipedia_page``.

        The cache key is the page name with spaces and slashes replaced by
        underscores. Returns None when caching is disabled; otherwise
        returns whatever the underlying cache returns for that key.
        """
        if not self.use_cache:
            return None
        key = wikipedia_page.replace(" ", "_")
        key = key.replace("/", "_")
        return self.cache.read_cache(key)

    def get_contents(self, subject):
        """Return a one-element list with the Wikipedia text for ``subject``.

        The single element is None when no page could be resolved.
        """
        page_text = self._get_wikipedia_page_content(subject)
        return [page_text]

    def setup_processing(self, input_interface):
        """Select the Wikipedia language edition before processing starts.

        ``input_interface`` is not used by the lines visible here.
        """
        wikipedia.set_lang(self._lang)