def test_langdetect():
    """Check `langdetect` on an empty string and on an English sentence."""
    # An empty string is expected to yield no detected language.
    empty_result = langdetect('')
    assert empty_result is None

    # A plain English sentence is expected to be reported as 'en'.
    english_result = langdetect('Dear sir, please open the door')
    assert english_result == 'en'
def filter(self):
    r"""Filter this :class:`~.db.Cluster` and its children
    :class:`~.db.Quote`\ s to see if they're worth keeping.

    First, iterate through all the children :class:`~.db.Quote`\ s of the
    cluster, seeing if each one of them is worth keeping. A
    :class:`~.db.Quote` is discarded if it has no urls, has fewer than
    :data:`~.settings.MT_FILTER_MIN_TOKENS` tokens, spans longer than
    :data:`~.settings.MT_FILTER_MAX_DAYS` days, or is not in English. Any
    :class:`~.db.Quote` that has none of those problems will be kept.

    If after this filtering there are no :class:`~.db.Quote`\ s left, or the
    :class:`~.db.Cluster` made of the remaining :class:`~.db.Quote`\ s still
    spans longer than :data:`~.settings.MT_FILTER_MAX_DAYS`, the cluster and
    all its quotes will be discarded and `None` is returned. If not, a new
    :class:`~.db.Cluster` is created with `cluster.filtered = True` and
    `cluster.id = original_cluster.id +` :func:`filter_cluster_offset`. That
    new cluster points to copies of all the kept :class:`~.db.Quote`\ s, with
    `quote.filtered = True` and `quote.id = original_quote.id +`
    :func:`filter_quote_offset`. All those models (new cluster and new
    quotes) should later be saved to the database (the method does not do it
    for you), e.g. by running this method inside a
    :func:`~.utils.session_scope`.

    Returns
    -------
    cluster : :class:`~.db.Cluster` or None
        The filtered cluster pointing to filtered quotes, or `None` if it is
        to be discarded.

    Raises
    ------
    AlreadyFiltered
        If this cluster is already filtered (i.e.
        :attr:`~.db.Cluster.filtered` is `True`).

    """

    # Filtering an already-filtered cluster would stack the id offsets, so
    # refuse outright.
    if self.filtered:
        raise AlreadyFiltered('Cluster is already filtered')

    min_tokens = settings.MT_FILTER_MIN_TOKENS
    max_span = timedelta(days=settings.MT_FILTER_MAX_DAYS)

    # The filtered copy lives in its own id range (original id + offset).
    fcluster = self.clone(id=filter_cluster_offset() + self.id,
                          filtered=True)

    # Examine each quote for urls, min_tokens, max_days, and language.
    # Checks run from cheapest to most expensive; language detection is
    # only reached by quotes that passed every other test.
    for quote in self.quotes:
        if quote.frequency == 0:
            logger.debug('Dropping quote #%s (cluster #%s): '
                         'no urls', quote.sid, self.sid)
            continue

        if len(quote.tokens) < min_tokens:
            logger.debug('Dropping quote #%s (cluster #%s): '
                         'not enough tokens', quote.sid, self.sid)
            continue

        if quote.span > max_span:
            logger.debug('Dropping quote #%s (cluster #%s): '
                         'span too big', quote.sid, self.sid)
            continue

        if langdetect(quote.string) != 'en':
            logger.debug('Dropping quote #%s (cluster #%s): '
                         'not English', quote.sid, self.sid)
            continue

        logger.debug('Keeping quote #%s (cluster #%s)',
                     quote.sid, self.sid)
        # Copy the kept quote into the filtered id range and attach it to
        # the filtered cluster.
        fquote = quote.clone(id=filter_quote_offset() + quote.id,
                             cluster_id=fcluster.id, filtered=True)
        fcluster.quotes.append(fquote)

    # If no quotes were kept, drop the whole cluster.
    if fcluster.size == 0:
        logger.debug('Dropping cluster #%s: no quotes left', self.sid)
        return None

    # Finally, if the new cluster spans too many days, discard it: each
    # kept quote may fit in the window while their union does not.
    if fcluster.span > max_span:
        logger.debug('Dropping cluster #%s: span too big', self.sid)
        return None

    logger.debug('Keeping cluster #%s after filtering', self.sid)
    return fcluster