# NOTE: assumed module-level imports for this (Python 2) module.  're' and
# 'time' are required by the methods below; NGramFilter, Searcher, BloomFilter,
# remove_accents and convert_smart_quotes are project-local helpers assumed to
# be defined or imported elsewhere in this module.
import re
import time


class Extractor:
    """
    Extractor Constants:

    When you instantiate a BloomFilter, you will see the false-positive rate displayed.
    False-positives do not introduce any error, since all 'positive' terms are searched in
    the dictionary.  However, more false-positives means more unnecessary dictionary lookups,
    which will slow down extracts.

    The false-positive rate of the BloomFilter may be tuned with the parameters below.
    (Right now these settings reflect a dictionary size of approximately 10MM entries.)

    BF_HASHES: The number of hash functions used by the BloomFilter.  The higher this
    number, the lower the false-positive rate.  However, each hash must be computed on
    every lookup, so a smaller number of hashes speeds things up.

    BF_BYTES: The size of the bitset used by the BloomFilter.  The bigger the bitset,
    the lower the false-positive rate.  However, a larger bitset means more data must be
    read into memory at startup (and checked on each lookup).
    """

    BF_HASHES  = 5
    BF_BYTES   = 8192 * 1024  # 8MB
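
    # A back-of-the-envelope check of these defaults, using the standard
    # Bloom filter false-positive estimate p ~= (1 - e^(-k*n/m))^k, where
    # m is the bitset size in bits, k the number of hashes, and n the number
    # of entries.  Assuming BF_BYTES is a byte count (so m = BF_BYTES * 8)
    # and the ~10MM entries mentioned in the docstring above:
    #
    #   >>> import math
    #   >>> m, k, n = 8192 * 1024 * 8, 5, 10 * 1000 * 1000
    #   >>> (1 - math.exp(-float(k) * n / m)) ** k
    #   0.0399...                # roughly a 4% false-positive rate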


    def __init__(self, max_gram_size, dictionary_file, stopwords_file, use_bloomfilter=False, timing=False):
        """
        Constructor arguments:
        max_gram_size -- The longest phrase (in words) you want to search for in the dictionary.
        The longer your grams, the more permutations are generated (and searched), adding time
        to each extract.

        dictionary_file -- The path to your dictionary file: a sorted, lower-cased list of
        phrases, one per line.

        stopwords_file -- The path to your stopwords file (words or phrases to be excluded).
        This file should also be sorted and lower-cased.

        use_bloomfilter -- Whether or not to use a BloomFilter.  If True, extracts will run
        several times faster.  The filter takes several minutes to build the very first time
        you run it.

        timing -- If True, print how long each call to extract() takes.
        """

        self.ngfilter = NGramFilter(max_gram_size)
        self.searcher = Searcher(dictionary_file)
        self.stopwords = Searcher(stopwords_file)
        self.timing = timing
        if use_bloomfilter:
            # Build the filter from the dictionary, one phrase per line,
            # closing the file when done.
            with open(dictionary_file) as f:
                self.bloom = BloomFilter(self.BF_BYTES, self.BF_HASHES, iter(f))
        else:
            self.bloom = None
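
    # A minimal construction sketch (the file paths below are hypothetical;
    # both files must be sorted and lower-cased, one phrase per line):
    #
    #   >>> ex = Extractor(3, '/data/dictionary.txt', '/data/stopwords.txt',
    #   ...                use_bloomfilter=True)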
    


    def extract(self, text, case_sensitive=False):
        """
        Extracts any phrases found in the dictionary (and not in the stopwords) from the text provided.

        text -- The text to be searched
        case_sensitive -- Whether or not matches must be case-sensitive (this requires the
        provided dictionary to preserve case as well).
        """
        
        extracts = []

        if self.timing:
            t1 = time.time()

        # TODO: Refactor
        try:
            text = remove_accents(unicode(text))
        except Exception:
            # unicode() can raise UnicodeDecodeError on non-ASCII byte strings;
            # fall back to stripping accents from the text as-is.
            try:
                text = remove_accents(text)
            except Exception:
                pass

        # TODO: Refactor
        if not case_sensitive:
            text = text.lower()

        # TODO: Refactor
        text = text.replace('-', ' ')  # split hyphenated words (e.g. "Japanese-inspired")  TODO: Copy out the token variation

        # TODO: Refactor
        tokens = text.split(' ')
        terms = []
        for t in tokens:  # trim any commas, periods, etc.
            if t:
                try:
                    t = unicode(t)              # make sure this is unicode
                    t = convert_smart_quotes(t) # deal with curly quotes
                    t = t.strip(",.\'<>!?() ")  # then strip them (and others)
                    if t:                       # drop tokens emptied by stripping
                        terms.append(t)
                except Exception:
                    pass

        text = " ".join(terms)


        for gram in self.ngfilter.filter(text):
            if self.bloom and gram not in self.bloom:
                continue
            elif gram in self.searcher and gram not in self.stopwords:
                extracts.append(gram)

        if self.timing:
            t2 = time.time()
            print 'extract returned in %0.3f ms' % ((t2-t1)*1000.0)

        return set(extracts)
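
    # A minimal usage sketch (the input text and the extracted phrases are
    # purely illustrative; real results depend entirely on your dictionary
    # and stopwords files):
    #
    #   >>> ex.extract("Bars with a Japanese-inspired menu in New York City")
    #   set([u'japanese', u'new york city'])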


    # Uses BS4 and html5lib:  ($ pip install beautifulsoup4 html5lib)
    def extract_url(self, url):
        """
        Extracts any phrases found in the dictionary (and not in the stopwords) from information in the page
        referenced by url.
        """
        import HTMLParser
        from bs4 import BeautifulSoup
        from urllib import urlopen

        h = HTMLParser.HTMLParser()
        soup = BeautifulSoup(urlopen(url).read(), "html5lib")

        results = {}

        # Pages are not guaranteed to have a <title> tag, so guard against None.
        title = soup.title.string if soup.title else None
        results["title"] = self.extract(h.unescape(title)) if title else None

        md = soup.find('meta', attrs={'name': re.compile("^description$", re.I)})
        results["meta-description"] = self.extract(h.unescape(md['content'])) if md and md.has_attr('content') else None

        mk = soup.find('meta', attrs={'name': re.compile("^keywords$", re.I)})
        results["meta-keywords"] = self.extract(h.unescape(mk['content'])) if mk and mk.has_attr('content') else None

        ot = soup.find('meta', attrs={'property': re.compile("^og:title$", re.I)})
        results["og:title"] = self.extract(h.unescape(ot['content'])) if ot and ot.has_attr('content') else None

        od = soup.find('meta', attrs={'property': re.compile("^og:description$", re.I)})
        # og meta tags normally store their text in 'content'; fall back to the
        # nonstandard 'value' attribute the original code looked for.
        od_text = (od.get('content') or od.get('value')) if od else None
        results["og:description"] = self.extract(h.unescape(od_text)) if od_text else None

        return results
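
    # A minimal usage sketch (the URL is illustrative; each value in the
    # returned dict is a set of extracted phrases, or None when the page
    # lacks the corresponding tag or attribute):
    #
    #   >>> results = ex.extract_url("http://example.com/")
    #   >>> sorted(results.keys())
    #   ['meta-description', 'meta-keywords', 'og:description', 'og:title', 'title']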