Example #1
    def get_text(self, text_type):
        """
        Return the extracted text from the HTML, extracting it first if
        necessary.
        NOTE: this method's flow can be confusing because it does not only
        extract text but also caches the extracted text for each text type.

        Parameters:
        -----------
        text_type: string
            One of 'body', 'meta', or 'title'.

        """
        if not self.html:
            return ''

        if text_type == 'body':
            if not self.body:
                self.body = Text_Extractor.extract_body(self.html)
                self.body = URLUtility.clean_text(self.body)
            return self.body
        elif text_type == 'meta':
            if not self.meta:
                # Assumed intent: the original called extract_body here,
                # which looks like a copy-paste slip.
                self.meta = Text_Extractor.extract_meta(self.html)
                self.meta = URLUtility.clean_text(self.meta)
            return self.meta
        elif text_type == 'title':
            if not self.title:
                # Assumed intent, as above.
                self.title = Text_Extractor.extract_title(self.html)
                self.title = URLUtility.clean_text(self.title)
            return self.title
        else:
            print("Wrong text_type")
            return ''
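
A minimal usage sketch, assuming a hypothetical Page class that takes raw HTML and exposes this method along with html, body, meta, and title attributes (none of these are shown in the example):

# Page, Text_Extractor, and URLUtility are assumed to be defined elsewhere.
page = Page('<html><head><title>Hi</title></head><body>Hello</body></html>')
body = page.get_text('body')    # first call: extracts, cleans, and caches
body = page.get_text('body')    # second call: served from the cache
title = page.get_text('title')  # each text_type is cached independently
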
Example #2
    def _extract_keywords(self, sites, k=10):
        """
        Extract the top k most frequent keywords (unigrams and bigrams).
        Skip keywords that were already selected on previous calls.
        """
        # Assumes module-level imports:
        #   import nltk
        #   from collections import Counter
        #   from nltk.corpus import stopwords
        stop = stopwords.words('english')
        counter = Counter()
        for site in sites:
            for p in site:
                text = p.get_text('meta')
                text = URLUtility.clean_text(text)
                words = nltk.word_tokenize(text)
                words = [
                    word for word in words
                    if word not in stop and len(word) > 2
                ]
                # Bigrams are built over the filtered sequence, so the two
                # halves of a bigram were not necessarily adjacent in the
                # original text.
                bigram_words = [
                    words[i] + ' ' + words[i + 1]
                    for i in range(len(words) - 1)
                ]
                counter += Counter(words + bigram_words)

        # Request k + len(self.keywords) candidates so that k keywords are
        # still left after skipping the already-selected ones.
        top_words = counter.most_common(k + len(self.keywords))
        result = []  # list of keywords to return
        i = 0
        while len(result) < k and i < len(top_words):
            if top_words[i][0] not in self.keywords:
                result.append(top_words[i][0])
                self.keywords.add(top_words[i][0])
            i += 1
        print("    List of selected keywords: ", result)
        return result
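
A sketch of the stateful skipping behavior, assuming a hypothetical extractor instance of the owning class (which keeps selected keywords in the self.keywords set) and sites given as a list of sites, each a list of page objects exposing get_text('meta'):

first_batch = extractor._extract_keywords(sites, k=10)
second_batch = extractor._extract_keywords(sites, k=10)
# second_batch shares no keyword with first_batch: everything selected on
# the first call was added to extractor.keywords and is skipped afterwards.
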
def extract_keywords(sites, k=10):
    """
    Extract the top k most frequent keywords (unigrams only).
    """
    # Assumes module-level imports:
    #   import heapq
    #   from collections import Counter
    #   from nltk.corpus import stopwords
    #   from nltk import word_tokenize
    stop = stopwords.words('english')
    counter = Counter()
    for site in sites:
        for p in site:
            text = p.get_text('meta')
            text = URLUtility.clean_text(text)
            words = word_tokenize(text)
            words = [
                word for word in words if word not in stop and len(word) > 2
            ]
            counter += Counter(words)

    # Keep only words seen more than once as (count, word) pairs;
    # heapq.nlargest works on any iterable, so no heapify call is needed.
    candidates = [(counter[w], w) for w in counter if counter[w] > 1]
    topk = heapq.nlargest(k, candidates)
    print("Top extracted keywords: ", topk)
    return [w[1] for w in topk]
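
The (count, word) tuple trick means heapq.nlargest ranks by count first and breaks ties by comparing the words themselves. A self-contained toy illustration of just that selection step:

import heapq
from collections import Counter

counter = Counter({'crawler': 5, 'keyword': 5, 'page': 3, 'rare': 1})
candidates = [(counter[w], w) for w in counter if counter[w] > 1]
print(heapq.nlargest(2, candidates))
# [(5, 'keyword'), (5, 'crawler')] -- the tie on count 5 is broken by the
# word itself ('keyword' > 'crawler'), and 'rare' never becomes a candidate.
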