Python Corpus.extend示例

编程语言: Python

命名空间/包名称: pattern.vector

类/类型: Corpus

方法/功能: extend

hotexamples.com的示例: 2

Python Corpus.extend - 共找到 2 个示例。以下是从开源项目中提取的、评分最高的 pattern.vector.Corpus.extend 真实 Python 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

Corpus(5)

append(4)

build(2)

lsa(2)

reduce(2)

cluster(1)

document(1)

export(1)

extend(1)

feature_selection(1)

filter(1)

load(1)

nn(1)

save(1)

search(1)

示例#1

显示文件

class DM_GReader():
    """Crawl a Google Reader category into a Pattern Corpus and cluster it.

    Attributes set by __init__:
        reader: the GoogleReader client.
        categories: the result of reader.getCategories().
        corpus: the Pattern Corpus that import_category() fills or loads.
        method: clustering backend name, 'kmeans' or 'covertree'.
    """

    def __init__(self, username, password, method='kmeans'):
        """Authenticate against Google Reader and start with an empty corpus.

        username, password: credentials handed to ClientAuthMethod.
        method: clustering backend used by _generate_clusters().
        """
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method

    def import_category(self,
                        category_id=0,
                        path=None,
                        local=False,
                        max_articles=2000,
                        days=3):
        """Import one category into self.corpus for later clustering.

        category_id: index into self.categories of the category to crawl.
        path: file the corpus is loaded from (local=True) or saved to;
            nothing is done when it is None.
        local: when true, load the corpus stored at `path` and do not crawl.
        max_articles: bounds the number of requests; every request is
            counted as 20 articles and the loop issues at most
            max_articles // 20 - 1 of them.
        days: crawling stops at the first response whose 'updated' value is
            older than today minus this many days.
        """
        if path is None:
            # Parenthesised single-argument print works on Python 2 and 3.
            print("Please provide with a path to store/load local pickle file.")
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Only keep crawling while the data is at most `days` days old.
        time_threshold = calendar.timegm(
            (datetime.date.today() -
             datetime.timedelta(days=days)).timetuple())

        request_no = 1

        # Floor division keeps the Python 2 integer semantics on Python 3.
        while request_no < (max_articles // 20):

            page = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            self.target_category_content = page
            feeds = page[u'items']

            if page['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                # The last 16 characters of the item id name the document.
                doc_name = feed[u'id'][-16:]
                # Prefer the full content, fall back to the summary.
                for field in (u'content', u'summary'):
                    if field in feed:
                        feed_soup = BeautifulSoup(feed[field][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in page and page[u'continuation'] is not None:
                continuation = page[u'continuation']
            else:
                print('Finished!')
                break

            print('Retrieving %d articles...' % (request_no * 20))
            request_no = request_no + 1

        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Cluster self.corpus and store the result as document names.

        Sets self.clusters (a list of lists of document names) and
        self.centroids (one representative document name per cluster).
        Any self.method other than 'kmeans' or 'covertree' leaves both
        untouched.

        k: the number of clusters.
        p: forwarded to Corpus.cluster() for KMEANS (per the original note,
            p=1.0 is faster with a small error).
        maxlevel: forwarded to Covertree for the 'covertree' method.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            clusters = self.corpus.cluster(method=KMEANS,
                                           k=k,
                                           seed=KMPP,
                                           p=p,
                                           iterations=10)
            doc_list = []
            # Represent each cluster by its document nearest to the centroid.
            for cluster in clusters:
                c = centroid(cluster)
                # min() returns the first document with the smallest
                # distance and never compares a distance against a non-number.
                doc_list.append(
                    min(cluster, key=lambda doc: distance(doc.vector, c)))
            self.centroids = [doc.name for doc in doc_list]
            self.clusters = [[doc.name for doc in cluster]
                             for cluster in clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                # The cover tree works on document names; resolve to vectors.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for doc in self.corpus:
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(
                k)

    def generate_repr_ids(self, k):
        """Cluster the corpus into k groups and return self.centroids.

        Each entry is the name of the document representing one cluster;
        document names are the tail of the Google Reader item id, so they can
        be passed to get_article_content().
        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        """Return the summed distance of every document to its cluster centre.

        Reads self.centroids and self.clusters as set by _generate_clusters().
        """
        total = 0
        for i, center in enumerate(self.centroids):
            # The centre vector is the same for the whole cluster.
            center_vector = self.corpus.document(center).vector
            for doc in self.clusters[i]:
                total += distance(
                    self.corpus.document(doc).vector, center_vector)

        return total

    def get_article_content(self, ids):
        """Fetch the article payloads for `ids` from the Google Reader API.

        ids: iterable of item id tails (document names).
        Returns a list with one decoded JSON payload per id, in order.
        """
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'

        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            payload = r.json
            # Response.json is a property in old requests releases and a
            # method since requests 1.0; accept both.
            if callable(payload):
                payload = payload()
            contents.append(payload)
        return contents

    def generate_htmls(self, k, ids):
        """Map article titles to their URLs for the given ids.

        k: unused; kept for interface compatibility.
        ids: item id tails forwarded to get_article_content().
        Only articles whose first item has a 'content' or 'summary' entry
        are included.
        """
        htmls = {}
        for article in self.get_article_content(ids):
            feed = article['items'][0]
            if u'content' in feed or u'summary' in feed:
                htmls[feed['title']] = feed['alternate'][0]['href']
        return htmls

示例#2

显示文件

文件： DM_GReader.py 项目： zhouzhuojie/DM_GReader

class DM_GReader():
    """Crawl a Google Reader category into a Pattern Corpus and cluster it.

    Attributes set by __init__:
        reader: the GoogleReader client.
        categories: the result of reader.getCategories().
        corpus: the Pattern Corpus that import_category() fills or loads.
        method: clustering backend name, 'kmeans' or 'covertree'.
    """

    def __init__(self, username, password, method='kmeans'):
        """Authenticate against Google Reader and start with an empty corpus.

        username, password: credentials handed to ClientAuthMethod.
        method: clustering backend used by _generate_clusters().
        """
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method

    def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3):
        """Import one category into self.corpus for later clustering.

        category_id: index into self.categories of the category to crawl.
        path: file the corpus is loaded from (local=True) or saved to;
            nothing is done when it is None.
        local: when true, load the corpus stored at `path` and do not crawl.
        max_articles: bounds the number of requests; every request is
            counted as 20 articles and the loop issues at most
            max_articles // 20 - 1 of them.
        days: crawling stops at the first response whose 'updated' value is
            older than today minus this many days.
        """
        if path is None:
            # Parenthesised single-argument print works on Python 2 and 3.
            print("Please provide with a path to store/load local pickle file.")
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Only keep crawling while the data is at most `days` days old.
        time_threshold = calendar.timegm((datetime.date.today() - datetime.timedelta(days=days)).timetuple())

        request_no = 1

        # Floor division keeps the Python 2 integer semantics on Python 3.
        while request_no < (max_articles // 20):

            page = self.reader.getCategoryContent(self.target_category, continuation=continuation)
            self.target_category_content = page
            feeds = page[u'items']

            if page['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                # The last 16 characters of the item id name the document.
                doc_name = feed[u'id'][-16:]
                # Prefer the full content, fall back to the summary.
                for field in (u'content', u'summary'):
                    if field in feed:
                        feed_soup = BeautifulSoup(feed[field][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in page and page[u'continuation'] is not None:
                continuation = page[u'continuation']
            else:
                print('Finished!')
                break

            print('Retrieving %d articles...' % (request_no * 20))
            request_no = request_no + 1

        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Cluster self.corpus and store the result as document names.

        Sets self.clusters (a list of lists of document names) and
        self.centroids (one representative document name per cluster).
        Any self.method other than 'kmeans' or 'covertree' leaves both
        untouched.

        k: the number of clusters.
        p: forwarded to Corpus.cluster() for KMEANS (per the original note,
            p=1.0 is faster with a small error).
        maxlevel: forwarded to Covertree for the 'covertree' method.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            clusters = self.corpus.cluster(method=KMEANS, k=k, seed=KMPP, p=p, iterations=10)
            doc_list = []
            # Represent each cluster by its document nearest to the centroid.
            for cluster in clusters:
                c = centroid(cluster)
                # min() returns the first document with the smallest
                # distance and never compares a distance against a non-number.
                doc_list.append(min(cluster, key=lambda doc: distance(doc.vector, c)))
            self.centroids = [doc.name for doc in doc_list]
            self.clusters = [[doc.name for doc in cluster] for cluster in clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                # The cover tree works on document names; resolve to vectors.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for doc in self.corpus:
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(k)

    def generate_repr_ids(self, k):
        """Cluster the corpus into k groups and return self.centroids.

        Each entry is the name of the document representing one cluster;
        document names are the tail of the Google Reader item id, so they can
        be passed to get_article_content().
        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        """Return the summed distance of every document to its cluster centre.

        Reads self.centroids and self.clusters as set by _generate_clusters().
        """
        total = 0
        for i, center in enumerate(self.centroids):
            # The centre vector is the same for the whole cluster.
            center_vector = self.corpus.document(center).vector
            for doc in self.clusters[i]:
                total += distance(self.corpus.document(doc).vector, center_vector)

        return total

    def get_article_content(self, ids):
        """Fetch the article payloads for `ids` from the Google Reader API.

        ids: iterable of item id tails (document names).
        Returns a list with one decoded JSON payload per id, in order.
        """
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'

        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            payload = r.json
            # Response.json is a property in old requests releases and a
            # method since requests 1.0; accept both.
            if callable(payload):
                payload = payload()
            contents.append(payload)
        return contents

    def generate_htmls(self, k, ids):
        """Map article titles to their URLs for the given ids.

        k: unused; kept for interface compatibility.
        ids: item id tails forwarded to get_article_content().
        Only articles whose first item has a 'content' or 'summary' entry
        are included.
        """
        htmls = {}
        for article in self.get_article_content(ids):
            feed = article['items'][0]
            if u'content' in feed or u'summary' in feed:
                htmls[feed['title']] = feed['alternate'][0]['href']
        return htmls