Пример #1
0
Файл: core.py Проект: clones/kaa
 def configure(self, download=True, num=0, keep=True):
     """
     Store the feed settings on the instance and call manager.save().

     download: download the entries (default) instead of only
               registering their urls
     num:      number of items from the feed (0 == all, default)
     keep:     keep old entries not in feed anymore (download only)
     """
     self._download, self._num, self._keep = download, num, keep
     manager.save()
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
    print('ms')
    #print(silhouette_score(X, ms.labels_))
    #print(calinski_harabaz_score(X, ms.labels_))
    return ms


def docs2clusters(labels, clusters):
    """Pair each document label with the cluster id at the same position.

    labels:   iterable of document labels
    clusters: object exposing ``tolist()`` that yields one cluster id per
              label (presumably a numpy array of cluster assignments)

    Returns a list of ``(label, cluster_id)`` tuples; as with ``zip``, the
    result is as long as the shorter of the two inputs.
    """
    cluster_ids = clusters.tolist()
    return [(label, cluster_id) for label, cluster_id in zip(labels, cluster_ids)]


def cluster_docs(cluster, clusters_docs):
    """Return the documents that belong to ``cluster``.

    cluster:       cluster id to select; any negative value selects every
                   document regardless of its cluster
    clusters_docs: iterable of ``(document, cluster_id)`` pairs

    Returns a list of the selected documents in their original order.
    """
    # Decide once whether the cluster filter applies at all.
    select_all = cluster < 0
    selected = []
    for doc, doc_cluster in clusters_docs:
        if select_all or doc_cluster == cluster:
            selected.append(doc)
    return selected


def cluster_words(cluster, clusters_docs, docs, count=10):
    """Return the most frequent words of the documents in one cluster.

    cluster:       cluster id handed to cluster_docs()
    clusters_docs: ``(document label, cluster id)`` pairs handed to
                   cluster_docs()
    docs:          iterable of objects with ``tags`` and ``words``
                   attributes; a document is selected when its first tag
                   is among the labels returned by cluster_docs()
    count:         how many of the most common words to return

    Returns a list of ``(word, occurrences)`` tuples as produced by
    ``Counter.most_common``.
    """
    members = cluster_docs(cluster, clusters_docs)
    frequencies = Counter()
    for doc in docs:
        if doc.tags[0] in members:
            # Words are counted in document order, one document at a time.
            frequencies.update(doc.words)
    return frequencies.most_common(count)


if __name__ == '__main__':
    # Script entry point: runs only when this file is executed directly.
    import manager
    # NOTE(review): doc2vec is not referenced in the visible lines -- it may
    # be needed by code that is not shown here; confirm before removing.
    import doc2vec
    # Load the 'vectors.pkl' file from manager.PICKLES_DIR.
    # NOTE(review): `os` is not imported in the visible lines; presumably it
    # is imported at the top of the file -- confirm.
    vectors = manager.load(os.path.join(manager.PICKLES_DIR, 'vectors.pkl'))
    # NOTE(review): `clusterers` and `clusters_pkl` are not defined anywhere
    # in the visible code; the statements that build them appear to be
    # missing from this excerpt -- confirm against the full script.
    manager.save(clusterers, clusters_pkl)
Пример #3
0
Файл: core.py Проект: clones/kaa
    def update(self, verbose=False):
        """
        Update feed.

        Walks the entries produced by self.iterate(). Without download
        mode an entry that is not yet in the beacon listing of
        self.dirname is added to beacon under its url. In download mode
        the entry is fetched into self.dirname (unless the file already
        exists or was downloaded before and has been deleted since) and
        its date, title and description are copied to the beacon item.
        Afterwards entries of the previous run that are not part of this
        run are removed, except in download mode with 'keep' set.

        This is a generator: it yields the asynchronous results it has
        to wait for and expects the finished value to be sent back in.
        NOTE(review): presumably driven by a kaa coroutine wrapper that
        treats the plain True / False yields as the final result -- the
        decorator is not visible here, confirm.

        verbose:  print the download progress bar on stdout

        Yields False when the feed is already updating and True when the
        update is finished.
        """
        def print_status(s):
            sys.stdout.write("%s\r" % s.get_progressbar())
            sys.stdout.flush()

        if self._updating:
            log.error('feed %s is already updating', self.url)
            yield False
            # Never fall through into a second, concurrent update if the
            # generator happens to be resumed after reporting the result.
            return
        self._updating = True
        try:
            log.info('update feed %s', self.url)

            # get directory information
            beacondir = yield kaa.beacon.get(self.dirname)
            listing = yield beacondir.list()

            allurls = [ f.url for f in listing ]

            num = self._num
            allfiles = [ e[1] for e in self._entries ]
            entries = self._entries
            new_entries = []

            entry_iter = self.iterate()
            for entry in entry_iter:
                while isinstance(entry, kaa.InProgress):
                    # dummy entry to signal waiting
                    result = yield entry
                    # send result back to the iterator
                    entry = entry_iter.send(result)

                log.info('process %s', entry.url)
                filename = None

                if not self._download and entry.url in allurls:
                    # already in beacon list
                    pass
                elif not self._download:
                    # use url as name
                    entry['name'] = kaa.unicode_to_str(entry.url)
                    yield kaa.beacon.add_item(parent=beacondir, **entry)
                else:
                    # download
                    filename = os.path.join(self.dirname, entry.basename)
                    if not os.path.isfile(filename) and filename in allfiles:
                        # file not found, check if it was downloaded before. If
                        # so, the user deleted it and we do not fetch it again
                        pass
                    elif os.path.isfile(filename):
                        # File already downloaded.
                        # FIXME: make sure the file is up-to-date
                        pass
                    else:
                        # 'async' is a reserved word in newer Python
                        # versions, so the handle gets a neutral name.
                        fetching = entry.fetch(filename)
                        if verbose:
                            fetching.progress.connect(print_status)
                        yield fetching
                        if not os.path.isfile(filename):
                            log.error('error fetching %s', entry.url)
                            continue

                    if os.path.isfile(filename):
                        item = yield kaa.beacon.get(filename)
                        if not item.scanned:
                            yield item.scan()
                        if 'date' in entry:
                            item['timestamp'] = entry['date']
                        for key in ('title', 'description'):
                            if key in entry:
                                item[key] = entry[key]

                new_entries.append((entry['url'], filename))
                # num == 0 means "all": the counter goes negative and the
                # break below never triggers.
                num -= 1
                if num == 0:
                    break

            log.info('*** finished with %s ***', self.url)
            manager.save()

            # delete old files or remove old entries from beacon
            for url, filename in entries:
                if (self._keep and self._download) or (url, filename) in new_entries:
                    continue
                if not filename:
                    # delete old entries from beacon
                    for f in (yield beacondir.list()):
                        if f.url == url:
                            f.delete()
                elif os.path.isfile(filename):
                    # delete file on disc
                    os.unlink(filename)
            self._entries = new_entries
        finally:
            # Always clear the flag, even when a step above raised or the
            # generator was abandoned, so later updates are not blocked.
            self._updating = False
        yield True