def configure(self, download=True, num=0, keep=True):
    """
    Configure feed
    download: download the feed items to the feed directory; if False,
        only the item URLs are added to beacon
    num: number of items from the feed (0 == all, default)
    keep: keep old entries not in feed anymore (download only)
    """
    self._download = download
    self._num = num
    self._keep = keep
    manager.save()
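
# Usage sketch (hypothetical, not part of the original module): configure a
# feed so that update() downloads at most five items and keeps files whose
# entries later drop out of the feed. 'feed' is assumed to be an instance of
# the class these methods belong to; configure() already calls manager.save(),
# so no extra persistence step is needed here.
def _example_configure(feed):
    feed.configure(download=True, num=5, keep=True)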
from collections import Counter
import os

from sklearn.cluster import MeanShift


def mean_shift(X, bandwidth):
    # assumed wrapper: the original snippet starts inside a clustering helper,
    # so the function name and signature are reconstructed from its body
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
    print('ms')
    #print(silhouette_score(X, ms.labels_))
    #print(calinski_harabaz_score(X, ms.labels_))
    return ms

def docs2clusters(labels, clusters):
    # pair each document label with the cluster id assigned to it
    docs = list(zip(labels, clusters.tolist()))
    return docs

def cluster_docs(cluster, clusters_docs):
    # labels of all documents in the given cluster
    # (a negative cluster id means "all documents")
    if cluster < 0:
        return [d for d, c in clusters_docs]
    else:
        return [d for d, c in clusters_docs if c == cluster]

def cluster_words(cluster, clusters_docs, docs, count=10):
    # most common words among the documents belonging to one cluster
    clust_docs = cluster_docs(cluster, clusters_docs)
    tagged_docs = [d for d in docs if d.tags[0] in clust_docs]
    words = [word for d in tagged_docs for word in d.words]
    topic_words = Counter(words).most_common(count)
    return topic_words

if __name__ == '__main__':
    import manager
    import doc2vec
    vectors = manager.load(os.path.join(manager.PICKLES_DIR, 'vectors.pkl'))
    manager.save(clusterers, clusters_pkl)
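
# Usage sketch (hypothetical, not part of the original script): tie the helpers
# above together. 'tagged_docs' is assumed to be a list of gensim
# TaggedDocument objects (each with .tags and .words), 'X' the matching
# doc2vec vectors, and 'bandwidth' whatever value is used to fit MeanShift.
def _example_topic_words(tagged_docs, X, bandwidth):
    ms = mean_shift(X, bandwidth)
    labels = [d.tags[0] for d in tagged_docs]
    clusters_docs = docs2clusters(labels, ms.labels_)   # [(tag, cluster_id), ...]
    # ten most common words in cluster 0
    return cluster_words(0, clusters_docs, tagged_docs)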
def update(self, verbose=False):
    """
    Update feed.
    verbose: print download progress on stdout
    """
    def print_status(s):
        sys.stdout.write("%s\r" % s.get_progressbar())
        sys.stdout.flush()

    if self._updating:
        log.error('feed %s is already updating', self.url)
        yield False
    self._updating = True
    log.info('update feed %s', self.url)

    # get directory information
    beacondir = yield kaa.beacon.get(self.dirname)
    listing = yield beacondir.list()
    allurls = [ f.url for f in listing ]

    num = self._num
    allfiles = [ e[1] for e in self._entries ]
    entries = self._entries
    new_entries = []

    entry_iter = self.iterate()
    for entry in entry_iter:
        while isinstance(entry, kaa.InProgress):
            # dummy entry to signal waiting
            result = yield entry
            # send result back to the iterator
            entry = entry_iter.send(result)
        log.info('process %s', entry.url)
        filename = None

        if not self._download and entry.url in allurls:
            # already in beacon list
            pass
        elif not self._download:
            # use url as name
            entry['name'] = kaa.unicode_to_str(entry.url)
            yield kaa.beacon.add_item(parent=beacondir, **entry)
        else:
            # download
            filename = os.path.join(self.dirname, entry.basename)
            if not os.path.isfile(filename) and filename in allfiles:
                # file not found, check if it was downloaded before. If
                # so, the user deleted it and we do not fetch it again
                pass
            elif os.path.isfile(filename):
                # File already downloaded.
                # FIXME: make sure the file is up-to-date
                pass
            else:
                inprogress = entry.fetch(filename)
                if verbose:
                    inprogress.progress.connect(print_status)
                yield inprogress
            if not os.path.isfile(filename):
                log.error('error fetching %s', entry.url)
                continue

        if filename and os.path.isfile(filename):
            item = yield kaa.beacon.get(filename)
            if not item.scanned:
                yield item.scan()
            if 'date' in entry:
                item['timestamp'] = entry['date']
            for key in ('title', 'description'):
                if key in entry:
                    item[key] = entry[key]

        new_entries.append((entry['url'], filename))
        num -= 1
        if num == 0:
            break

    log.info('*** finished with %s ***', self.url)
    manager.save()

    # delete old files or remove old entries from beacon
    for url, filename in entries:
        if (self._keep and self._download) or (url, filename) in new_entries:
            continue
        if not filename:
            # delete old entries from beacon
            for f in (yield beacondir.list()):
                if f.url == url:
                    f.delete()
        elif os.path.isfile(filename):
            # delete file on disc
            os.unlink(filename)

    self._updating = False
    self._entries = new_entries
    yield True
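
# Driver sketch (hypothetical): update() is written in kaa coroutine style, so
# calling it is assumed to return a kaa.InProgress object once the method is
# wrapped with @kaa.coroutine() elsewhere in this module. 'feed' is again a
# Feed instance obtained from this package.
def _example_update(feed):
    inprogress = feed.update(verbose=True)
    # stop the kaa main loop once the update has finished
    inprogress.connect(lambda success: kaa.main.stop())
    kaa.main.run()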