def extract_features(self, path):
    """Parse tweets from *path* and extract features for every corpus text.

    Populates self.corpus (via the tweet JSON parser, limited to
    self.number_of_authors) and appends one feature vector per text to
    self.train_labeled_features, then marks self.state as 'extracted'.

    :param path: location of the tweet JSON data to parse
    :return: the corpus size as a string
    """
    # NOTE: an earlier blog/XML ingestion path (untangle over a 'blogs'
    # directory) was removed here — it was fully commented out and used a
    # bare `except`.
    self.corpus = twitter_json_parser.TweetJsonParser(path).run(self.number_of_authors)
    print(f'{len(self.corpus)} texts')

    ex = extractor.Extractor()
    for text in tqdm(self.corpus):
        # Each corpus entry is a sequence; element 0 holds the raw text.
        self.train_labeled_features.append(ex.extract_all_features(text[0]))

    self.state = 'extracted'
    return str(len(self.corpus))
def analyse_text(self, text, mode):
    """Extract features for a single *text*, run cluster analysis, and report.

    :param text: the raw text to analyse
    :param mode: analysis mode forwarded to the cluster utility and reporter
    :return: the generated report
    """
    feature_extractor = extractor.Extractor()
    # analyse() expects a batch, so wrap the single feature vector in a list.
    features = [feature_extractor.extract_all_features(text)]
    results = self.cu.analyse(features, mode)
    return reporter.Reporter().report(results, [text], mode)
def train(self, items):
    """Cluster *items* with DBSCAN over pairwise NCD distances.

    Populates self.clusters (label -> Cluster), self.noise, and fits a
    precomputed-distance KNeighborsClassifier on self.neighbors for later
    predict() calls.

    :param items: iterable of items to cluster (materialized to a list)
    """
    self.clusters = {}
    self.noise = []
    items = list(items)

    if self.verbose:
        sys.stderr.write("{0}: Items to train\n".format(len(items)))

    # Extract the features we want to use for clustering from the items
    self.extractor = extractor.Extractor()
    self.features = self.extractor.fit_transform(items)

    # os.cpu_count() can return None; -1 tells scikit-learn "all cores".
    jobs = os.cpu_count() or -1
    start = time.perf_counter()

    # Initialize the NCD code with our log feature. Currently only one
    # feature is used: the normalized log.
    X = ncd.prepare(map(lambda features: features[0], self.features))

    # Calculate all the pairwise distances between the items in question.
    # The scikit DBSCAN implementation does this anyway, poorly, so we do it
    # ahead of time and parallelize it here.
    #
    # TODO: This takes forever and is an O(n^2) operation. There is
    # significant room for improvement both here and in the following DBSCAN
    # usage: feature/item selection, BIRCH, ball trees, etc. could make this
    # better/faster.
    matrix = sklearn.metrics.pairwise.pairwise_distances(X, metric=ncd.metric, n_jobs=jobs)

    if self.verbose:
        sys.stderr.write("{0}: Computed distances in {1} seconds on {2} cores\n".format(
            int((len(self.features) * len(self.features)) / 2),
            int(time.perf_counter() - start), jobs
        ))

    # Actually perform the clustering. This is fast compared to above.
    # BUG FIX: DBSCAN's min_samples must be a positive integer; the previous
    # `len(items) / 10` was a float under Python 3 (and could be < 1 for
    # small corpora), which scikit-learn rejects. Floor-divide and clamp.
    min_samples = max(1, min(self.min_samples, len(items) // 10))
    dbs = sklearn.cluster.DBSCAN(metric='precomputed', eps=self.eps, min_samples=min_samples)
    dbs.fit(matrix)
    labels = dbs.labels_

    # Group item indexes by cluster label; DBSCAN marks noise with -1.
    clusters = {}
    noise = []
    for i, label in enumerate(labels):
        if label == -1:
            noise.append(i)
        else:
            clusters.setdefault(label, []).append(i)

    self.clusters = {}
    for label, indexes in clusters.items():
        self.clusters[label] = Cluster(label, indexes, items)
    # NOTE(review): the noise Cluster is built without the `items` argument
    # the labeled clusters receive above — confirm Cluster accepts that.
    self.noise = Cluster(None, noise)

    # Print out a rough description of that
    if self.verbose:
        sys.stderr.write("{0}: Clusters ({1} items, {2} noise)\n".format(
            len(self.clusters.keys()), len(items) - len(noise), len(noise)
        ))

    # Setup our neighbors classifier for predict()
    self.neighbors = sklearn.neighbors.KNeighborsClassifier(metric='precomputed', weights='distance')
    self.neighbors.fit(matrix, labels)