예제 #1
0
    def extract_features(self, path):
        """Load the corpus from *path* and extract features for every entry.

        ``twitter_json_parser.TweetJsonParser(path).run(...)`` is called with
        ``self.number_of_authors`` and its result is stored in ``self.corpus``.
        For each corpus entry, the features of its first element are appended
        to ``self.train_labeled_features``. When the loop completes,
        ``self.state`` is set to ``'extracted'``.

        Returns:
            The number of corpus entries, as a string.
        """
        parser = twitter_json_parser.TweetJsonParser(path)
        self.corpus = parser.run(self.number_of_authors)
        print('{0} texts'.format(len(self.corpus)))

        feature_extractor = extractor.Extractor()
        # Only element 0 of each corpus entry is handed to the extractor;
        # tqdm wraps the iteration to show progress.
        for entry in tqdm(self.corpus):
            features = feature_extractor.extract_all_features(entry[0])
            self.train_labeled_features.append(features)

        self.state = 'extracted'
        return str(len(self.corpus))
예제 #2
0
    def analyse_text(self, text, mode):
        """Extract features from one text, analyse them and build a report.

        The features of *text* are wrapped in a one-element list and passed,
        together with *mode*, to ``self.cu.analyse``. The outcome is then given
        to a fresh ``reporter.Reporter`` along with ``[text]`` and *mode*.

        Returns:
            Whatever ``Reporter.report`` returns for this single text.
        """
        features = extractor.Extractor().extract_all_features(text)
        cluster_results = self.cu.analyse([features], mode)
        return reporter.Reporter().report(cluster_results, [text], mode)
예제 #3
0
    def train(self, items):
        """Cluster *items* with DBSCAN and fit a neighbors classifier on the result.

        Steps performed:

        1. Fit a new ``extractor.Extractor`` on the items and keep both the
           extractor and the resulting features on ``self``.
        2. Compute the full pairwise distance matrix with ``ncd.metric`` over
           the first feature of every item.
        3. Run DBSCAN on that precomputed matrix.
        4. Store one ``Cluster`` per DBSCAN label in ``self.clusters`` and the
           items labelled ``-1`` in ``self.noise``.
        5. Fit ``self.neighbors`` (a ``KNeighborsClassifier``) on the same
           matrix and labels.

        Progress messages go to stderr when ``self.verbose`` is set.

        Args:
            items: Iterable of items to train on; it is materialized into a
                list so it can be measured and indexed.
        """
        # Drop the results of any earlier training run before starting.
        self.clusters = {}
        self.noise = []

        items = list(items)

        if self.verbose:
            sys.stderr.write("{0}: Items to train\n".format(len(items)))

        # Extract the features we want to use for clustering from the items
        self.extractor = extractor.Extractor()
        self.features = self.extractor.fit_transform(items)

        # cpu_count() may return None; -1 tells scikit-learn to use every core.
        jobs = os.cpu_count() or -1
        start = time.perf_counter()

        # Initialize the NCD code with our log feature. Currently only
        # one feature is used: the normalized log
        X = ncd.prepare(map(lambda features: features[0], self.features))

        # Calculate all the pairwise distances between the items in question.
        # The scikit DBSCAN implementation does this anyway, poorly. So why not
        # do it ahead of time and parallelize it ... which we do here.
        #
        # TODO: This takes forever and is an O(n^2) operation
        # There is significant room for improvement both here, and in the following
        # DBSCAN usage and implementation. Techniques such as feature/item selection
        # BIRCH, ball trees, or many other things could make this better/faster
        matrix = sklearn.metrics.pairwise.pairwise_distances(X, metric=ncd.metric, n_jobs=jobs)

        if self.verbose:
            sys.stderr.write("{0}: Computed distances in {1} seconds on {2} cores\n".format(
                int((len(self.features) * len(self.features)) / 2),
                int(time.perf_counter() - start), jobs
            ))

        # Actually perform the clustering. This is fast compared to above.
        #
        # Cap min_samples at a tenth of the item count. The cap is computed
        # with integer ceiling division and clamped to at least 1: the former
        # `len(items) / 10` is a float under Python 3 (and below 1 for fewer
        # than ten items), which scikit-learn's parameter validation rejects.
        # Rounding up selects the same core points as the fractional threshold
        # did, because neighbor counts are whole numbers.
        tenth_rounded_up = -(-len(items) // 10)
        min_samples = max(1, min(self.min_samples, tenth_rounded_up))
        dbs = sklearn.cluster.DBSCAN(metric='precomputed', eps=self.eps, min_samples=min_samples)
        dbs.fit(matrix)
        labels = dbs.labels_

        # Group item indexes by their DBSCAN label; -1 marks noise.
        grouped = {}
        noise = []
        for index, label in enumerate(labels):
            if label == -1:
                noise.append(index)
            else:
                grouped.setdefault(label, []).append(index)

        self.clusters = {
            label: Cluster(label, indexes, items)
            for label, indexes in grouped.items()
        }
        # NOTE(review): unlike the clusters above, the noise Cluster is built
        # without `items` -- confirm this matches Cluster's signature.
        self.noise = Cluster(None, noise)

        # Print out a rough description of that
        if self.verbose:
            sys.stderr.write("{0}: Clusters ({1} items, {2} noise)\n".format(
                len(self.clusters),
                len(items) - len(noise),
                len(noise)
            ))

        # Setup our neighbors classifier for predict()
        self.neighbors = sklearn.neighbors.KNeighborsClassifier(metric='precomputed', weights='distance')
        self.neighbors.fit(matrix, labels)