def test_cryo_diff_pipe_init(self):
    pipeline = Pipeline(BasicCleaner())
    output1 = pipeline(self.docs)

    pipeline = Pipeline(BasicCleaner())
    output2 = pipeline(self.docs)
    self.assertEqual(output1, output2)

    # Make sure cryo picks up on differently initialized classes
    pipeline = Pipeline(BasicCleaner(lowercase=False))
    output3 = pipeline(self.docs)
    self.assertNotEqual(output1, output3)
def test_valid_branching_pipeline_end_with_branches(self):
    class A(Pipe):
        input = Pipe.type.a
        # A does not output tuples
        output = Pipe.type.x

    class B(Pipe):
        input = Pipe.type.x
        output = Pipe.type.b_out

    class C(Pipe):
        input = Pipe.type.x
        output = Pipe.type.c_out

    class D(Pipe):
        input = Pipe.type.x
        output = Pipe.type.d_out

    try:
        Pipeline(A(), (B(), C(), D()))
    except Exception:
        self.fail('Valid pipeline raised exception')
def test_valid_branching_pipeline_branches_to_branches(self):
    class A(Pipe):
        input = Pipe.type.a
        # A outputs tuples
        output = (Pipe.type.b, Pipe.type.c, Pipe.type.d)

    class B(Pipe):
        input = Pipe.type.b
        output = Pipe.type.b

    class C(Pipe):
        input = Pipe.type.c
        output = Pipe.type.c

    class D(Pipe):
        input = Pipe.type.d
        output = Pipe.type.d

    class E(Pipe):
        input = (Pipe.type.b, Pipe.type.c, Pipe.type.d)
        output = Pipe.type.e

    try:
        Pipeline(A(), (B(), C(), D()), (B(), C(), D()), E())
    except Exception:
        self.fail('Valid pipeline raised exception')
def test_valid_branching_pipeline_multiout_to_branches(self):
    class A(Pipe):
        input = Pipe.type.a
        output = (Pipe.type.b, Pipe.type.c, Pipe.type.d)

    class B(Pipe):
        input = Pipe.type.b
        output = Pipe.type.b_out

    class C(Pipe):
        input = Pipe.type.c
        output = Pipe.type.c_out

    class D(Pipe):
        input = Pipe.type.d
        output = Pipe.type.d_out

    class E(Pipe):
        input = (Pipe.type.b_out, Pipe.type.c_out, Pipe.type.d_out)
        output = Pipe.type.e

    try:
        Pipeline(A(), (B(), C(), D()), E())
    except Exception:
        self.fail('Valid pipeline raised exception')
def test_wikipedia(self):
    # TODO: not sure if this is implemented correctly; not getting 1. sim
    # for identical documents...
    p = Pipeline(
        (IdentityPipe(Pipe.type.docs), RAKETokenizer()),
        doc_sim.WikipediaSimilarity()
    )
    sims = p(self.docs)
    self.assertEqual(sims.shape, (3, 3))
def test_nested_pipeline(self):
    docs = ['<div>{}</div>'.format(d) for d in self.docs]
    expected = [[
        'time', 'vast', 'empty', 'space', 'time', 'continue', 'dimension',
        'hold', 'nonexistence', 'great', 'spring', 'displeased',
        'nicolas cage', 'existence'
    ], [
        'galactic', 'ocean', 'float', 'hand', 'grasp', 'look', 'glorious',
        'eye', 'instantaneously', 'begin', 'stretch', 'bend', 'find',
        'amusement', 'handling', 'sacred', 'galactic', 'sea', 'mighty',
        'hand', 'ocean', 'sacred', 'warmth', 'mighty', 'palm', 'cage reach',
        'nicolas cage', 'reach'
    ]]

    nested_pipeline = Pipeline(HTMLCleaner(), BasicCleaner(), refresh=True)
    pipeline = Pipeline(nested_pipeline, OverkillTokenizer(), refresh=True)
    output = pipeline(docs)
    for o, e in zip(output, expected):
        self.assertEqual(set(o), set(e))
def test_entkey(self):
    class FauxIDF():
        def __getitem__(self, key):
            return 1.

    p = Pipeline(
        (RAKETokenizer(), Entities()),
        doc_sim.EntKeySimilarity(idf=FauxIDF())
    )
    sims = p(self.docs)
    self.assertEqual(sims.shape, (3, 3))
def test_nested_multipipeline(self):
    docs = ['<div>{}</div>'.format(d) for d in self.docs]
    expected = [[[
        'vast', 'empty', 'space', 'time', 'continue', 'dimension', 'hold',
        'nonexistence', 'great', 'spring', 'displeased', 'nicolas cage',
        'existence'
    ], [
        'galactic', 'ocean', 'float', 'hand', 'grasp', 'look', 'glorious',
        'eye', 'instantaneously', 'begin', 'stretch', 'bend', 'find',
        'amusement', 'handling', 'sacred', 'galactic', 'sea', 'mighty',
        'hand', 'ocean', 'sacred', 'warmth', 'mighty', 'palm', 'cage reach',
        'nicolas cage', 'reach'
    ]], [[
        'great nicolas cage', 'vast empty', 'sprung', 'nonexistence',
        'dimensions', 'held', 'existence', 'displeased', 'continue', 'time',
        'space'
    ], [
        'sacred galactic seas', 'galactic ocean floated',
        'nicolas cage reached', 'cage reached', 'sacred warmth',
        'glorious eyes', 'mighty palms', 'found amusement',
        'instantaneously began', 'mighty hand', 'ocean', 'hand', 'looked',
        'stretch', 'grasped', 'handling', 'bend'
    ]]]

    nested_multipipeline = Pipeline(
        BasicCleaner(),
        [OverkillTokenizer(min_count=1, threshold=0.1), RAKETokenizer()],
        refresh=True)
    pipeline = Pipeline(HTMLCleaner(), nested_multipipeline, refresh=True)
    outputs = pipeline(docs)
    for i, output in enumerate(outputs):
        for o, e in zip(output, expected[i]):
            self.assertEqual(set(o), set(e))
def test_branching_pipeline(self):
    class A(Pipe):
        input = Pipe.type.vals
        output = Pipe.type.vals

        def __call__(self, vals):
            return [v + 1 for v in vals]

    class B(Pipe):
        input = Pipe.type.vals
        output = Pipe.type.vals

        def __call__(self, vals):
            return [v + 2 for v in vals]

    class C(Pipe):
        input = Pipe.type.vals
        output = Pipe.type.vals

        def __call__(self, vals):
            return [v + 3 for v in vals]

    class D(Pipe):
        input = Pipe.type.vals
        output = Pipe.type.vals

        def __call__(self, vals):
            return [v + 4 for v in vals]

    class E(Pipe):
        input = (Pipe.type.vals, Pipe.type.vals, Pipe.type.vals)
        output = Pipe.type.vals

        def __call__(self, vals1, vals2, vals3):
            return [
                sum([v1, v2, v3])
                for v1, v2, v3 in zip(vals1, vals2, vals3)
            ]

    p = Pipeline(A(), (B(), C(), D()), (B(), C(), D()), E())
    out = p([1, 2, 3, 4])
    self.assertEqual(out, [24, 27, 30, 33])
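
    # Trace for the first input, to show where 24 comes from: A maps
    # 1 -> 2; the first branch stage fans that out through B, C, D ->
    # (4, 5, 6); the second branch stage applies B, C, D to the respective
    # streams -> (6, 8, 10); E sums them -> 24.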
def test_valid_branching_pipeline_start_with_branches(self):
    class B(Pipe):
        input = Pipe.type.x
        output = Pipe.type.b_out

    class C(Pipe):
        input = Pipe.type.x
        output = Pipe.type.c_out

    class D(Pipe):
        input = Pipe.type.x
        output = Pipe.type.d_out

    class E(Pipe):
        input = (Pipe.type.b_out, Pipe.type.c_out, Pipe.type.d_out)
        output = Pipe.type.e

    try:
        Pipeline((B(), C(), D()), E())
    except Exception:
        self.fail('Valid pipeline raised exception')
def cluster():
    data = request.get_json()

    # Wrangle posted comments into the minimal format needed for processing
    comments = [Comment({
        'commentID': c['id'],
        'commentBody': c['body'],
        'recommendations': c['score'],
        'userDisplayName': c['author'],
        'createDate': 0,
        'replies': []  # ignoring replies for now
    }) for c in data['comments']]

    # Remove duplicates
    docs = list({c.body for c in comments})

    preprocess = Pipeline(HTMLCleaner(), Cleaner())

    names = [
        'lda_hscluster', 'lda_dbscan',
        'semsim_hscluster', 'semsim_dbscan',
        'bow_hscluster', 'bow_dbscan',
        'aspects'
    ]
    pipelines = [
        Pipeline(preprocess, BoW(), LDA(n_topics=10),
                 Distance(metric='euclidean'), [HSCluster(), DBSCAN()]),
        Pipeline(preprocess, Overkill(), SemSim(), [HSCluster(), DBSCAN()]),
        Pipeline(preprocess, BoW(), Distance(metric='euclidean'),
                 [HSCluster(), DBSCAN()]),
    ]

    results = []
    for p in pipelines:
        print('Running pipeline:', p)
        outputs = p(docs)
        # Each pipeline ends in a multi-pipe ([HSCluster(), DBSCAN()]),
        # so it yields one set of clusters per clustering method; append
        # one result per output so results lines up with names
        for out in outputs:
            doc_clusters = []
            for clus in out:
                clus_docs = [docs[i] for i in clus]
                doc_clusters.append(clus_docs)
            results.append(doc_clusters)

    # Get sentences, filtered fairly aggressively
    sents = [[sent for sent in sent_tokenize(d) if prefilter(sent)]
             for d in docs]
    sents = [sent for s in sents for sent in s]

    aspect = Pipeline(preprocess, Overkill(), AspectCluster())
    output = aspect(sents)
    highlighted = []
    for k, aspect_sents in output:
        highlighted.append(
            [markup_highlights(k, sents[i]) for i in aspect_sents])
    results.append(highlighted)

    return jsonify(results=dict(zip(names, results)))
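
# A sketch of the JSON payload the view above expects, assuming it is
# registered as a Flask POST route. The field names come from the keys read
# off data['comments']; the values here are made up for illustration:
#
#   {
#       "comments": [
#           {"id": 1, "body": "<p>A comment.</p>", "score": 3,
#            "author": "jane"}
#       ]
#   }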
from time import time
from glob import glob
from broca import Pipeline
from broca.preprocess import HTMLCleaner, BasicCleaner
from broca.tokenize.keyword import OverkillTokenizer
from broca.knowledge.idf import train_idf
from broca.knowledge.util import files_stream

s = time()

print('Loading documents...')
files = glob('bodies/*.txt')
docs = [d for d in files_stream(files)]

tkn = OverkillTokenizer(n_jobs=-1)
pipeline = Pipeline(HTMLCleaner(n_jobs=-1), BasicCleaner(n_jobs=-1), tkn,
                    refresh=True)

print('Computing pipeline...')
tokens = pipeline(docs)

print('Training IDF...')
train_idf(tokens, out='nyt_idf.json')

print('Took {:.2f}s'.format(time() - s))

tkn.bigram.save('nyt.bigram')
tkn.trigram.save('nyt.trigram')
import json
from broca import Pipeline
from broca.tokenize.keyword import Overkill
from broca.preprocess import Cleaner, HTMLCleaner
from broca.vectorize import BoW
from geiger.pipes import LDA, SemSim, HSCluster, DBSCAN, AspectCluster, Distance

docs = [
    d['body'] for d in json.load(open('examples/climate_example.json', 'r'))
]

preprocess = Pipeline(HTMLCleaner(), Cleaner())
pipelines = [
    Pipeline(preprocess, BoW(), LDA(n_topics=10),
             Distance(metric='euclidean'), [HSCluster(), DBSCAN()]),
    Pipeline(preprocess, Overkill(), SemSim(), [HSCluster(), DBSCAN()]),
    Pipeline(preprocess, BoW(), Distance(metric='euclidean'),
             [HSCluster(), DBSCAN()]),
]

# for p in pipelines:
#     print('Running pipeline:', p)
#     outputs = p(docs)
#     doc_clusters = []
#     for out in outputs:
#         for clus in out:
#             clus_docs = []
#             for id in clus:
#                 clus_docs.append(docs[id])