Example #1
    def test_cryo_diff_pipe_init(self):
        pipeline = Pipeline(BasicCleaner())
        output1 = pipeline(self.docs)

        pipeline = Pipeline(BasicCleaner())
        output2 = pipeline(self.docs)
        self.assertEqual(output1, output2)

        # Make sure cryo picks up on differently initialized classes
        pipeline = Pipeline(BasicCleaner(lowercase=False))
        output3 = pipeline(self.docs)
        self.assertNotEqual(output1, output3)
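This test exercises broca's output caching (the "cryo" of the test name): two identically configured pipelines return equal, cached results, while a differently initialized pipe must produce a distinct cache entry. A minimal sketch of the contract as read from the test; the keyed-on-init-arguments semantics is inferred here, not documented, and the refresh=True flag in later examples presumably forces recomputation past this cache.

# Sketch of the caching contract this test implies (inferred semantics):
p1 = Pipeline(BasicCleaner())                 # computes, then caches
p2 = Pipeline(BasicCleaner())                 # same config -> cache hit
p3 = Pipeline(BasicCleaner(lowercase=False))  # new init args -> new entry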
Example #2
    def test_valid_branching_pipeline_end_with_branches(self):
        class A(Pipe):
            input = Pipe.type.a
            # A does not output tuples
            output = Pipe.type.x

        class B(Pipe):
            input = Pipe.type.x
            output = Pipe.type.b_out

        class C(Pipe):
            input = Pipe.type.x
            output = Pipe.type.c_out

        class D(Pipe):
            input = Pipe.type.x
            output = Pipe.type.d_out

        try:
            Pipeline(
                A(),
                (B(), C(), D()),
            )
        except Exception:
            self.fail('Valid pipeline raised exception')
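Here A produces a single x (per the comment, it does not output tuples), so the trailing tuple (B(), C(), D()) receives that same x fanned out to each branch, and the pipeline validly ends with three parallel outputs. Example #9 below confirms this fan-out behavior with concrete values.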
Example #3
    def test_valid_branching_pipeline_branches_to_branches(self):
        class A(Pipe):
            input = Pipe.type.a
            # A outputs tuples
            output = (Pipe.type.b, Pipe.type.c, Pipe.type.d)

        class B(Pipe):
            input = Pipe.type.b
            output = Pipe.type.b

        class C(Pipe):
            input = Pipe.type.c
            output = Pipe.type.c

        class D(Pipe):
            input = Pipe.type.d
            output = Pipe.type.d

        class E(Pipe):
            input = (Pipe.type.b, Pipe.type.c, Pipe.type.d)
            output = Pipe.type.e

        try:
            Pipeline(A(), (B(), C(), D()), (B(), C(), D()), E())
        except Exception:
            self.fail('Valid pipeline raised exception')
Example #4
    def test_valid_branching_pipeline_multiout_to_branches(self):
        class A(Pipe):
            input = Pipe.type.a
            output = (Pipe.type.b, Pipe.type.c, Pipe.type.d)

        class B(Pipe):
            input = Pipe.type.b
            output = Pipe.type.b_out

        class C(Pipe):
            input = Pipe.type.c
            output = Pipe.type.c_out

        class D(Pipe):
            input = Pipe.type.d
            output = Pipe.type.d_out

        class E(Pipe):
            input = (Pipe.type.b_out, Pipe.type.c_out, Pipe.type.d_out)
            output = Pipe.type.e

        try:
            Pipeline(A(), (B(), C(), D()), E())
        except Exception:
            self.fail('Valid pipeline raised exception')
Example #5
    def test_wikipedia(self):
        # TODO: not sure if this is implemented correctly; not getting 1.0
        # similarity for identical documents...
        p = Pipeline(
            (IdentityPipe(Pipe.type.docs), RAKETokenizer()),
            doc_sim.WikipediaSimilarity()
        )
        sims = p(self.docs)
        self.assertEqual(sims.shape, (3, 3))
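Given the TODO above, a quick diagnostic (a sketch, not part of the test) is to inspect the matrix diagonal, which a normalized similarity should put at 1.0 for identical documents:

# Hypothetical diagnostic for the TODO above; assumes sims is a numpy array.
import numpy as np
print(np.diag(sims))  # self-similarities; expected ~1.0 if normalized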
Example #6
    def test_nested_pipeline(self):
        docs = ['<div>{}</div>'.format(d) for d in self.docs]
        expected = [
            [
                'time', 'vast', 'empty', 'space', 'time', 'continue',
                'dimension', 'hold', 'nonexistence', 'great', 'spring',
                'displeased', 'nicolas cage', 'existence'
            ],
            [
                'galactic', 'ocean', 'float', 'hand', 'grasp', 'look',
                'glorious', 'eye', 'instantaneously', 'begin', 'stretch',
                'bend', 'find', 'amusement', 'handling', 'sacred',
                'galactic', 'sea', 'mighty', 'hand', 'ocean', 'sacred',
                'warmth', 'mighty', 'palm', 'cage reach', 'nicolas cage',
                'reach'
            ]
        ]
        nested_pipeline = Pipeline(HTMLCleaner(), BasicCleaner(), refresh=True)
        pipeline = Pipeline(nested_pipeline, OverkillTokenizer(), refresh=True)
        output = pipeline(docs)
        for o, e in zip(output, expected):
            self.assertEqual(set(o), set(e))
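A Pipeline can thus serve as a pipe inside another Pipeline. A hedged reading is that nesting simply inlines the inner pipes, so the composition above should behave like this flat pipeline (an inferred equivalence, not asserted by the test):

# Flat equivalent of the nested composition above (assumed, not asserted):
flat = Pipeline(HTMLCleaner(), BasicCleaner(), OverkillTokenizer(), refresh=True)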
Example #7
    def test_entkey(self):
        class FauxIDF:
            def __getitem__(self, key):
                return 1.

        p = Pipeline(
            (RAKETokenizer(), Entities()),
            doc_sim.EntKeySimilarity(idf=FauxIDF())
        )

        sims = p(self.docs)
        self.assertEqual(sims.shape, (3, 3))
Example #8
    def test_nested_multipipeline(self):
        docs = ['<div>{}</div>'.format(d) for d in self.docs]
        expected = [
            [
                [
                    'vast', 'empty', 'space', 'time', 'continue',
                    'dimension', 'hold', 'nonexistence', 'great', 'spring',
                    'displeased', 'nicolas cage', 'existence'
                ],
                [
                    'galactic', 'ocean', 'float', 'hand', 'grasp', 'look',
                    'glorious', 'eye', 'instantaneously', 'begin', 'stretch',
                    'bend', 'find', 'amusement', 'handling', 'sacred',
                    'galactic', 'sea', 'mighty', 'hand', 'ocean', 'sacred',
                    'warmth', 'mighty', 'palm', 'cage reach', 'nicolas cage',
                    'reach'
                ]
            ],
            [
                [
                    'great nicolas cage', 'vast empty', 'sprung',
                    'nonexistence', 'dimensions', 'held', 'existence',
                    'displeased', 'continue', 'time', 'space'
                ],
                [
                    'sacred galactic seas', 'galactic ocean floated',
                    'nicolas cage reached', 'cage reached', 'sacred warmth',
                    'glorious eyes', 'mighty palms', 'found amusement',
                    'instantaneously began', 'mighty hand', 'ocean', 'hand',
                    'looked', 'stretch', 'grasped', 'handling', 'bend'
                ]
            ]
        ]
        nested_multipipeline = Pipeline(
            BasicCleaner(),
            [OverkillTokenizer(min_count=1, threshold=0.1),
             RAKETokenizer()],
            refresh=True)
        pipeline = Pipeline(HTMLCleaner(), nested_multipipeline, refresh=True)
        outputs = pipeline(docs)
        for i, output in enumerate(outputs):
            for o, e in zip(output, expected[i]):
                self.assertEqual(set(o), set(e))
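Note the output shape: the list [OverkillTokenizer(...), RAKETokenizer()] makes the nested stage a multi-pipeline with one output stream per listed alternative, which is why expected is indexed by branch first and then by document, and why the test iterates over outputs before zipping documents.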
Example #9
    def test_branching_pipeline(self):
        class A(Pipe):
            input = Pipe.type.vals
            output = Pipe.type.vals

            def __call__(self, vals):
                return [v + 1 for v in vals]

        class B(Pipe):
            input = Pipe.type.vals
            output = Pipe.type.vals

            def __call__(self, vals):
                return [v + 2 for v in vals]

        class C(Pipe):
            input = Pipe.type.vals
            output = Pipe.type.vals

            def __call__(self, vals):
                return [v + 3 for v in vals]

        class D(Pipe):
            input = Pipe.type.vals
            output = Pipe.type.vals

            def __call__(self, vals):
                return [v + 4 for v in vals]

        class E(Pipe):
            input = (Pipe.type.vals, Pipe.type.vals, Pipe.type.vals)
            output = Pipe.type.vals

            def __call__(self, vals1, vals2, vals3):
                return [
                    sum([v1, v2, v3])
                    for v1, v2, v3 in zip(vals1, vals2, vals3)
                ]

        p = Pipeline(A(), (B(), C(), D()), (B(), C(), D()), E())

        out = p([1, 2, 3, 4])
        self.assertEqual(out, [24, 27, 30, 33])
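Tracing the arithmetic confirms the assertion: A maps [1, 2, 3, 4] to [2, 3, 4, 5]; the first branch fans that out to B, C, D, yielding [4, 5, 6, 7], [5, 6, 7, 8], and [6, 7, 8, 9]; the second branch applies B, C, D positionally to those streams, yielding [6, 7, 8, 9], [8, 9, 10, 11], and [10, 11, 12, 13]; E then sums elementwise (6 + 8 + 10 = 24 for the first position), giving [24, 27, 30, 33].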
Example #10
    def test_valid_branching_pipeline_start_with_branches(self):
        class B(Pipe):
            input = Pipe.type.x
            output = Pipe.type.b_out

        class C(Pipe):
            input = Pipe.type.x
            output = Pipe.type.c_out

        class D(Pipe):
            input = Pipe.type.x
            output = Pipe.type.d_out

        class E(Pipe):
            input = (Pipe.type.b_out, Pipe.type.c_out, Pipe.type.d_out)
            output = Pipe.type.e

        try:
            Pipeline((B(), C(), D()), E())
        except Exception:
            self.fail('Valid pipeline raised exception')
Example #11
def cluster():
    data = request.get_json()

    # Wrangle posted comments into the minimal format needed for processing
    comments = [
        Comment({
            'commentID': c['id'],
            'commentBody': c['body'],
            'recommendations': c['score'],
            'userDisplayName': c['author'],
            'createDate': 0,
            'replies': []  # ignoring replies for now
        }) for c in data['comments']
    ]

    # Remove duplicates
    docs = list({c.body for c in comments})

    preprocess = Pipeline(HTMLCleaner(), Cleaner())

    names = [
        'lda_hscluster', 'lda_dbscan', 'semsim_hscluster', 'semsim_dbscan',
        'bow_hscluster', 'bow_dbscan', 'aspects'
    ]
    pipelines = [
        Pipeline(preprocess, BoW(), LDA(n_topics=10),
                 Distance(metric='euclidean'),
                 [HSCluster(), DBSCAN()]),
        Pipeline(preprocess, Overkill(), SemSim(),
                 [HSCluster(), DBSCAN()]),
        Pipeline(preprocess, BoW(), Distance(metric='euclidean'),
                 [HSCluster(), DBSCAN()]),
    ]

    results = []
    for p in pipelines:
        print('Running pipeline:', p)
        outputs = p(docs)

        for out in outputs:
            # reset per output so each branch result (e.g. HSCluster vs
            # DBSCAN) gets its own entry in results
            doc_clusters = []
            for clus in out:
                clus_docs = [docs[i] for i in clus]
                doc_clusters.append(clus_docs)
            results.append(doc_clusters)

    # Get sentences, filtered fairly aggressively
    sents = [[sent for sent in sent_tokenize(d) if prefilter(sent)]
             for d in docs]
    sents = [sent for s in sents for sent in s]
    aspect = Pipeline(preprocess, Overkill(), AspectCluster())

    output = aspect(sents)
    highlighted = []
    for k, aspect_sents in output:
        highlighted.append(
            [markup_highlights(k, sents[i]) for i in aspect_sents])
    results.append(highlighted)

    return jsonify(results=dict(zip(names, results)))
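A hypothetical client call for this view; the /cluster route and port are assumptions (no route decorator is shown), and only id, body, score, and author are read from each posted comment:

# Hypothetical client for the view above; the URL is an assumption.
import requests

payload = {'comments': [
    {'id': '1', 'body': 'Great piece.', 'score': 2, 'author': 'alice'},
    {'id': '2', 'body': 'Strongly disagree.', 'score': 0, 'author': 'bob'},
]}
resp = requests.post('http://localhost:5000/cluster', json=payload)
print(resp.json()['results'].keys())  # lda_hscluster, ..., aspects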
Example #12
from time import time
from glob import glob
from broca import Pipeline
from broca.preprocess import HTMLCleaner, BasicCleaner
from broca.tokenize.keyword import OverkillTokenizer
from broca.knowledge.idf import train_idf
from broca.knowledge.util import files_stream

s = time()
print('Loading documents...')
files = glob('bodies/*.txt')
docs = list(files_stream(files))

tkn = OverkillTokenizer(n_jobs=-1)

pipeline = Pipeline(HTMLCleaner(n_jobs=-1),
                    BasicCleaner(n_jobs=-1),
                    tkn,
                    refresh=True)

print('Computing pipeline...')
tokens = pipeline(docs)

print('Training IDF...')
train_idf(tokens, out='nyt_idf.json')

print('Took {:.2f}s'.format(time() - s))

tkn.bigram.save('nyt.bigram')
tkn.trigram.save('nyt.trigram')
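A sketch of loading these artifacts back in. The IDF JSON layout and the use of gensim's Phrases.load are guesses from the save calls above, not confirmed broca API:

# Hedged sketch: reload the saved IDF table and phrase models.
import json
from gensim.models import Phrases  # assumes tkn.bigram/.trigram are gensim Phrases

idf = json.load(open('nyt_idf.json'))
bigram = Phrases.load('nyt.bigram')
trigram = Phrases.load('nyt.trigram')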
Example #13
import json
from broca import Pipeline
from broca.tokenize.keyword import Overkill
from broca.preprocess import Cleaner, HTMLCleaner
from broca.vectorize import BoW
from geiger.pipes import LDA, SemSim, HSCluster, DBSCAN, AspectCluster, Distance

docs = [
    d['body'] for d in json.load(open('examples/climate_example.json', 'r'))
]

preprocess = Pipeline(HTMLCleaner(), Cleaner())

pipelines = [
    Pipeline(preprocess, BoW(), LDA(n_topics=10), Distance(metric='euclidean'),
             [HSCluster(), DBSCAN()]),
    Pipeline(preprocess, Overkill(), SemSim(),
             [HSCluster(), DBSCAN()]),
    Pipeline(preprocess, BoW(), Distance(metric='euclidean'),
             [HSCluster(), DBSCAN()]),
]

# for p in pipelines:
#     print('Running pipeline:', p)
#     outputs = p(docs)
#     doc_clusters = []
#     for out in outputs:
#         for clus in out:
#             clus_docs = []
#             for id in clus:
#                 clus_docs.append(docs[id])