def test_add_too_big(self):
    """Check that an oversized document is rejected.

    The corpus is created with ``chunk_size=12`` and adding the text
    ``u'12345'`` under id 1 is expected to raise
    ``Corpus.ExceptionTooBig``.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)  if False else Corpus.create(path, chunk_size=12)
    c = None
    try:
        c = Corpus(path)
        with self.assertRaises(Corpus.ExceptionTooBig):
            c.add(u'12345', 1)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_add_too_big(self):
    """Check that an oversized document is rejected.

    The corpus is created with ``chunk_size=12`` and adding the text
    ``u'12345'`` under id 1 is expected to raise
    ``Corpus.ExceptionTooBig``.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path, chunk_size=12)
    c = None
    try:
        c = Corpus(path)
        with self.assertRaises(Corpus.ExceptionTooBig):
            c.add(u'12345', 1)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_add_get_duplicate(self):
    """Check that re-using a document id raises ``Corpus.ExceptionDuplicate``.

    The first ``add`` is performed outside the ``assertRaises`` block so
    that only the repeated insert under id 1 can satisfy the expectation;
    an error raised by the first insert now fails the test instead of
    being mistaken for the expected duplicate error.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        with self.assertRaises(Corpus.ExceptionDuplicate):
            c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_add_get_duplicate(self):
    """Check that re-using a document id raises ``Corpus.ExceptionDuplicate``.

    The first ``add`` is performed outside the ``assertRaises`` block so
    that only the repeated insert under id 1 can satisfy the expectation;
    an error raised by the first insert now fails the test instead of
    being mistaken for the expected duplicate error.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        with self.assertRaises(Corpus.ExceptionDuplicate):
            c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_chunking(self):
    """Check which chunk the second document is indexed in.

    Two documents with the text ``u'12345'`` are added to a corpus created
    with ``chunk_size=13``; the index entry looked up for id 2 must report
    chunk number 1.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path, chunk_size=13)
    c = None
    try:
        c = Corpus(path)
        c.add(u'12345', 1)
        c.add(u'12345', 2)
        # The index entry is unpacked as a 4-tuple; only the chunk number
        # is asserted on, the remaining fields are intentionally unused.
        chunk_number, _offset, _head_len, _text_len = c.get_idx(c.get_ridx(2))
        self.assertEqual(chunk_number, 1)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_chunking(self):
    """Check which chunk the second document is indexed in.

    Two documents with the text ``u'12345'`` are added to a corpus created
    with ``chunk_size=13``; the index entry looked up for id 2 must report
    chunk number 1.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path, chunk_size=13)
    c = None
    try:
        c = Corpus(path)
        c.add(u'12345', 1)
        c.add(u'12345', 2)
        # The index entry is unpacked as a 4-tuple; only the chunk number
        # is asserted on, the remaining fields are intentionally unused.
        chunk_number, _offset, _head_len, _text_len = c.get_idx(c.get_ridx(2))
        self.assertEqual(chunk_number, 1)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_add_get(self):
    """Check that stored documents can be read back through a new handle.

    Three documents are added under ids 1-3 with identical extra headers,
    the indexes are saved, and a second ``Corpus`` object opened on the
    same path must return ``(headers, text)`` for each id, where the
    headers contain the extra keyword values plus the ``'id'``.
    """
    path = '/tmp/TEST_CORPUS'
    texts = {
        1: u'Gżegżółką jaźń',
        2: u'Chrząszcz brzmi w czcinie',
        3: u'Żółte źrebie',
    }
    Corpus.create(path)
    c = d = None
    try:
        c = Corpus(path)
        for doc_id in (1, 2, 3):
            c.add(texts[doc_id], doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.save_indexes()

        d = Corpus(path)
        # Read back in a different order than the documents were inserted.
        for doc_id in (3, 1, 2):
            expected_headers = {
                'p1': 1,
                'p2': "2",
                'p3': [1, 2, 3, u'ą'],
                'id': doc_id,
            }
            self.assertEqual(d.get(doc_id), (expected_headers, texts[doc_id]))
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c, d
        shutil.rmtree(path)
def test_len(self):
    """Check that ``len(corpus)`` equals the number of added documents.

    Three documents are added under ids 1-3 and the corpus length must
    then be 3.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        self.assertEqual(len(c), 3)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_add_get(self):
    """Check that stored documents can be read back through a new handle.

    Three documents are added under ids 1-3 with identical extra headers,
    the indexes are saved, and a second ``Corpus`` object opened on the
    same path must return ``(headers, text)`` for each id, where the
    headers contain the extra keyword values plus the ``'id'``.
    """
    path = '/tmp/TEST_CORPUS'
    texts = {
        1: u'Gżegżółką jaźń',
        2: u'Chrząszcz brzmi w czcinie',
        3: u'Żółte źrebie',
    }
    Corpus.create(path)
    c = d = None
    try:
        c = Corpus(path)
        for doc_id in (1, 2, 3):
            c.add(texts[doc_id], doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.save_indexes()

        d = Corpus(path)
        # Read back in a different order than the documents were inserted.
        for doc_id in (3, 1, 2):
            expected_headers = {
                'p1': 1,
                'p2': "2",
                'p3': [1, 2, 3, u'ą'],
                'id': doc_id,
            }
            self.assertEqual(d.get(doc_id), (expected_headers, texts[doc_id]))
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c, d
        shutil.rmtree(path)
def test_len(self):
    """Check that ``len(corpus)`` equals the number of added documents.

    Three documents are added under ids 1-3 and the corpus length must
    then be 3.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        self.assertEqual(len(c), 3)
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c
        shutil.rmtree(path)
def test_iter(self):
    """Check the order in which iterating a corpus yields documents.

    Documents are added under ids 3, 1, 2 (in that order), the indexes are
    saved, and iterating a second ``Corpus`` object opened on the same
    path must yield items whose header ids come out as ``[3, 1, 2]``.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = d = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.save_indexes()

        d = Corpus(path)
        # Each iterated item is indexed as item[0]['id'], i.e. the headers
        # come first in the yielded pair.
        seen_ids = [item[0]['id'] for item in d]
        self.assertEqual(seen_ids, [3, 1, 2])
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c, d
        shutil.rmtree(path)
def test_iter(self):
    """Check the order in which iterating a corpus yields documents.

    Documents are added under ids 3, 1, 2 (in that order), the indexes are
    saved, and iterating a second ``Corpus`` object opened on the same
    path must yield items whose header ids come out as ``[3, 1, 2]``.
    """
    path = '/tmp/TEST_CORPUS'
    Corpus.create(path)
    c = d = None
    try:
        c = Corpus(path)
        c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
        c.save_indexes()

        d = Corpus(path)
        # Each iterated item is indexed as item[0]['id'], i.e. the headers
        # come first in the yielded pair.
        seen_ids = [item[0]['id'] for item in d]
        self.assertEqual(seen_ids, [3, 1, 2])
    finally:
        # Runs on failure too, so a failed assertion does not leave the
        # fixture directory behind for the other tests sharing this path.
        del c, d
        shutil.rmtree(path)
""" """ from corpora import Corpus from nltk.corpus import PlaintextCorpusReader import csv corpus_path = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/test' Corpus.create(corpus_path) corpus = Corpus(corpus_path) training_file_path = "/home/mayank/IdeaProjects/Lab_Machine_Learning/src/resources/TrainingData.csv" reader = csv.reader(open(training_file_path, 'r')) for (i, row) in enumerate(reader, 1): print i corpus.add(row[6].decode('utf-8'), i) if i == 10: break print len(corpus) print corpus.get()