示例#1
0
 def test_add_too_big(self):
     """Expect ``Corpus.ExceptionTooBig`` from ``add`` on a small-chunk corpus.

     The corpus is created with ``chunk_size=12`` and the test asserts that
     ``c.add(u'12345', 1)`` raises ``Corpus.ExceptionTooBig``.
     """
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=12)
     try:
         c = Corpus(path)
         with self.assertRaises(Corpus.ExceptionTooBig):
             c.add(u'12345', 1)
         # Release the corpus object before its directory is removed.
         del c
     finally:
         # The path is fixed and reused by the other tests, so remove it
         # even when the assertion above fails.
         shutil.rmtree(path)
示例#2
0
 def test_add_too_big(self):
     """Check that ``add`` raises ``Corpus.ExceptionTooBig``.

     Uses a corpus created with ``chunk_size=12`` and adds ``u'12345'``
     under id 1; the call must raise ``Corpus.ExceptionTooBig``.
     """
     corpus_dir = '/tmp/TEST_CORPUS'
     Corpus.create(corpus_dir, chunk_size=12)
     try:
         c = Corpus(corpus_dir)
         with self.assertRaises(Corpus.ExceptionTooBig):
             c.add(u'12345', 1)
         # Drop the corpus reference before deleting its directory.
         del c
     finally:
         # Clean up unconditionally: a failed assertion must not leave the
         # shared test directory behind.
         shutil.rmtree(corpus_dir)
示例#3
0
 def test_add_get_duplicate(self):
     """Expect ``Corpus.ExceptionDuplicate`` when the same id is added twice.

     The first ``add`` with id 1 is performed outside the ``assertRaises``
     context so that only the second, repeated ``add`` can satisfy the
     assertion.
     """
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     try:
         c = Corpus(path)
         # First insertion must succeed; if it raised, the test should
         # error out instead of passing by accident.
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         with self.assertRaises(Corpus.ExceptionDuplicate):
             c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         # Release the corpus object before its directory is removed.
         del c
     finally:
         # Remove the shared test directory even if an assertion failed.
         shutil.rmtree(path)
示例#4
0
 def test_add_get_duplicate(self):
     """Check that re-adding an existing id raises ``ExceptionDuplicate``.

     Only the second ``add`` is wrapped in ``assertRaises``; the initial
     ``add`` runs unguarded so an unexpected error there is reported as a
     test error rather than counted as the expected exception.
     """
     corpus_dir = '/tmp/TEST_CORPUS'
     Corpus.create(corpus_dir)
     try:
         c = Corpus(corpus_dir)
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         with self.assertRaises(Corpus.ExceptionDuplicate):
             # Same text, same id, same properties as the call above.
             c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         # Drop the corpus reference before deleting its directory.
         del c
     finally:
         # Clean up unconditionally so later tests start from a clean path.
         shutil.rmtree(corpus_dir)
示例#5
0
 def test_chunking(self):
     """Check the chunk number recorded for the second added document.

     With ``chunk_size=13`` two texts ``u'12345'`` are added under ids 1
     and 2; the index entry looked up via ``get_idx(get_ridx(2))`` must
     report chunk number 1.
     """
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=13)
     try:
         c = Corpus(path)
         c.add(u'12345', 1)
         c.add(u'12345', 2)

         # Unpack all four fields so a change in the entry's arity is still
         # caught; only the chunk number is asserted on.
         chunk_number, _offset, _head_len, _text_len = c.get_idx(c.get_ridx(2))
         self.assertEqual(chunk_number, 1)

         # Release the corpus object before its directory is removed.
         del c
     finally:
         # Remove the shared test directory even if the assertion failed.
         shutil.rmtree(path)
示例#6
0
    def test_chunking(self):
        """Verify that the document with id 2 is indexed in chunk number 1.

        The corpus is created with ``chunk_size=13`` and receives two texts
        ``u'12345'`` (ids 1 and 2). The tuple returned by
        ``get_idx(get_ridx(2))`` is unpacked and its first field compared
        against 1.
        """
        corpus_dir = '/tmp/TEST_CORPUS'
        Corpus.create(corpus_dir, chunk_size=13)
        try:
            c = Corpus(corpus_dir)
            c.add(u'12345', 1)
            c.add(u'12345', 2)

            # Four-way unpacking is kept deliberately: it fails loudly if
            # the index entry stops being a 4-tuple.
            chunk_number, _offset, _head_len, _text_len = c.get_idx(
                c.get_ridx(2))
            self.assertEqual(chunk_number, 1)

            # Drop the corpus reference before deleting its directory.
            del c
        finally:
            # Clean up unconditionally so a failure does not leak the path.
            shutil.rmtree(corpus_dir)
示例#7
0
 def test_add_get(self):
     """Round-trip three documents through ``add``, ``save_indexes`` and ``get``.

     Documents with ids 1, 2 and 3 are added with properties ``p1``, ``p2``
     and ``p3``; after ``save_indexes()`` a second ``Corpus`` instance is
     opened on the same path and ``get(id)`` must return a
     ``(properties-with-id, text)`` tuple for each of them.
     """
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     try:
         c = Corpus(path)
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 2,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.save_indexes()

         d = Corpus(path)
         # Looked up out of insertion order (3, 1, 2), as before.
         expected = [
             (3, u'Żółte źrebie'),
             (1, u'Gżegżółką jaźń'),
             (2, u'Chrząszcz brzmi w czcinie'),
         ]
         for doc_id, text in expected:
             header = {'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'], 'id': doc_id}
             self.assertEqual(d.get(doc_id), (header, text))

         # Release both corpus objects before the directory is removed.
         del c, d
     finally:
         # Remove the shared test directory even if an assertion failed.
         shutil.rmtree(path)
示例#8
0
 def test_len(self):
     """Check that ``len(corpus)`` is 3 after three ``add`` calls."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     try:
         c = Corpus(path)
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 2,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         self.assertEqual(len(c), 3)
         # Release the corpus object before its directory is removed.
         del c
     finally:
         # Remove the shared test directory even if the assertion failed.
         shutil.rmtree(path)
示例#9
0
 def test_add_get(self):
     """Verify that added documents can be read back from a reopened corpus.

     Three documents (ids 1, 2, 3) are added with properties ``p1``, ``p2``
     and ``p3`` and the indexes are saved. A second ``Corpus`` opened on
     the same path must return, for each id, a tuple of the properties
     dict (including ``'id'``) and the original text.
     """
     corpus_dir = '/tmp/TEST_CORPUS'
     Corpus.create(corpus_dir)
     try:
         c = Corpus(corpus_dir)
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 2,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.save_indexes()

         d = Corpus(corpus_dir)
         # Same lookup order as before: ids 3, 1, 2.
         for doc_id, text in (
                 (3, u'Żółte źrebie'),
                 (1, u'Gżegżółką jaźń'),
                 (2, u'Chrząszcz brzmi w czcinie')):
             properties = {
                 'p1': 1,
                 'p2': "2",
                 'p3': [1, 2, 3, u'ą'],
                 'id': doc_id,
             }
             self.assertEqual(d.get(doc_id), (properties, text))

         # Drop both corpus references before deleting the directory.
         del c, d
     finally:
         # Clean up unconditionally so a failure does not leak the path.
         shutil.rmtree(corpus_dir)
示例#10
0
 def test_len(self):
     """Verify ``len()`` of a corpus after adding three documents.

     Three texts are added under ids 1, 2 and 3 and ``len(c)`` is asserted
     to equal 3.
     """
     corpus_dir = '/tmp/TEST_CORPUS'
     Corpus.create(corpus_dir)
     try:
         c = Corpus(corpus_dir)
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 2,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         self.assertEqual(len(c), 3)
         # Drop the corpus reference before deleting its directory.
         del c
     finally:
         # Clean up unconditionally so a failure does not leak the path.
         shutil.rmtree(corpus_dir)
示例#11
0
 def test_iter(self):
     """Check the order of ids produced by iterating a reopened corpus.

     Documents are added with ids 3, 1, 2 (in that order) and the indexes
     are saved. Iterating a second ``Corpus`` opened on the same path must
     yield items whose first element carries ``'id'`` values 3, 1, 2.
     """
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     try:
         c = Corpus(path)
         c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 1,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.save_indexes()

         d = Corpus(path)
         # Each iterated item is indexed as item[0]['id'].
         ids = [item[0]['id'] for item in d]
         self.assertEqual(ids, [3, 1, 2])

         # Release both corpus objects before the directory is removed.
         del c, d
     finally:
         # Remove the shared test directory even if the assertion failed.
         shutil.rmtree(path)
示例#12
0
 def test_iter(self):
     """Verify that iteration yields ids in the order they were added.

     Three documents are added under ids 3, 1 and 2, the indexes are saved,
     and a fresh ``Corpus`` on the same path is iterated. The ``'id'``
     values taken from the first element of each item must be
     ``[3, 1, 2]``.
     """
     corpus_dir = '/tmp/TEST_CORPUS'
     Corpus.create(corpus_dir)
     try:
         c = Corpus(corpus_dir)
         c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Chrząszcz brzmi w czcinie', 1,
               p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
         c.save_indexes()

         d = Corpus(corpus_dir)
         seen_ids = [entry[0]['id'] for entry in d]
         self.assertEqual(seen_ids, [3, 1, 2])

         # Drop both corpus references before deleting the directory.
         del c, d
     finally:
         # Clean up unconditionally so a failure does not leak the path.
         shutil.rmtree(corpus_dir)
"""

"""

from corpora import Corpus
from nltk.corpus import PlaintextCorpusReader
import csv

corpus_path = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/test'
Corpus.create(corpus_path)
corpus = Corpus(corpus_path)

training_file_path = "/home/mayank/IdeaProjects/Lab_Machine_Learning/src/resources/TrainingData.csv"
reader = csv.reader(open(training_file_path, 'r'))

for (i, row) in enumerate(reader, 1):
    print i
    corpus.add(row[6].decode('utf-8'), i)
    if i == 10: break

print len(corpus)
print corpus.get()