示例#1
0
 def test_016(self):
     """ Document document setter - valid text file """
     document = Document()
     document.document = "test.txt"
     try:
         self.assertEqual(document.name, "test")
         self.assertEqual(len(document), 1)
         self.assertEqual(document.text, ["foo"])
     finally:
         # Clean up even when an assertion fails so a stale page file
         # cannot influence later tests.
         os.remove("test1.txt")
         # NOTE(review): other tests that load test.txt also delete a
         # test1.json; remove it here when present so it is not left behind.
         if os.path.isfile("test1.json"):
             os.remove("test1.json")
示例#2
0
 def test_023(self):
     """ Document text setter """
     document = Document("files/4page.pdf", "./")
     # Overwrite the first entry of the text list; the result is not verified yet.
     document.text[0] = "goo"
     # TODO: the assertion below is still disabled
     #self.assertEqual(document.text[0], "goo")
     # Delete the .pdf, .txt and .json file of each of the four pages.
     for page_no in range(1, 5):
         stem = "4page" + str(page_no)
         for suffix in (".pdf", ".txt", ".json"):
             os.remove(stem + suffix)
示例#3
0
 def test_064(self):
     """ config stem is valid """
     # Construct the document once per stem setting; the test passes as long
     # as none of the constructor calls raises.
     for stem in ('gap', 'porter', 'snowball', 'lancaster', 'lemma'):
         document = Document("files/4page.pdf", "./", config=['stem=' + stem])
     # Delete the .txt, .pdf and .json file of each of the four pages.
     for page_no in range(1, 5):
         stem_name = "4page" + str(page_no)
         for suffix in (".txt", ".pdf", ".json"):
             os.remove(stem_name + suffix)
示例#4
0
 def test_018(self):
     """ Document document setter - valid PDF file with page directory """
     document = Document()
     document.dir = "./"
     document.document = "files/4page.pdf"
     try:
         self.assertEqual(document.name, "4page")
         self.assertEqual(len(document), 4)
         # Every page must have its .pdf, .txt and .json file in the page directory.
         for i in range(1, 5):
             self.assertTrue(os.path.isfile("4page" + str(i) + ".pdf"))
             self.assertTrue(os.path.isfile("4page" + str(i) + ".txt"))
             self.assertTrue(os.path.isfile("4page" + str(i) + ".json"))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".json")
示例#5
0
 def test_074(self):
     """ Document - lang type, page 1 """
     # One sample text file per language code; the language reported for each
     # document is compared against the code embedded in its file name.
     for lang in ['en', 'es', 'fr', 'de', 'it']:
         document = Document("files/lang-" + lang + ".txt", "./")
         try:
             # assertEqual replaces the deprecated assertEquals alias.
             self.assertEqual(document.lang, lang)
         finally:
             # Remove this language's page files even if the assertion fails.
             os.remove("lang-" + lang + "1.txt")
             os.remove("lang-" + lang + "1.json")
示例#6
0
 def test_008(self):
     """ Document constructor - keyword argument: document """
     document = Document(document="test.txt")
     try:
         self.assertEqual(document.name, "test")
         self.assertEqual(len(document), 1)
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove("test1.txt")
         os.remove("test1.json")
示例#7
0
 def test_011(self):
     """ Document constructor - store single page text file for raw text document """
     document = Document("test.txt", "./")
     try:
         self.assertEqual(document.name, "test")
         self.assertTrue(os.path.isfile("test1.txt"))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove("test1.txt")
         os.remove("test1.json")
示例#8
0
 def test_026(self):
     """ Document [] setter """
     document = Document("test.txt")
     try:
         # Replace the first page and read it back through the [] getter.
         page = Page(text='hello world')
         document[0] = page
         self.assertEqual(document[0].text, "hello world")
     finally:
         # Clean up even when the assertion fails so a stale page file
         # cannot influence later tests.
         os.remove("test1.txt")
         # NOTE(review): other tests that load test.txt also delete a
         # test1.json; remove it here when present so it is not left behind.
         if os.path.isfile("test1.json"):
             os.remove("test1.json")
示例#9
0
 def test_067(self):
     """ config segment image """
     document = Document('files/text.png', './', config=['segment'])
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(len(document[0].words), 7)
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         os.remove('text1.png')
         os.remove('text1.txt')
         os.remove('text1.json')
示例#10
0
 def test_066(self):
     """ config segment pdf """
     document = Document('files/invoice.pdf', './', config=['segment'])
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(len(document[0].words), 15)
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         os.remove('invoice1.pdf')
         os.remove('invoice1.txt')
         os.remove('invoice1.json')
示例#11
0
 def test_028(self):
     """ Document [] setter - not an int index """
     document = Document("test.txt")
     try:
         page = Page(text='hello world')
         # A string index must be rejected with TypeError.
         with pytest.raises(TypeError):
             document['abc'] = page
     finally:
         # Clean up even when the expectation fails so a stale page file
         # cannot influence later tests.
         os.remove("test1.txt")
         # NOTE(review): other tests that load test.txt also delete a
         # test1.json; remove it here when present so it is not left behind.
         if os.path.isfile("test1.json"):
             os.remove("test1.json")
示例#12
0
 def test_053(self):
     """ page.path for .txt file """
     document = Document("test.txt")
     try:
         self.assertTrue(os.path.isfile("test1.json"))
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(document[0].path, "./test1.txt")
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove("test1.txt")
         os.remove("test1.json")
示例#13
0
 def test_012(self):
     """ Document constructor - non-ascii characters in document (UTF-8 encoding) """
     document = Document("files/7page.pdf", "./")
     try:
         # Only the first seven non-blank characters of page 1 are checked.
         self.assertEqual(document[0].text.strip()[0:7], "MEDICAL")
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 8):
             os.remove("7page" + str(i) + ".pdf")
             os.remove("7page" + str(i) + ".txt")
             os.remove("7page" + str(i) + ".json")
示例#14
0
 def test_037(self):
     """ Document type getter - PDF """
     document = Document("files/4page.pdf", "./")
     try:
         self.assertEqual(document.type, "pdf")
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".json")
示例#15
0
 def test_035(self):
     """ Document size getter - non-zero """
     document = Document("files/4page.pdf", "./")
     try:
         # NOTE(review): 32667 is presumably the byte size of files/4page.pdf;
         # it must be updated if that fixture changes.
         self.assertEqual(document.size, 32667)
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".json")
示例#16
0
 def test_058(self):
     """ config is empty """
     document = Document("files/4page.pdf", "./", config=[])
     try:
         # Identity check against None instead of the `!= None` comparison.
         self.assertIsNotNone(document.bagOfWords)
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".json")
示例#17
0
 def test_051(self):
     """ async processing """
     # self.done is passed as the third constructor argument; the test then
     # waits a fixed six seconds before checking the self.isdone flag.
     doc = Document("files/invoice.pdf", "./", self.done)
     time.sleep(6)
     self.assertTrue(self.isdone)
     # Delete the single page's .pdf, .txt and .json files.
     for suffix in (".pdf", ".txt", ".json"):
         os.remove("invoice1" + suffix)
示例#18
0
 def test_065(self):
     """ config segment txt """
     document = Document('files/segment_para.txt', './', config=['segment'])
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(document[0].size, 91)
         self.assertEqual(document[0].text, 'This is a first paragraph\nand continues to next line.\n\nThen this is the second\nparagraph.')
         # The two blank-line separated paragraphs yield two entries in words.
         self.assertEqual(len(document[0].words), 2)
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove('segment_para1.txt')
         os.remove('segment_para1.json')
示例#19
0
 def test_025(self):
     """ Document [] getter - index out of range """
     document = Document("files/4page.pdf", "./")
     try:
         # Index 4 is one past the last of the four pages; None is expected.
         self.assertIsNone(document[4])
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".json")
示例#20
0
 def test_059(self):
     """ config has multiple entries """
     document = Document("files/4page.pdf", "./", config=['bare', 'pos', 'roman'])
     try:
         # Identity check against None instead of the `!= None` comparison.
         self.assertIsNotNone(document.bagOfWords)
     finally:
         # Clean up even when the assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".json")
示例#21
0
 def test_077(self):
     """ Document - French Scanned PDF """
     document = Document("files/french-scan.pdf", "./")
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(document.lang, 'fr')
         # NOTE(review): the docstring says "Scanned" yet the expected value
         # is (False, 0) - confirm this expectation is intended.
         self.assertEqual(document.scanned, (False, 0))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 9):
             os.remove("french-scan" + str(i) + ".pdf")
             os.remove("french-scan" + str(i) + ".txt")
             os.remove("french-scan" + str(i) + ".json")
示例#22
0
 def test_076(self):
     """ Document - French PDF """
     document = Document("files/french.pdf", "./")
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(document.lang, 'fr')
         self.assertEqual(len(document), 2)
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 3):
             os.remove("french" + str(i) + ".pdf")
             os.remove("french" + str(i) + ".txt")
             os.remove("french" + str(i) + ".json")
示例#23
0
 def test_075(self):
     """ Document - Spanish PDF """
     document = Document("files/spanish.pdf", "./")
     try:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(document.lang, 'es')
         self.assertEqual(len(document), 2)
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 3):
             os.remove("spanish" + str(i) + ".pdf")
             os.remove("spanish" + str(i) + ".txt")
             os.remove("spanish" + str(i) + ".json")
示例#24
0
 def test_013(self):
     """ Document constructor - create page directory """
     document = Document("files/4page.pdf", "tests2")
     try:
         # The page directory named in the second argument must now exist.
         self.assertTrue(os.path.isdir("tests2"))
     finally:
         # Clean up even when the assertion fails: empty the page directory
         # first, then remove the directory itself.
         for i in range(1, 5):
             os.remove("tests2/4page" + str(i) + ".pdf")
             os.remove("tests2/4page" + str(i) + ".txt")
             os.remove("tests2/4page" + str(i) + ".json")
         os.removedirs("tests2")
示例#25
0
 def test_054(self):
     """ bag of words / freqDist """
     document = Document("files/4page.pdf", "./")
     try:
         # Identity checks against None instead of `!= None` comparisons.
         self.assertIsNotNone(document.bagOfWords)
         self.assertIsNotNone(document.freqDist)
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".json")
示例#26
0
 def test_071(self):
     """ config - spell checker - norvig """
     # Input text made of misspelled words.
     with open("spell.txt", "w") as f:
         f.write("mispell speling similiar")
     # Remember the class-level setting so it can be restored afterwards;
     # otherwise the override would leak into every test that runs later.
     missing = object()
     previous = getattr(Document, 'WORDDICT', missing)
     Document.WORDDICT = 'norvig'
     try:
         document = Document('spell.txt')
         # The test only checks that the first page can be accessed without
         # raising; the value itself is intentionally not inspected.
         page = document[0]
     finally:
         if previous is missing:
             del Document.WORDDICT
         else:
             Document.WORDDICT = previous
         # Remove the input file and the generated page files.
         os.remove('spell.txt')
         os.remove('spell1.txt')
         os.remove('spell1.json')
示例#27
0
 def test_041(self):
     """ Document - invoice PDF """
     document = Document("files/invoice.pdf", "./")
     try:
         self.assertEqual(len(document), 1)
         # The single page must have its .txt, .pdf and .json file.
         self.assertTrue(os.path.isfile("invoice1.txt"))
         self.assertTrue(os.path.isfile("invoice1.pdf"))
         self.assertTrue(os.path.isfile("invoice1.json"))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove("invoice1.txt")
         os.remove("invoice1.pdf")
         os.remove("invoice1.json")
示例#28
0
 def test_040(self):
     """ Document - color PDF with overlay """
     document = Document("files/5page.pdf", "./")
     try:
         self.assertEqual(len(document), 5)
         # Each of the five pages must have a .txt and a .json file.
         for i in range(1, 6):
             self.assertTrue(os.path.isfile("5page" + str(i) + ".txt"))
             self.assertTrue(os.path.isfile("5page" + str(i) + ".json"))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 6):
             os.remove("5page" + str(i) + ".txt")
             os.remove("5page" + str(i) + ".pdf")
             os.remove("5page" + str(i) + ".json")
示例#29
0
 def test_022(self):
     """ Document text getter - PDF file """
     document = Document("files/4page.pdf", "./")
     try:
         # Compare only the leading characters of each page's stripped text.
         self.assertEqual(document.text[0].strip()[0:6], "TIER 1")
         self.assertEqual(document.text[1].strip()[0:15], "COVERED MEDICAL")
         self.assertEqual(document.text[2].strip()[0:14], "Emergency mean")
         self.assertEqual(document.text[3].strip()[0:15], "Maximum Benefit")
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         for i in range(1, 5):
             os.remove("4page" + str(i) + ".pdf")
             os.remove("4page" + str(i) + ".txt")
             os.remove("4page" + str(i) + ".json")
示例#30
0
 def test_005(self):
     """ Document Constructor - document = valid text document """
     document = Document("test.txt")
     try:
         self.assertEqual(document.document, "test.txt")
         self.assertEqual(document.name, "test")
         self.assertEqual(len(document), 1)
         self.assertEqual(document.text, ["foo"])
         # The single page must have been stored as test1.txt and test1.json.
         self.assertTrue(os.path.isfile("test1.txt"))
         self.assertTrue(os.path.isfile("test1.json"))
     finally:
         # Clean up even when an assertion fails so stale page files
         # cannot influence later tests.
         os.remove("test1.txt")
         os.remove("test1.json")