"""Tests for cltk.tokenize.line.LineTokenizer."""

import unittest

from cltk.tokenize.line import LineTokenizer


class TestLineTokenizer(unittest.TestCase):  # class name and unittest scaffolding assumed; the original snippet showed only the methods
    """Test LineTokenizer."""

    def test_line_tokenizer_include_blanks(self):
        """Test LineTokenizer with include_blanks=True on Latin verse."""
        text = """48. Cum tibi contigerit studio cognoscere multa,\nFac discas multa, vita nil discere velle.\n\n49. Miraris verbis nudis me scribere versus?\nHoc brevitas fecit, sensus coniungere binos."""  # pylint: disable=line-too-long
        target = ['48. Cum tibi contigerit studio cognoscere multa,',
                  'Fac discas multa, vita nil discere velle.',
                  '',
                  '49. Miraris verbis nudis me scribere versus?',
                  'Hoc brevitas fecit, sensus coniungere binos.']
        tokenizer = LineTokenizer('latin')
        tokenized_lines = tokenizer.tokenize(text, include_blanks=True)
        self.assertEqual(tokenized_lines, target)
    def test_french_line_tokenizer_include_blanks(self):
        """Test LineTokenizer with include_blanks=True on Old French verse."""
        text = """Ki de bone matire traite,\nmult li peise, se bien n’est faite.\nOëz, seignur, que dit Marie,\nki en sun tens pas ne s’oblie.\n\nLes contes que jo sai verais,\ndunt li Bretun unt fait les lais,\nvos conterai assez briefment."""  # pylint: disable=line-too-long
        target = ['Ki de bone matire traite,',
                  'mult li peise, se bien n’est faite.',
                  'Oëz, seignur, que dit Marie,',
                  'ki en sun tens pas ne s’oblie.',
                  '',
                  'Les contes que jo sai verais,',
                  'dunt li Bretun unt fait les lais,',
                  'vos conterai assez briefment.']
        tokenizer = LineTokenizer('french')
        tokenized_lines = tokenizer.tokenize(text, include_blanks=True)
        self.assertEqual(tokenized_lines, target)
    def test_line_tokenizer(self):
        """Test LineTokenizer on Latin verse."""
        text = """49. Miraris verbis nudis me scribere versus?\nHoc brevitas fecit, sensus coniungere binos."""
        target = ['49. Miraris verbis nudis me scribere versus?',
                  'Hoc brevitas fecit, sensus coniungere binos.']
        tokenizer = LineTokenizer('latin')
        tokenized_lines = tokenizer.tokenize(text)
        self.assertEqual(tokenized_lines, target)
    def test_french_line_tokenizer(self):
        """Test LineTokenizer on Old French verse."""
        text = """Ki de bone matire traite,\nmult li peise, se bien n’est faite.\nOëz, seignur, que dit Marie,\nki en sun tens pas ne s’oblie. """  # pylint: disable=line-too-long
        target = ['Ki de bone matire traite,',
                  'mult li peise, se bien n’est faite.',
                  'Oëz, seignur, que dit Marie,',
                  'ki en sun tens pas ne s’oblie. ']
        tokenizer = LineTokenizer('french')
        tokenized_lines = tokenizer.tokenize(text)
        self.assertEqual(tokenized_lines, target)
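
# Minimal entry point (a sketch, assuming the TestLineTokenizer wrapper
# above) so the suite can be run directly from the command line.
if __name__ == '__main__':
    unittest.main()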
# --- Script: line-tokenize Ovid's Metamorphoses ---
from os import listdir, path

from cltk.tokenize.line import LineTokenizer

# initialize tokenizer
tokenizer = LineTokenizer('latin')

# create list of lines
whole_met = []
list_of_files = sorted(
    [file for file in listdir('la') if path.isfile(path.join('la/', file))])

# iterate through files/books of Metamorphoses
for file in list_of_files:
    if file.startswith('ovid'):
        # get text from each file
        with open('la/' + file) as f:
            raw = f.read()
        # add line-tokenized text to the master list of lines
        whole_met += tokenizer.tokenize(raw)

# normalize tabs to spaces in every tokenized line
clean_met = [string.replace('\t', ' ') for string in whole_met]
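
# Quick sanity check (a sketch, not in the original script): print a few
# randomly sampled cleaned lines to eyeball the tokenization; assumes the
# clean_met list built above.
import random

for line in random.sample(clean_met, 5):
    print(line)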
# --- Earlier draft of the Metamorphoses script ---
import os
import random
from os import listdir  # needed: listdir is called unqualified below

from cltk.tokenize.line import LineTokenizer

# initialize tokenizer
tokenizer = LineTokenizer('latin')

# create list of lines
whole_met = []
list_of_files = [
    file for file in listdir('la')
    if os.path.isfile(os.path.join('la/', file))
]
print(list_of_files)

'''
Disabled draft of the tokenizing loop. Note the last statement is a bug:
lists have no .replace() method; the working script above fixes this with
a list comprehension over the tokenized lines.

# iterate through files/books of Metamorphoses
for file in [file for file in listdir('la') if os.path.isfile(os.path.join('la/', file))]:
    if file.startswith('ovid'):
        # get text from each file
        with open('la/' + file) as f:
            raw = f.read()
        # add line-tokenized text to the master list of lines
        whole_met += tokenizer.tokenize(raw)

whole_met.replace('\t', ' ')
'''

# test if there are any empty lines
def test_for_empty(lines):
    """Return True if any tokenized line is empty.

    (Body completed here; the original snippet was truncated after the
    def line.)
    """
    return any(line == '' for line in lines)
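
# Hypothetical usage (not in the original draft): once the loop above is
# re-enabled, flag any blank lines that slipped through tokenization.
if test_for_empty(whole_met):
    print('Warning: empty lines found in the tokenized text')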