def uctoTokenizer(input_text: str, output_text: str):
    """
    Tokenize a text file, or every file in a folder, into FoLiA XML with ucto.

    When ``input_text`` is a directory it is searched recursively for files
    whose name contains a dot. Each file found is tokenized into the
    ``output_text`` directory (created when missing), keeping its base name
    but with a trailing ``.txt`` replaced by ``.folia.xml``. The sub-folder
    structure is not reproduced: all results land directly in ``output_text``,
    so equally named files from different sub-folders overwrite each other.

    When ``input_text`` is not a directory it is tokenized straight into
    ``output_text``.

    :param input_text: path to a text file, or to a folder containing text files
    :param output_text: name for the folia file, or the output folder when
        ``input_text`` is a folder
    """
    # Kept function-local, as in the original: ucto is only imported on call.
    from ucto import Tokenizer

    # NOTE(review): relative path - presumably resolved against the current
    # working directory by ucto; confirm before calling from another folder.
    configurationfile = "../ucto_config/tokconfig_eng_ch"
    tokenizer = Tokenizer(configurationfile, foliaoutput=True)

    source = Path(input_text)
    if not source.is_dir():
        tokenizer.tokenize(input_text, output_text)
        return

    # Build "<dir>/**/*.*" with a real path separator: "**" is only recursive
    # when it is a complete path component, so plain string concatenation
    # breaks whenever input_text lacks a trailing slash. The directory part is
    # escaped so glob metacharacters in its name are taken literally.
    pattern = str(Path(glob.escape(input_text)) / "**" / "*.*")
    # Skip directories whose name happens to contain a dot; sort for a
    # reproducible processing order.
    files = sorted(
        f for f in glob.glob(pattern, recursive=True) if Path(f).is_file()
    )

    out_dir = Path(output_text)
    out_dir.mkdir(parents=True, exist_ok=True)

    pbar = ProgressBar()
    for f in pbar(files):
        # Only the file name is rewritten, never the output directory part.
        name = Path(f).name
        if name.endswith(".txt"):
            name = name[: -len(".txt")] + ".folia.xml"
        tokenizer.tokenize(f, str(out_dir / name))
import csv
import re
import sys

from lxml import etree
from ucto import Tokenizer

# A bracketed list inside a summary text (non-greedy, so one list per match).
_MOTIF_LIST_RE = re.compile(r'\[.*?\]')
# One entry of such a list: a run without whitespace, brackets or commas.
_MOTIF_ENTRY_RE = re.compile(r'[^\s\[\],]+')
_DIGIT_RE = re.compile(r'[0-9]')


def _extract_motifs(summary_text):
    """Return the set of bracketed-list entries that contain at least one digit."""
    motifs = set()
    for motiflist in _MOTIF_LIST_RE.findall(summary_text):
        for motif in _MOTIF_ENTRY_RE.findall(motiflist):
            if _DIGIT_RE.search(motif):
                motifs.add(motif)
    return motifs


# Script: read the XML file named by the first command-line argument and write
# "<that path>.csv" with one row per record that has motifs in its summary:
# id, space-separated motifs, space-separated tokens of the record text.
tokenizer = Tokenizer('-L nl -n -Q')
data = etree.parse(sys.argv[1])

# newline='' is what the csv module expects of the file object it writes to;
# an explicit encoding avoids depending on the platform's locale default.
with open(sys.argv[1] + ".csv", 'w', newline='', encoding='utf-8') as output:
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    for row in data.getroot():
        # Each record is unpacked as exactly four children; the first is unused.
        _, idnumber, text, summary = list(row.iterchildren())
        if summary.text is None:
            continue
        motifs = _extract_motifs(summary.text)
        # NOTE(review): the original layout was lost; this assumes tokenizing
        # and writing both happen only for records that have motifs - confirm.
        if motifs:
            print(idnumber.text)
            tokens = tokenizer.tokenize(text.text, verbose=False)
            # Sorted so the motif column is reproducible between runs
            # (set iteration order is arbitrary).
            writer.writerow(
                [idnumber.text, ' '.join(sorted(motifs)), ' '.join(tokens)]
            )