Пример #1
0
def uctoTokenizer(input_text: str, output_text: str):
    """
    Tokenize a text file, or every file below a folder, with ucto and
    write the result as FoLiA.

    When ``input_text`` is a directory it is searched recursively for
    files whose name contains a dot. Each one is tokenized into
    ``output_text`` (treated as a directory and created if missing)
    under its own base name, with a trailing ``.txt`` replaced by
    ``.folia.xml``. The output is flat: files with the same base name
    in different sub-folders map to the same output file.

    :param input_text: path to a text file or to a folder of text files
    :param output_text: name for the folia file, or the output folder
        when ``input_text`` is a folder

    """
    import os

    from ucto import Tokenizer

    configurationfile = "../ucto_config/tokconfig_eng_ch"

    tokenizer = Tokenizer(configurationfile, foliaoutput=True)

    if not Path(input_text).is_dir():
        tokenizer.tokenize(input_text, output_text)
        return

    # "**" is only recursive when it is a whole path component, so the
    # pattern must be joined with a real separator; plain concatenation
    # ("dir" + "**/*.*") silently degrades to "dir*/*.*".
    pattern = os.path.join(input_text, "**", "*.*")
    # Directories can match "*.*" too (e.g. "v1.0"); keep regular files only.
    files = [f for f in glob.glob(pattern, recursive=True) if os.path.isfile(f)]

    out_dir = Path(output_text)
    out_dir.mkdir(parents=True, exist_ok=True)

    txt_suffix = ".txt"
    pbar = ProgressBar()
    for f in pbar(files):
        name = os.path.basename(f)
        # Swap only a trailing ".txt"; never touch the output directory
        # name or a ".txt" in the middle of the file name.
        if name.endswith(txt_suffix):
            name = name[: -len(txt_suffix)] + ".folia.xml"
        tokenizer.tokenize(f, str(out_dir / name))
Пример #2
0
import csv
import re
import sys
from lxml import etree
from ucto import Tokenizer

# Read the XML file named by the first command-line argument, pull the motif
# codes out of each record's summary, and write "<input>.csv" with one row
# per record that has motifs: id, space-joined motifs, space-joined tokens.
tokenizer = Tokenizer('-L nl -n -Q')
data = etree.parse(sys.argv[1])

# Compiled once, outside the per-record loop. Raw strings keep the
# backslashes from being read as (invalid) string escape sequences.
motif_list_re = re.compile(r'\[.*?\]')     # a bracketed "[...]" group
motif_re = re.compile(r'[^\s\[\],]+')      # one item inside such a group
digit_re = re.compile(r'[0-9]')            # only items with a digit count

# NOTE(review): on Python 3 the csv docs recommend open(..., newline='');
# left as-is here because Python 2's built-in open() has no such argument.
with open(sys.argv[1] + ".csv", 'w') as output:
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    for row in data.getroot():
        # Each record must have exactly four children; the first is unused.
        _, idnumber, text, summary = row.iterchildren()
        if summary.text is None:
            continue
        motifs = set()
        for motiflist in motif_list_re.findall(summary.text):
            for motif in motif_re.findall(motiflist):
                if digit_re.search(motif):
                    motifs.add(motif)
        if not motifs:
            continue
        # Function-call form prints the same on Python 2 and Python 3.
        print(idnumber.text)
        tokens = tokenizer.tokenize(text.text, verbose=False)
        # Sort the motifs so the CSV is reproducible between runs
        # (iteration order of a set is arbitrary).
        writer.writerow([idnumber.text, ' '.join(sorted(motifs)), ' '.join(tokens)])