예제 #1
0
def uctoTokenizer(
    input_text: str,
    output_text: str,
    configurationfile: str = "../ucto_config/tokconfig_eng_ch",
) -> None:
    """
    A function to create a folia file from a text file, or one folia file
    per input file when ``input_text`` is a directory.

    :param input_text: path to a text file, or to a directory that is
        searched recursively for files matching ``*.*``
    :param output_text: name for the folia file; when ``input_text`` is a
        directory, the directory the folia files are written to (created,
        including missing parents, if it does not exist)
    :param configurationfile: ucto configuration handed to ``Tokenizer``;
        the default is a path relative to the current working directory

    """
    # Function-local import: ucto is only needed once this function is called.
    from ucto import Tokenizer

    tokenizer = Tokenizer(configurationfile, foliaoutput=True)

    source = Path(input_text)
    if not source.is_dir():
        tokenizer.tokenize(input_text, output_text)
        return

    # Build the pattern through Path so it is always "<dir>/**/*.*".  Plain
    # string concatenation produced "<dir>**/*.*" when input_text had no
    # trailing separator, which is not recursive and also matches sibling
    # directories that merely share the prefix.
    pattern = str(source / "**" / "*.*")
    # "*.*" also matches directories with a dot in their name; skip those.
    files = [f for f in glob.glob(pattern, recursive=True) if Path(f).is_file()]

    out_dir = Path(output_text)
    out_dir.mkdir(parents=True, exist_ok=True)

    pbar = ProgressBar()
    for f in pbar(files):
        # NOTE: only the base name is kept, so files with the same name in
        # different sub-directories write to the same output file.
        name = Path(f).name
        # Swap only a trailing ".txt"; other names are kept unchanged.  The
        # replacement is done on the file name, not on the whole output path,
        # so a ".txt" inside a directory name is left alone.
        if name.endswith(".txt"):
            name = name[: -len(".txt")] + ".folia.xml"
        tokenizer.tokenize(f, str(out_dir / name))
예제 #2
0
import csv
import re
import sys
from lxml import etree
from ucto import Tokenizer

# Reads the XML file named by sys.argv[1] and writes "<that name>.csv" with
# one row (id, motifs, tokenized text) for every element whose summary
# contains at least one bracketed motif with a digit in it.
tokenizer = Tokenizer('-L nl -n -Q')
data = etree.parse(sys.argv[1])

# Raw strings avoid the invalid "\[" / "\s" string escapes, and compiling
# once keeps the patterns from being looked up again for every row.
motiflist_re = re.compile(r'\[.*?\]')  # one bracketed group, non-greedy
motif_re = re.compile(r'[^\s\[\],]+')  # run without whitespace, brackets, commas
digit_re = re.compile(r'[0-9]')

with open(sys.argv[1] + ".csv", 'w') as output:
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    for row in data.getroot():
        # Each row must have exactly four children; the first one is unused.
        _, idnumber, text, summary = row.iterchildren()
        if summary.text is None:
            continue
        motifs = set()
        for motiflist in motiflist_re.findall(summary.text):
            # Keep only the entries that contain a digit.
            motifs.update(
                motif for motif in motif_re.findall(motiflist)
                if digit_re.search(motif))
        if not motifs:
            continue
        # Call form of print: valid on Python 3 and, with a single argument,
        # identical to the print statement on Python 2.
        print(idnumber.text)
        tokens = tokenizer.tokenize(text.text, verbose=False)
        # Sorted so the motif column is reproducible between runs (set
        # iteration order is arbitrary).
        writer.writerow([idnumber.text, ' '.join(sorted(motifs)), ' '.join(tokens)])
예제 #3
0
import csv
import re
import sys
from lxml import etree
from ucto import Tokenizer

# Reads the XML file named by sys.argv[1] and writes "<that name>.csv" with
# one row (id, motifs, tokenized text) for every element whose summary
# contains at least one bracketed motif with a digit in it.
tokenizer = Tokenizer("-L nl -n -Q")
data = etree.parse(sys.argv[1])

# Raw strings avoid the invalid "\[" / "\s" string escapes, and compiling
# once keeps the patterns from being looked up again for every row.
motiflist_re = re.compile(r"\[.*?\]")  # one bracketed group, non-greedy
motif_re = re.compile(r"[^\s\[\],]+")  # run without whitespace, brackets, commas
digit_re = re.compile(r"[0-9]")

with open(sys.argv[1] + ".csv", "w") as output:
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    for row in data.getroot():
        # Each row must have exactly four children; the first one is unused.
        _, idnumber, text, summary = row.iterchildren()
        if summary.text is None:
            continue
        motifs = set()
        for motiflist in motiflist_re.findall(summary.text):
            # Keep only the entries that contain a digit.
            motifs.update(
                motif
                for motif in motif_re.findall(motiflist)
                if digit_re.search(motif)
            )
        if not motifs:
            continue
        # Call form of print: valid on Python 3 and, with a single argument,
        # identical to the print statement on Python 2.
        print(idnumber.text)
        tokens = tokenizer.tokenize(text.text, verbose=False)
        # Sorted so the motif column is reproducible between runs (set
        # iteration order is arbitrary).
        writer.writerow([idnumber.text, " ".join(sorted(motifs)), " ".join(tokens)])
예제 #4
0
    # Check whether trainingfile or instancebase is an existing file and
    # add it to the configuration. If no file is given, we stick to the
    # default file that comes with a particular classifier.

    if args.trainingfile:
        if not os.path.isfile(args.trainingfile):
            raise IOError('Trainingfile not found')
        settings['f'] = args.trainingfile
        del settings['i']
    elif args.instancebase:
        if not os.path.isfile(args.instancebase):
            raise IOError('Instancebase not found')
        settings['i'] = args.instancebase

    tokenizer = Tokenizer('-L nl -n -Q')
    with classifier(config.HOST, config.PORT, settings) as program:
        args.output.write(codecs.BOM_UTF8)
        for i, line in enumerate(codecs.open(args.testfile, encoding=config.ENCODING)):
            words = tokenizer.tokenize(line.strip(), tokens = lambda s: s.split())
            output = []
            for word in word:
                results = classifier.classify(word)
                output.append(classifier.pprint_results(results))
            for word, result in zip(words, output):
                args.output.write(
                    u'{0}\t{1}\n'.format(word, result).encode(config.ENCODING))

            if (i+1) % 25 == 0:
                sys.stderr.write(
                    'Processed: {0} words @ {1}\n'.format(i, time.ctime()))