def run(self):
    print('Running Tokenizer...')

    with open(self.in_filtered().path, 'r', encoding='utf-8') as file_in:
        tweets = json.load(file_in)

    toktweets = []
    tokenizer = ucto.Tokenizer(self.config)

    for tweet in tweets:
        text = tweet['text']
        tokenizer.process(text)
        tokens = []
        for token in tokenizer:
            if not (self.strip_punctuation and token.tokentype == 'PUNCTUATION'):
                tokens.append(token.text)
        tokenized = ' '.join(tokens)
        if self.lowercase:
            tokenized = tokenized.lower()
        tweet['text'] = tokenized
        toktweets.append(tweet)

    # write to file
    with open(self.out_tokenized().path, 'w', encoding='utf-8') as file_out:
        json.dump(toktweets, file_out)
def vectorize(self, text, underscore=False, user=False, url=False):
    tokenizer = ucto.Tokenizer(
        '/vol/customopt/lamachine/etc/ucto/tokconfig-nl-twitter')
    text = text[2:-1]
    vector = []
    if underscore:
        text = '<s> ' + text + ' <s>'
    tokens = []
    tokenizer.process(text)
    for token in tokenizer:
        if not token.tokentype == 'PUNCTUATION':
            if user and token.text[0] == '@':
                tokens.append('USER')
            elif url and re.search('^http', token.text):  # match on the token's text, not the Token object
                tokens.append('URL')
            else:
                tokens.append(token.text)
    if underscore:
        ngrams = tokens + [
            '_'.join(x) for x in zip(tokens, tokens[1:])
        ] + [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
    else:
        ngrams = tokens + [
            ' '.join(x) for x in zip(tokens, tokens[1:])
        ] + [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
    in_vocabulary = [(x, float(ngrams.count(x)))
                     for x in list(set(ngrams) & set(self.keys))]
    #print('IN VOCABULARY', in_vocabulary)
    vector = [0.0] * self.vocabulary_length
    for ngram in in_vocabulary:
        vector[self.vocabulary[ngram[0]]] = ngram[1]
    return vector
def get_tokenized_data(data):
    settingsfile = "./tokconfig-eng"
    tokenizer = ucto.Tokenizer(settingsfile)
    tokenizer.process(data)
    tokenized_array = []
    for sentence in tokenizer.sentences():
        tokenized_array.append(sentence)
    return tokenized_array
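# Illustrative usage of get_tokenized_data above; a minimal sketch assuming
# python-ucto is installed and a "./tokconfig-eng" configuration file exists
# in the working directory. The sample text is made up for demonstration.
sentences = get_tokenized_data("Hello world. This is the second sentence.")
for s in sentences:
    print(s)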
def __init__(self):
    name = ''
    os.chdir('/scratch2/www/yawyt3/repo/youarewhatyoutweet/yawyt/main/classifiers/')
    os.mkdir('files/')
    self.tokenizer = ucto.Tokenizer('/vol/customopt/lamachine/etc/ucto/tokconfig-nl-twitter')
    self.f = 0
    self.tweets = []
import csv
import json
import tqdm
import ucto

tokenizer = ucto.Tokenizer("tokconfig-eng")

contractions = {
    "ain't", "aren't", "can't", "could've", "couldn't", "daren't", "daresn't",
    "dasn't", "didn't", "doesn't", "don't", "e'er", "everyone's", "finna",
    "gimme", "gonna", "gotta", "hadn't", "hasn't", "haven't", "he'd", "he'll",
    "he's", "he've", "how'd", "how'll", "how're", "how's", "I'd", "I'll",
    "I'm", "I'm'a", "I'm'o", "I've", "isn't", "it'd", "it'll", "it's",
    "let's", "ma'am", "mayn't", "may've", "mightn't", "might've", "mustn't",
    "mustn't've", "must've", "needn't", "ne'er", "o'clock", "o'er", "ol'",
    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
    "shouldn't", "somebody's", "someone's", "something's", "that'll",
    "that're", "that's", "that'd", "there'd", "there'll", "there're",
    "there's", "these're", "they'd", "they'll", "they're", "they've",
    "this's", "those're", "'tis", "'twas", "wasn't", "we'd", "we'd've",
    "we'll", "we're", "we've", "weren't", "what'd", "what'll", "what've",
    "when's", "where'd", "where're", "where's", "where've", "which's",
    "who'd", "who'd've", "who'll", "who're", "who's", "who've", "why'd",
    "why're", "why's", "won't", "would've", "wouldn't", "y'all", "y'all'd've",
    "yesn't", "you'd", "you'll", "you're", "you've", "noun's"
}


def resolve_contractions(tokens):
    new_tokens = []
    for i, token in enumerate(tokens):
from texts.models import Text
from collections import Counter
from . import number2word
from . import lexicons
import string
from . import language_detection as ld
import re

'''ucto is a tokenizer and part of the lamachine environment,
it can only be loaded if lamachine is active'''
try:
    import ucto
    tokenizer = ucto.Tokenizer("tokconfig-nld", lowercase=True, paragraphdetection=False)
except:
    print('could not load ucto, please activate lamachine?')
    tokenizer = ''

# object to map digits (e.g. 11) to written-out words (e.g. "elf", in Dutch or Frisian)
n2w = number2word.Number2word()
dutch, frisian = lexicons.make_dutch_and_frysian_lexicon()
# lexicon with Dutch and Frisian lemmas, used to decide whether to include a dash word, e.g. e-mail
lexicon = set(frisian.words + dutch.words)


class Cleaner:
    '''clean a text with ucto; numbers are mapped to written-out words'''

    def __init__(self, text, remove_number_compounds=True):
        '''text                     text to clean
        remove_number_compounds     whether 2013-2301 should be mapped to 2013 2301
        '''
        self.text = text
#!/usr/bin/env python3

import sys
import textgrid
import argparse
import ucto
import re

tokenizer = ucto.Tokenizer("tokconfig-nld")


def tokenize(text):
    # first we remove *d (dialectal or foreign), *u (other), *v (foreign language),
    # *a (broken-off words), *x (uncertain whether heard correctly)
    text = re.sub(r"\*[duvax]\b", "", text)
    tokenizer.process(text)
    return [str(token) for token in tokenizer]


def chunks(text):
    """Parse data into chunks, each entity and each non-entity is a chunk,
    text will be tokenised on the fly"""
    begin = None
    end = None
    cls = ""
    chunk = ""
    for i, c in enumerate(text.strip() + " "):
        if begin is not None:
            if c == '[':
                print("Skipping text because of nested brackets: " + text, file=sys.stderr)
                break
            elif c == ']':
import re
from tqdm import tqdm
import math
import toolz
import json
import ucto  # needed for ucto.Tokenizer below; missing from the original imports

DEFAULT_EOW = '__eow'
DEFAULT_SOW = '__sow'
DEFAULT_UNK = '__unk'
DEFAULT_PAD = '__pad'
DEFAULT_MASK = '__mask'
DEFAULT_CLS = '__cls'
DEFAULT_SEP = '__sep'

configurationFile = "tokconfig-nld"
tokenizer = ucto.Tokenizer(configurationFile)


def ucto_tokenize(sentence):
    tokenized_sentence = []
    tokenizer.process(sentence)
    for token in tokenizer:
        tokenized_sentence += [str(token)]
    ucto_tokenize.counter += 1
    print(int(ucto_tokenize.counter))
    return tokenized_sentence

ucto_tokenize.counter = 0
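# Illustrative only, not part of the original module: a minimal call of
# ucto_tokenize, assuming python-ucto and a "tokconfig-nld" configuration
# are installed. The Dutch sample sentence is made up.
tokens = ucto_tokenize("Dit is een korte testzin.")
print(tokens)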
import ucto

text = """To be or not to be, that's the question. This is a test to tokenise. We can span
multiple lines!!! The number 6 is Mr Li's favourite. We can't stop yet.

This is the next paragraph. And so it ends"""

#Set a file to use as tokeniser rules, this one is for English, other languages are available too:
settingsfile = "tokconfig-eng"

#Initialise the tokeniser, options are passed as keyword arguments, defaults:
#   lowercase=False, uppercase=False, sentenceperlineinput=False,
#   sentenceperlineoutput=False,
#   sentencedetection=True, paragraphdetection=True, quotedetection=False,
#   debug=False
tokenizer = ucto.Tokenizer(settingsfile)

#pass the text (this may be called multiple times)
tokenizer.process(text)

#read the tokenised data
for token in tokenizer:
    #token is an instance of ucto.Token, serialise to string using str()
    print("[" + str(token) + "]", end="")

    #tokens remember whether they are followed by a space
    if token.isendofsentence():
        print()
    elif not token.nospace():
        print(" ", end="")
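# A minimal alternative sketch (not part of the original example): the tokeniser
# also exposes sentences(), as used in the get_tokenized_data snippet above,
# which returns the detected sentences as strings after process() has been called.
tokenizer.process("A first sentence. And a second one.")
for sentence in tokenizer.sentences():
    print(sentence)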
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(["BEL", "DUT"])
print("BEL", labelEncoder.transform(["BEL"]))
print("DUT", labelEncoder.transform(["DUT"]))

if _show_graphics:
    plt.figure()
    plot_confusion_matrix(cm, classes=labelEncoder.classes_,
                          title="My first confusion matrix")


# In[3]:

ucto_config = "tokconfig-nld"
tokeniser = ucto.Tokenizer(ucto_config,
                           sentenceperlineinput=True,
                           sentencedetection=False,
                           paragraphdetection=False)


def read_data(file):
    text = {}
    with open(file) as f:
        for line in tqdm(f):
            sentence, language = line.strip().split("\t")
            tokeniser.process(sentence)
            if language not in text:
                text[language] = []
            current_line = []
            for token in tokeniser:
                current_line.append(str(token))
                if token.isendofsentence():