Example #1
    def run(self):

        print('Running Tokenizer...')

        with open(self.in_filtered().path, 'r', encoding='utf-8') as file_in:
            tweets = json.load(file_in)

        # tokenize each tweet text, optionally stripping punctuation and lowercasing
        toktweets = []
        tokenizer = ucto.Tokenizer(self.config)
        for tweet in tweets:
            text = tweet['text']
            tokenizer.process(text)
            tokens = []
            for token in tokenizer:
                if not (self.strip_punctuation
                        and token.tokentype == 'PUNCTUATION'):
                    tokens.append(token.text)
            tokenized = ' '.join(tokens)
            if self.lowercase:
                tokenized = tokenized.lower()
            tweet['text'] = tokenized
            toktweets.append(tweet)

        # write to file
        with open(self.out_tokenized().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(toktweets, file_out)
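
A minimal, self-contained sketch of the same filtering step outside the task class (assuming an installed tokconfig-eng configuration; the sample tweet is made up):

import ucto

# tokenize one tweet, drop punctuation tokens and lowercase the rest
tokenizer = ucto.Tokenizer("tokconfig-eng")
tokenizer.process("Awesome!!! I love #Python :)")
tokens = [token.text for token in tokenizer
          if token.tokentype != 'PUNCTUATION']
print(' '.join(tokens).lower())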
Example #2

    def vectorize(self, text, underscore=False, user=False, url=False):

        tokenizer = ucto.Tokenizer(
            '/vol/customopt/lamachine/etc/ucto/tokconfig-nl-twitter')
        text = text[2:-1]
        vector = []
        if underscore:
            text = '<s> ' + text + ' <s>'
        tokens = []
        tokenizer.process(text)
        for token in tokenizer:
            if not token.tokentype == 'PUNCTUATION':
                if user and token.text[0] == '@':
                    tokens.append('USER')
                elif url and re.search('^http', token.text):
                    tokens.append('URL')
                else:
                    tokens.append(token.text)
        if underscore:
            ngrams = tokens + [
                '_'.join(x) for x in zip(tokens, tokens[1:])
            ] + [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
        else:
            ngrams = tokens + [
                ' '.join(x) for x in zip(tokens, tokens[1:])
            ] + [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
        in_vocabulary = [(x, float(ngrams.count(x)))
                         for x in list(set(ngrams) & set(self.keys))]
        #print('IN VOCABULARY', in_vocabulary)
        vector = [0.0] * self.vocabulary_length
        for ngram in in_vocabulary:
            vector[self.vocabulary[ngram[0]]] = ngram[1]
        return vector
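
The zip-based n-gram counting above can be seen in isolation with a toy vocabulary (an illustration only, not part of the original class):

# unigrams, bigrams and trigrams built the same way as above
tokens = ['ik', 'hou', 'van', 'python']
bigrams = [' '.join(x) for x in zip(tokens, tokens[1:])]    # ['ik hou', 'hou van', 'van python']
trigrams = [' '.join(x) for x in zip(tokens, tokens[1:], tokens[2:])]
ngrams = tokens + bigrams + trigrams

# count only n-grams present in a (toy) vocabulary and place each count at its index
vocabulary = {'ik': 0, 'hou van': 1, 'van python': 2}
vector = [0.0] * len(vocabulary)
for ngram in set(ngrams) & set(vocabulary):
    vector[vocabulary[ngram]] = float(ngrams.count(ngram))
print(vector)  # [1.0, 1.0, 1.0]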
Example #3
import ucto


def get_tokenized_data(data):
    settingsfile = "./tokconfig-eng"
    tokenizer = ucto.Tokenizer(settingsfile)
    tokenizer.process(data)
    tokenized_array = []
    for sentence in tokenizer.sentences():
        tokenized_array.append(sentence)
    return tokenized_array
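
A hypothetical call, assuming ./tokconfig-eng exists in the working directory; ucto returns each sentence as a string of space-separated tokens:

sentences = get_tokenized_data("This is a test. It has two sentences.")
# roughly: ['This is a test .', 'It has two sentences .']
print(sentences)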
Example #4

    def __init__(self):

        name = ''
        os.chdir('/scratch2/www/yawyt3/repo/youarewhatyoutweet/yawyt/main/classifiers/')
        os.mkdir('files/')
        self.tokenizer = ucto.Tokenizer('/vol/customopt/lamachine/etc/ucto/tokconfig-nl-twitter')
        self.f = 0
        self.tweets = []
Example #5
import csv
import json
import tqdm
import ucto

tokenizer = ucto.Tokenizer("tokconfig-eng")

contractions = {
    "ain't", "aren't", "can't", "could've", "couldn't", "daren't", "daresn't",
    "dasn't", "didn't", "doesn't", "don't", "e'er", "everyone's", "finna",
    "gimme", "gonna", "gotta", "hadn't", "hasn't", "haven't", "he'd", "he'll",
    "he's", "he've", "how'd", "how'll", "how're", "how's", "I'd", "I'll",
    "I'm", "I'm'a", "I'm'o", "I've", "isn't", "it'd", "it'll", "it's", "let's",
    "ma'am", "mayn't", "may've", "mightn't", "might've", "mustn't",
    "mustn't've", "must've", "needn't", "ne'er", "o'clock", "o'er", "ol'",
    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't",
    "somebody's", "someone's", "something's", "that'll", "that're", "that's",
    "that'd", "there'd", "there'll", "there're", "there's", "these're",
    "they'd", "they'll", "they're", "they've", "this's", "those're", "'tis",
    "'twas", "wasn't", "we'd", "we'd've", "we'll", "we're", "we've", "weren't",
    "what'd", "what'll", "what've", "when's", "where'd", "where're", "where's",
    "where've", "which's", "who'd", "who'd've", "who'll", "who're", "who's",
    "who've", "why'd", "why're", "why's", "won't", "would've", "wouldn't",
    "y'all", "y'all'd've", "yesn't", "you'd", "you'll", "you're", "you've",
    "noun's"
}


def resolve_contractions(tokens):
    new_tokens = []
    for i, token in enumerate(tokens):
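
A minimal, hypothetical sketch of how split contraction pieces could be rejoined against the contractions set above (an assumption, not necessarily how resolve_contractions continues):

def merge_split_contractions(tokens):
    # hypothetical helper: rejoin pieces such as "do" + "n't" into "don't"
    # whenever the concatenation appears in the contractions set above
    lowered = {c.lower() for c in contractions}
    merged = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i] + tokens[i + 1]).lower() in lowered:
            merged.append(tokens[i] + tokens[i + 1])
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged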
Example #6
from texts.models import Text
from collections import Counter
from . import number2word
from . import lexicons
import string
from . import language_detection as ld
import re

'''ucto is a tokenizer and part of the LaMachine environment; it can only be loaded if LaMachine is active'''
try:
	import ucto
	tokenizer = ucto.Tokenizer("tokconfig-nld", lowercase=True, paragraphdetection=False)
except Exception:
	print('could not load ucto, please activate LaMachine')
	tokenizer = ''

# object to map digits (e.g. 11) to written-out words (e.g. elf) in Dutch or Frisian
n2w = number2word.Number2word()
dutch, frisian = lexicons.make_dutch_and_frysian_lexicon()
# lexicon with Dutch and Frisian lemmas, used to decide whether to include a dash word, e.g. e-mail
lexicon = set(frisian.words + dutch.words)


class Cleaner:
	'''Clean a text with ucto; numbers are mapped to written-out words.'''
	def __init__(self, text, remove_number_compounds=True):
		'''text                        text to clean
		remove_number_compounds        whether 2013-2301 should be mapped to 2013 2301
		'''
		self.text = text
Example #7
#!/usr/bin/env python3

import sys
import textgrid
import argparse
import ucto
import re

tokenizer = ucto.Tokenizer("tokconfig-nld")


def tokenize(text):
    #first we remove *d (dialectal or foreign), *u (other), *v (foreign language), *a (broken-off words), *x (uncertain whether heard correctly)
    text = re.sub(r"\*[duvax]\b", "", text)
    tokenizer.process(text)
    return [str(token) for token in tokenizer]


def chunks(text):
    """Parse data into chunks, each entity and each non-entity is a chunk, text will be tokenised on the fly"""
    begin = None
    end = None
    cls = ""
    chunk = ""
    for i, c in enumerate(text.strip() + " "):
        if begin is not None:
            if c == '[':
                print("Skipping text because of nested brackets: " + text,
                      file=sys.stderr)
                break
            elif c == ']':
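
For illustration only, a self-contained sketch of the bracket-chunking idea, assuming entities are written as "[entity CLASS]" (the original's exact annotation format is not shown here):

import re

def simple_chunks(text):
    # split bracket-annotated text like "ik zag [Jan PER] gisteren"
    # into (chunk, class) pairs; unbracketed spans get an empty class
    pairs = []
    for i, part in enumerate(re.split(r"\[(.*?)\]", text)):
        if i % 2 == 1:                    # captured bracket content: "entity CLASS"
            chunk, _, cls = part.rpartition(" ")
            pairs.append((chunk, cls))
        elif part.strip():                # plain text between brackets
            pairs.append((part.strip(), ""))
    return pairs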
Example #8
import re
from tqdm import tqdm
import math
import toolz
import json
import ucto

DEFAULT_EOW = '__eow'
DEFAULT_SOW = '__sow'
DEFAULT_UNK = '__unk'
DEFAULT_PAD = '__pad'
DEFAULT_MASK = '__mask'
DEFAULT_CLS = '__cls'
DEFAULT_SEP = '__sep'

configurationFile = "tokconfig-nld"
tokenizer = ucto.Tokenizer(configurationFile)


def ucto_tokenize(sentence):
    # tokenize one sentence with ucto and return the tokens as strings
    tokenized_sentence = []
    tokenizer.process(sentence)
    for token in tokenizer:
        tokenized_sentence += [str(token)]
    # running count of processed sentences, used as a simple progress indicator
    ucto_tokenize.counter += 1
    print(ucto_tokenize.counter)
    return tokenized_sentence


ucto_tokenize.counter = 0
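
A hypothetical usage of the helper above (requires the tokconfig-nld configuration loaded at module level):

sentences = ["Dit is een zin.", "En dit is er nog een."]
tokenized = [ucto_tokenize(s) for s in sentences]
# roughly: tokenized[0] == ['Dit', 'is', 'een', 'zin', '.']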
Example #9
import ucto

text = """To be or not to be, that's the question. This is a test to tokenise. We can span
multiple lines!!! The number 6 is Mr Li's favourite. We can't stop yet.

This is the next paragraph. And so it ends"""

#Set a file to use as tokeniser rules, this one is for English, other languages are available too:
settingsfile = "tokconfig-eng"

#Initialise the tokeniser, options are passed as keyword arguments, defaults:
#   lowercase=False,uppercase=False,sentenceperlineinput=False,
#   sentenceperlineoutput=False,
#   sentencedetection=True, paragraphdetection=True, quotedetection=False,
#   debug=False
tokenizer = ucto.Tokenizer(settingsfile)

#pass the text (may be called multiple times),
tokenizer.process(text)

#read the tokenised data
for token in tokenizer:
    #token is an instance of ucto.Token, serialise to string using str()
    print("[" + str(token) + "]", end="")

    #tokens remember whether they are followed by a space
    if token.isendofsentence():
        print()
    elif not token.nospace():
        print(" ", end="")
Example #10
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(["BEL", "DUT"])
print("BEL", labelEncoder.transform(["BEL"]))
print("DUT", labelEncoder.transform(["DUT"]))

if _show_graphics:
    plt.figure()
    plot_confusion_matrix(cm, classes=labelEncoder.classes_, title="My first confusion matrix")


# In[3]:


ucto_config = "tokconfig-nld"
tokeniser = ucto.Tokenizer(ucto_config, sentenceperlineinput=True, sentencedetection=False, paragraphdetection=False)

def read_data(file):
    text = {}
    with open(file) as f:
        for line in tqdm(f):
            sentence, language = line.strip().split("\t")
            tokeniser.process(sentence)

            if language not in text:
                text[language] = []

            current_line = []
            for token in tokeniser:
                current_line.append(str(token))
                if token.isendofsentence():