Example #1
 def createTagCloud(self,wordline):
     """
     Create tag cloud image 
     """
     wordstream = []
     if wordline == '':
         return False
     
     wordsTokens = WhitespaceTokenizer().tokenize(wordline)
     wordsTokens.remove(wordsTokens[0])
     wordstream.append(' '.join(wordsTokens))
     wordstream = ' '.join(wordstream)
     thresh = self.wordCount
     colorS = self.colorSchemes[self.color]
     
     tags = make_tags(get_tag_counts(wordstream)[:thresh],\
                      minsize=3, maxsize=40,\
                      colors = COLOR_SCHEMES[colorS])
     
     create_tag_image(tags, self.png,\
                      size=(960, 400),\
                      background=(255, 255, 255, 255),\
                      layout= LAYOUT_HORIZONTAL,\
                      fontname='Neuton')
     return True
Example #2
    def parse(self, corpus_filename, key):
        assert type(corpus_filename) == str, "the filename must be a string"
        assert type(key) == str, "the key must be a string"

        wst = WhitespaceTokenizer()
        with codecs.open(corpus_filename, encoding="utf8") as input:
            corpus = [wst.tokenize(l) for l in input]
        return {key: corpus}
Example #3
 def get_texts_raw(self):
     """
     Parse documents analogously to SimpleCorpus.get_texts(),
     but tokenized by whitespace only
     """
     wst = WhitespaceTokenizer()
     with self.getstream() as stream:
         for doc in stream:
             yield [word for word in wst.tokenize(utils.to_unicode(doc))]
Example #4
 def tokenizeDoc(self, doc):
     """
     Get the tokens (words) from the doc
     uses nltk.
     """
     #print ("Tokenizing doc")
     tokenizer = WhitespaceTokenizer()
     docTokens = tokenizer.tokenize(doc)
     return docTokens
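
Both snippets above rely on the same behaviour: WhitespaceTokenizer splits only on runs of whitespace, so punctuation stays attached to its word. A minimal standalone check (the sample sentence is made up for illustration):

from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88\nin New York."))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']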
Example #5
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
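
The example above layers three NLTK tokenizers: BlanklineTokenizer for speaker blocks, PunktSentenceTokenizer for sentences, and WhitespaceTokenizer for words. A self-contained sketch of that pipeline on a made-up two-speaker text (an untrained PunktSentenceTokenizer falls back to its default parameters):

from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer

text = "Hello there. Are you lost?\n\nGo straight on and turn left at the lights."
for block in BlanklineTokenizer().tokenize(text):
    for sentence in PunktSentenceTokenizer().tokenize(block):
        print(WhitespaceTokenizer().tokenize(sentence))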
Example #6
    def _calculate_sentence_title_score(self, sentence):
        """Calculates a score based on how many words the sentence shares with the article title."""
        title = self._remove_punctuation(self.title)
        sentence = self._remove_punctuation(sentence)
        tokenizer = WhitespaceTokenizer()
        tokenized_title = tokenizer.tokenize(title)
        tokenized_sentence = tokenizer.tokenize(sentence)
        
        common_words = set()
        for word in tokenized_sentence:
            if word in tokenized_title:
                common_words.add(word)

        score = float(len(common_words)) / len(tokenized_sentence)
        return SENTENCE_SCORE_WEIGHTS['title'] * score
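
The score is simply the fraction of sentence tokens that also occur in the title, scaled by a weight. A hypothetical standalone restatement (without the punctuation stripping and SENTENCE_SCORE_WEIGHTS scaling of the original):

from nltk.tokenize import WhitespaceTokenizer

def title_overlap_score(title, sentence):
    tokenizer = WhitespaceTokenizer()
    title_tokens = set(tokenizer.tokenize(title.lower()))
    sentence_tokens = tokenizer.tokenize(sentence.lower())
    common_words = {w for w in sentence_tokens if w in title_tokens}
    return len(common_words) / len(sentence_tokens)

print(title_overlap_score("Whitespace tokenization in NLTK",
                          "NLTK ships a whitespace tokenizer"))   # 0.4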
Example #7
def get_words(document):
    '''
    Return a list of unique words in document
    '''
    regex1 = re.compile('\W')          # match non-alphanumeric
    regex2 = re.compile('&(#)*(\w)*;')  # match html entities
    regex3 = re.compile('( ){2,}')      # match more than 2 spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer  = WhitespaceTokenizer()
    # lowercase document, remove punctuation, and html entities
    document   = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
             lemmatizer.lemmatize(word)
             for word in tokenizer.tokenize(document)
             if word not in STOPWORDS and len(word) > 2
            ]
    return FreqDist(words)
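
get_words returns an nltk.FreqDist rather than a plain list, so callers get counts for free. A small sketch of what that object exposes (sample text made up; the lemmatizer above additionally requires the NLTK wordnet data):

from nltk import FreqDist
from nltk.tokenize import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("the cat sat on the mat the end")
fd = FreqDist(tokens)
print(fd['the'])          # 3
print(fd.most_common(2))  # [('the', 3), ('cat', 1)]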
Example #8
def preprocess_article_content(text_df):

    print 'preprocessing article text...'
    
    # text_df is data frame from SQL query, column 'content' contains text content from each article
    article_list = []
    
    # define punctuation to remove
    punc=set('''`~!@#$%^&*()-_=+\|]}[{;:'",<.>/?''')
    
    tokenizer = WhitespaceTokenizer()
    stop_words = set(stopwords.words('english'))
    #stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    
    kept_rows = []
    
    for row, article in enumerate(text_df['content']):
        
        cleaned_tokens = []
        
        tokens = tokenizer.tokenize(article.decode('unicode-escape', 'ignore').lower())
        
        for token in tokens:
            token = ''.join(ch for ch in token if ch not in punc)
            
            if token not in stop_words:
                
                if len(token) > 0 and len(token) < 20: 
                    
                    if not token[0].isdigit() and  not token[-1].isdigit(): 
                        #stemmed_token = stemmer.stem(token)
                        lemmatized_tokens = lemmatizer.lemmatize(token)
                        #cleaned_tokens.append(stemmed_token)
                        cleaned_tokens.append(lemmatized_tokens)
        
        # join cleaned tokens into a string for subsequent LDA
        # filtering out content that is likely noise (error messages etc)
        if len(cleaned_tokens) > 100:
            article_list.append(' '.join(wd for wd in cleaned_tokens))
            kept_rows.append(row)

    print 'preprocessed content for %d articles' % len(article_list)
        
    return article_list, kept_rows
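
A Python 3 condensation of the per-token cleaning steps above, with a stand-in stop-word set instead of NLTK's list and the lemmatization left out, just to make the filter order explicit:

import string
from nltk.tokenize import WhitespaceTokenizer

punc = set(string.punctuation)
stop_words = {"the", "a", "is", "of", "on"}   # stand-in for stopwords.words('english')

def clean_tokens(text):
    cleaned = []
    for token in WhitespaceTokenizer().tokenize(text.lower()):
        token = ''.join(ch for ch in token if ch not in punc)
        if (token and token not in stop_words and len(token) < 20
                and not token[0].isdigit() and not token[-1].isdigit()):
            cleaned.append(token)
    return cleaned

print(clean_tokens("The model is trained on 2,000 articles of raw text."))
# ['model', 'trained', 'articles', 'raw', 'text']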
Example #9
 def extract(self, corpus):
     from nltk.stem import WordNetLemmatizer
     from nltk.corpus import stopwords
     from nltk.tokenize import WhitespaceTokenizer
     exclude_words = stopwords.words('english')
     exclude_words.append('rt')
     exclude_words.append('&amp;')
     tok = WhitespaceTokenizer()
     lem = WordNetLemmatizer()
     tsents = [tok.tokenize(sent) for sent in corpus]
     norm_words = []
     for sent in tsents:
         for word in sent:
             if word.startswith('http://'): continue
             nword = lem.lemmatize(word.lower())
             if nword not in exclude_words:
                 norm_words.append(nword)
     return nltk.FreqDist(norm_words)
Example #10
def CleanAndTokenize(text):
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    text = re.sub(r, " URLURLURL", text)
    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
        text = soup.get_text()
    # Normalize everything to lower case
    text = text.lower()
    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)
    # get rid of em dashes
    # table = {
    #     ord(u'\u2018') : u"'",
    #     ord(u'\u2019') : u"'",
    #     ord(u'\u201C') : u'"',
    #     ord(u'\u201d') : u'"',
    #     ord(u'\u2026') : u'',
    #     ord(u'\u2014') : u'',
    # }
    # text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation
    # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
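
The URL regex at the top of CleanAndTokenize does most of the heavy lifting; a quick isolated check of what it matches (sample string made up). The replacement starts with a space, which is one reason the function later condenses double spaces:

import re

url_re = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
print(url_re.sub(" URLURLURL", "read this http://example.com/page?id=1 now"))
# read this  URLURLURL now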
Example #11
def tokenize(sent,tokenizer_type):
	#tokenizer_type is [0] the tokenizer [1] the REGEX or ''
	tokenizer = 'not_implemented'
	#split on custom is the only non-nltk tokenizer
	if tokenizer_type[0] == 'split_on_custom':
		# split directly on the custom delimiter stored in tokenizer_type[1]
		return sent.split(tokenizer_type[1])
	if tokenizer_type[0] == 'whitespace':
		tokenizer = WhitespaceTokenizer()
	if tokenizer_type[0] == 'wordpunkt':
		tokenizer = WordPunctTokenizer()
	if tokenizer_type[0] == 'regexp':
		tokenizer = RegexpTokenizer(tokenizer_type[1])
	if tokenizer_type[0] == 'treebank':
		tokenizer = TreebankWordTokenizer()
	try:
		if tokenizer != "not_implemented":
			return tokenizer.tokenize(sent)
		else:
			return 'Tokenizer not implemented'
	except ValueError: #if the input is not a list of strings
		pass
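
For the ('regexp', pattern) branch, the second element of the tuple is handed straight to RegexpTokenizer; a tiny self-contained illustration of what that yields for a word-only pattern (the sample sentence is made up):

from nltk.tokenize import RegexpTokenizer

print(RegexpTokenizer(r'\w+').tokenize("Don't split, just words"))
# ['Don', 't', 'split', 'just', 'words']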
Example #12
def main(args):
    tokenizer = WhitespaceTokenizer()
    voc = set()

    dir = args.train_dir

    dir_pos = os.path.join(dir, 'pos')
    cnt = 0
    fmt = 'Processed %d positive docs'
    for fname in os.listdir(dir_pos):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print fmt % cnt

        f = open(os.path.join(dir_pos, fname), 'rb')
        voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
        f.close()
    print fmt % cnt

    dir_neg = os.path.join(dir, 'neg')
    cnt = 0
    fmt = 'Processed %d negative docs'
    for fname in os.listdir(dir_neg):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print fmt % cnt

        f = open(os.path.join(dir_neg, fname), 'rb')
        voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
        f.close()
    print fmt % cnt

    voc = sorted(list(voc))
    f = open(args.output, 'wb')
    pickle.dump(voc, f)
    f.close()
Example #13
    def buildVocab(self):
        self.vocabSize = int(self.vocabSize)
        print ("Building vocab from frequencies")
        # get tokenized corpus and get word counts
        self.tokenizedCorpus = []
        self.vocabSet = set()
        
        tokenizer = WhitespaceTokenizer()
        
        for doc in self.corpus:
            # tokenize doc
            docTokens = tokenizer.tokenize(doc)
            self.tokenizedCorpus.extend(docTokens)
        print ("  Tokenized corpus = ", len(self.tokenizedCorpus))

        # vocab for entire corpus
        self.fullVocab = set(self.tokenizedCorpus)
        print ("  Full vocab = ", len(self.fullVocab))
        
        self.vocabCounts = {}        
        # Extremely inefficient since has to iterate entire corpus for each word
        # generate counts for each word
        #for w in self.fullVocab:
        #    self.vocabCounts[w] = self.tokenizedCorpus.count(w)
        
        # for each word in corpus
        for w in self.tokenizedCorpus:
            if w in self.vocabCounts:
                self.vocabCounts[w] += 1
            else:
                self.vocabCounts[w] = 1


        # sort counts with most frequent first
        sortedCounts = sorted(self.vocabCounts.items(), key=operator.itemgetter(1), reverse=True)
        
        # generate vocab from first vocabSize words
        vocabCounts = sortedCounts[0:self.vocabSize]
        self.vocab = [e[0] for e in vocabCounts]
        print ("  vocab = ", self.vocab)
Example #14
 def topicWordFreq(self,wordline):
     """
     Compute word-frequency statistics for the word stream.
     """
     wordstream = []
     wordsTokens = WhitespaceTokenizer().tokenize(wordline)
     wordsTokens.remove(wordsTokens[0])
     wordstream.append('  '.join(wordsTokens))
     wordstream = ' '.join(wordstream)
     #print wordstream
     
     fdist = FreqDist()
     for word in word_tokenize(wordstream):
         fdist.inc(word)
     result = [list(item) for item in fdist.items()]
     num = float(fdist.N())
     result = [[val[0],val[1],val[1]/num] for val in result]
     #print "smaples:"
     #print fdist.items()
     #print fdist.keys()
    
     return result
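
fdist.inc() is the old NLTK 2 API; in NLTK 3 FreqDist is a Counter subclass, so the same per-word relative frequencies come from passing the token list to the constructor. A small sketch (whitespace tokenization used here to avoid the punkt data dependency of word_tokenize):

from nltk import FreqDist
from nltk.tokenize import WhitespaceTokenizer

fdist = FreqDist(WhitespaceTokenizer().tokenize("spam spam eggs"))
result = [[word, count, count / fdist.N()] for word, count in fdist.items()]
print(result)   # [['spam', 2, 0.666...], ['eggs', 1, 0.333...]]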
Example #15
def bag_of_words(voc, doc, handle_negation=False, handle_bigrams=False):
    """
    Generate bag of words according to dictionary.
    Haven't done a sanity check on the dictionary.
    Please make sure each word in the dictionary is unique and the list is sorted.
    :param voc: list of words
    :param doc: string
    :return: list of feature vector.
        0 as not appearing.
        1 as appearing positive.
        -1 as appearing negative.
        Has the same size of dictionary.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(doc)
    fv = np.zeros_like(voc, np.int8)
    is_previous_negative = False
    is_previous_enhanced = False

    for token in tokens:
        word = token.lower()
        if is_skip_word(word):
            continue
        if is_negative(word):
            is_previous_negative = True
            continue
        if is_degree(word):
            is_previous_enhanced = True
            continue

        try:
            idx = voc.index(word)
            fv[idx] = 1
            fv[idx] *= -1 if handle_negation and is_previous_negative else 1
            fv[idx] *= 2 if handle_bigrams and is_previous_enhanced else 1
        except ValueError:
            # word not in the vocabulary -- skip it
            pass
        is_previous_negative = False
        is_previous_enhanced = False

    return fv  # per the docstring above; the scraped snippet appears cut off here
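
Stripped of the negation/degree handling and the helper predicates, the core of bag_of_words is a presence vector over the vocabulary; a toy sketch of that base case:

import numpy as np
from nltk.tokenize import WhitespaceTokenizer

voc = ["good", "bad", "movie"]
tokens = [t.lower() for t in WhitespaceTokenizer().tokenize("Not a bad movie")]
fv = np.zeros_like(voc, np.int8)
for token in tokens:
    if token in voc:
        fv[voc.index(token)] = 1
print(fv)   # [0 1 1]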
Example #16
    def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0):
        """Initialize a prototype with a specified configurations.

        Parameters:
        mongo_db -- Mongo DB connection
        postgre_db -- PostGre DB connection
        sentence_mode -- whether or not to use sentence window mode (default True)
        punctuation_mode -- whether or not to use punctuation mode (default False)
        window_size -- the size of the sentence or word window (default 0)
        """
        self.__mongo_db = mongo_db
        self.__postgre_db = postgre_db
        self.__sentence_mode = sentence_mode
        self.___punctuation_mode = punctuation_mode
        self.__window_size = window_size
        self.tokenizer = WhitespaceTokenizer()
        self.parser = Parser()
Example #17
from nltk.tokenize import WhitespaceTokenizer  # needed for the tokenizer used below
import string
import json
import io
import sys 


# problem with conjugation
# need hist for that too without decomposing them
# but now works
ic_dict = {}

cong = []
all_tokens = 0
#create IC dict
#tokenizer = RegexpTokenizer(r'\w+')
tokenizer = WhitespaceTokenizer()
filename = "../Subtlex.US.txt"
for line in open(filename,"r").readlines():
    line = line.lower()
    line = line.strip()
    #line = line.replace("-"," ")
    #line = "self-support"
    line = ' '.join(word.strip(string.punctuation) for word in line.split())
    print tokenizer.tokenize(line)
    t_list = tokenizer.tokenize(line)
    
    for token in t_list:
        try:
            token = token.encode("ascii", "ignore").lower()
            #token = unicode(token, 'utf8')
            #token = token.encode('utf8')
Example #18
# Write your code here
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams, trigrams
from collections import Counter
from random import choices, choice
import re


filename = input()
tk = WhitespaceTokenizer()
with open(filename, 'r', encoding='utf-8') as fc:
    cops = fc.readlines()
cops1 = " ".join(cops)
copsx = tk.tokenize(cops1)
tricops = list(trigrams(copsx))
bicops = list(bigrams(copsx))

tricop_dict = {}
for head1, head2, tail in tricops:
    head = head1 + " " + head2
    tricop_dict.setdefault(head, []).append(tail)
regex1 = r'^[A-Z][a-z\']*?$'
regex2 = r'^[A-z\'-]*?$'
regex3 = r'^[A-z\'-]*?[.!?]$'
regex4 = r'^[A-Z][A-z\' ]*?$'
new_elements = ''

for _xelem in range(len(tricop_dict)):
    start1 = choice(tricops)
    if re.match(regex1, start1[0]) and re.match(regex2, start1[1]):
        new_elements = start1[0] + " " + start1[1]
Example #19
 def reducer(self, sentence, docnames):
     tokens = WhitespaceTokenizer().tokenize(sentence)
     yield (', '.join(docnames) + ': ' + sentence + ' --> ', tokens)
Example #20
import emoji
from emoji import UNICODE_EMOJI
import unicodedata
import num2words
import pandas
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from operator import add
from itertools import starmap
from nltk.stem import LancasterStemmer

lancaster=LancasterStemmer()
stopWords = set(stopwords.words('english'))
word = WhitespaceTokenizer()

class TextSetup:
    def __init__(self, text):
        self.t = text
        self.text = self.t
        self.dtype_str = isinstance(self.text, str)
        self.dtype_list = isinstance(self.text, list)
        self.dtype_pd_series = isinstance(self.text, pandas.core.series.Series)
        if self.dtype_pd_series:
            self.t = text.tolist()
            self.text = self.t

class SwachhText(TextSetup):
    def __init__(self, TextSetup):
        self.t = TextSetup.t
Example #21
import pandas  # needed for read_csv below
import re
from nltk.tokenize import WhitespaceTokenizer

# LOAD csv into dataframe
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3.csv', names=colnames)
talks = df.text.tolist()

# All this for labels
authors = df.author.tolist()
dates = df.date.tolist()
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author + " " + year for author, year in zip(authors, years)]

# TOKENIZE
tokenizer = WhitespaceTokenizer()
texts = []
for talk in talks:
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Small Test Corpus
# =-=-=-=-=-=-=-=-=-=-=

test = texts[0:5]

# =-=-=-=-=-=-=-=-=-=-=
# Function to collect word positions within a text (as a word list)
# =-=-=-=-=-=-=-=-=-=-=
Example #22
 
mwe_tokenizer.tokenize(sentence5.split()) # 'For', 'more', '@indian_army',
#  'Indian Army' should be treated as a single token. But here "Army!" is treated as a separate token.

mwe_tokenizer.tokenize(sentence5.replace('!','').split()) # "Army!" will be treated as Army 


# 3. Regexp Tokenizer
from nltk.tokenize import RegexpTokenizer
reg_tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence5)


# 4. Whitespace Tokenizer
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)


# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)


# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')

' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])
Example #23
 def tokens(self):
     """method is used to parse the text file and to return
     a list of all tokens"""
     text = self.read_file()
     tk = WhitespaceTokenizer()
     return tk.tokenize(text)
Example #24
import pymysql
from nltk.tokenize import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()

terms = ['debut', 'two', 'language', 'also']
tokenizer = WhitespaceTokenizer()

sql = 'SELECT * FROM wiki'
cursor.execute(sql)
for record in cursor.fetchall():
    doc_id = record['id']
    text = record['text']
    for term in terms:
        for start, end in tokenizer.span_tokenize(text):
            if text[start:end].lower() == term:
                insert_sql = 'INSERT INTO inverted_index VALUES (%s, %s)'
                cursor.execute(insert_sql, (term, doc_id))
                break

connection.commit()
connection.close()
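
The inverted-index build relies on span_tokenize, which yields character offsets instead of the token strings; a quick standalone look at those spans (sample text made up):

from nltk.tokenize import WhitespaceTokenizer

text = "Debut album in two languages"
for start, end in WhitespaceTokenizer().span_tokenize(text):
    print(start, end, text[start:end])
# 0 5 Debut
# 6 11 album
# ...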
Example #25
# Stage 2. Break the dataset into bigrams
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams
from collections import Counter

# f = open(input(), "r", encoding="utf-8")
f = open(input(), "r", encoding="utf-8")

# We get tokens from the corpus
wtk = WhitespaceTokenizer().tokenize(f.read())

# Bigrams generates an iterator. Put type list to get the data
bigr = list(bigrams(wtk))

dict_bigr = {}

# We create a dictionary
# Key = First value in the tuple
# Value = Second value that we store as a list associated to the key
for key, value in bigr:
    dict_bigr.setdefault(key, []).append(value)

head = None
while head != "exit":
    head = input()
    if head != "exit":
        try:
            print(f"Head: {head}")
            cad = "The requested word is not in the model. Please input another word."
            p = dict_bigr.setdefault(head, cad)
            if p == cad:
Example #26
def CleanAndTokenize(text):

    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    text = re.sub(r, " URLURLURL", text)

    # Strip html tags
    soup = BeautifulSoup(text)
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
        text = soup.get_text()

    # Normalize everything to lower case
    text = text.lower()

    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    table = {
        ord(u'\u2018'): u"'",
        ord(u'\u2019'): u"'",
        ord(u'\u201C'): u'"',
        ord(u'\u201d'): u'"',
        ord(u'\u2026'): u'',
        ord(u'\u2014'): u'',  # get rid of em dashes
    }
    text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = text.replace("can't", "can not")
    text = text.replace("couldn't", "could not")
    text = text.replace("don't", "do not")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("shouldn't", "should not")
    text = text.replace("haven't", "have not")
    text = text.replace("aren't", "are not")
    text = text.replace("weren't", "were not")
    text = text.replace("wouldn't", "would not")
    text = text.replace("hasn't", "has not")
    text = text.replace("hadn't", "had not")
    text = text.replace("won't", "will not")
    text = text.replace("wasn't", "was not")
    text = text.replace("can't", "can not")
    text = text.replace("isn't", "is not")
    text = text.replace("ain't", "is not")
    text = text.replace("it's", "it is")

    text = text.replace("i'm", "i am")
    text = text.replace("i'm", "i am")
    text = text.replace("i've", "i have")
    text = text.replace("i'll", "i will")
    text = text.replace("i'd", "i would")

    text = text.replace("we've", "we have")
    text = text.replace("we'll", "we will")
    text = text.replace("we'd", "we would")
    text = text.replace("we're", "we are")

    text = text.replace("you've", "you have")
    text = text.replace("you'll", "you will")
    text = text.replace("you'd", "you would")
    text = text.replace("you're", "you are")

    text = text.replace("he'll", "he will")
    text = text.replace("he'd", "he would")
    text = text.replace("he's", "he has")

    text = text.replace("she'll", "she will")
    text = text.replace("she'd", "she would")
    text = text.replace("she's", "she has")

    text = text.replace("they've", "they have")
    text = text.replace("they'll", "they will")
    text = text.replace("they'd", "they would")
    text = text.replace("they're", "they are")

    text = text.replace("that'll", "that will")
    text = text.replace("that's", "that is")
    text = text.replace("there's", "there is")

    # Strip punctuation (except for a few)
    punctuations = string.punctuation  # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    # NOTE: Using a simple tokenizer based on spaces ...
    # Could also try a more sophisticated tokenizer if abbreviations / contractions should be conserved
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)

    return text_tokens
Example #27
def get_corpus_stats(text_content):
    return WhitespaceTokenizer().tokenize(text_content)
Example #28
target_vect = []
for line0 in tarin_label:
    target_vect.append(line0.strip())

dic_list = OrderedDict()

for line1 in Dictionary_txt:
    dic_list[line1.strip()] = 0
    preprocessed.write(line1.strip() + ",")
preprocessed.write("\n")
temp_dic = OrderedDict()
count = 0
for line in train_txt:
    feature_vect = []
    temp_dic = copy.copy(dic_list)
    token = WhitespaceTokenizer().tokenize(line)
    for wrd in token:
        if wrd in temp_dic:
            #print(wrd)
            temp_dic[wrd] = 1
    for key in temp_dic:
        feature_vect.append(temp_dic[key])
    temp_dic.clear()
    feature_vect.append(target_vect[count])
    print(feature_vect)
    str1 = ','.join(str(e) for e in feature_vect)
    preprocessed.write(str1)
    preprocessed.write("\n")
    count += 1

print(len(target_vect))
Example #29
def CleanAndTokenize(text):
	
	# Strip URLs and replace with token "URLURLURL"
	r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
	text = re.sub(r, " URLURLURL", text)

	# Strip html tags
	soup = BeautifulSoup(text)
	for tag in soup.findAll(True):
		tag.replaceWithChildren()
		text = soup.get_text()

	# Normalize everything to lower case
	text = text.lower()
	
	# Strip line breaks and endings \r \n
	r = re.compile(r"[\r\n]+")
	text = re.sub(r, "", text)

	table = {
		ord(u'\u2018') : u"'",
		ord(u'\u2019') : u"'",
		ord(u'\u201C') : u'"',
		ord(u'\u201d') : u'"',
		ord(u'\u2026') : u'',
		ord(u'\u2014') : u'', # get rid of em dashes
	}
	text = text.translate(table)

	# Normalize contractions
	# e.g. can't => can not, it's => it is, he'll => he will
	text = text.replace("can't", "can not")
	text = text.replace("couldn't", "could not")
	text = text.replace("don't", "do not")
	text = text.replace("didn't", "did not")
	text = text.replace("doesn't", "does not")
	text = text.replace("shouldn't", "should not")
	text = text.replace("haven't", "have not")
	text = text.replace("aren't", "are not")
	text = text.replace("weren't", "were not")
	text = text.replace("wouldn't", "would not")
	text = text.replace("hasn't", "has not")
	text = text.replace("hadn't", "had not")
	text = text.replace("won't", "will not")
	text = text.replace("wasn't", "was not")
	text = text.replace("can't", "can not")
	text = text.replace("isn't", "is not")
	text = text.replace("ain't", "is not")    
	text = text.replace("it's", "it is")
	
	text = text.replace("i'm", "i am")
	text = text.replace("i'm", "i am")
	text = text.replace("i've", "i have")
	text = text.replace("i'll", "i will")
	text = text.replace("i'd", "i would")
 
	text = text.replace("we've", "we have")
	text = text.replace("we'll", "we will")
	text = text.replace("we'd", "we would")
	text = text.replace("we're", "we are")
	
	text = text.replace("you've", "you have")
	text = text.replace("you'll", "you will")
	text = text.replace("you'd", "you would")
	text = text.replace("you're", "you are")
	
	text = text.replace("he'll", "he will")
	text = text.replace("he'd", "he would")
	text = text.replace("he's", "he has")
	
	text = text.replace("she'll", "she will")
	text = text.replace("she'd", "she would")
	text = text.replace("she's", "she has")
	
	text = text.replace("they've", "they have")
	text = text.replace("they'll", "they will")
	text = text.replace("they'd", "they would")
	text = text.replace("they're", "they are")
	
	text = text.replace("that'll", "that will")
	text = text.replace("that's", "that is")
	text = text.replace("there's", "there is")

	
	# Strip punctuation (except for a few)
	punctuations = string.punctuation # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
	excluded_punctuations = ["$", "%"]
	for p in punctuations:
		if p not in excluded_punctuations:
			text = text.replace(p, " ")

	# Condense double spaces
	text = text.replace("  ", " ")

	# Tokenize the text 
	# NOTE: Using a simple tokenizer based on spaces ... 
	# Could also try a more sophisticated tokenizer if abbreviations / contractions should be conserved
	tokenizer = WhitespaceTokenizer()
	text_tokens = tokenizer.tokenize(text)
	
	return text_tokens
Example #30
import logging
import re
import string
from unidecode import unidecode
import numpy as np
from affiliation_parser.keywords import *
from affiliation_parser.data_processor import us_cities, us_state_cities_map, us_city_pop_map
from nltk.tokenize import WhitespaceTokenizer

w_tokenizer = WhitespaceTokenizer()
punct_re = re.compile("[{}]".format(re.escape(string.punctuation)))

US_CITIES = us_cities()
US_CITIES_SET = set(US_CITIES)
US_CITIES_TOP_2000 = set(US_CITIES[:1000])
US_CITIES_POP_MAP = us_city_pop_map()
US_STATE_CITY_MAP = us_state_cities_map()
MAX_WORDS = max(len(s.split()) for s in US_CITIES)


def string_steps(s: str, max_size=MAX_WORDS):
    string_words = s.upper().replace(',', '').replace('.', '').split()
    final_set = set([])
    for step in range(1, max_size + 1):
        for start in range(len(string_words)):
            final_set.add(" ".join(string_words[start:start + step]))
            if start + step > len(string_words):
                break
        if step > len(string_words):
            break
Example #31
class InvertedIndex:
    ''' Main Inverted-Index structure'''
    def __init__(self):
        self._tokenizer = WhitespaceTokenizer()
        self._index_cache = IndexCache()
        self._stop_words = set(stopwords.words('english'))
        self._stemmer = SnowballStemmer("english")
        self._max_documents_per_shard = 50000
        self._num_documents_in_current_shard = 0
        if os.path.isfile("index_data/index.meta"):
            self._num_documents_in_current_shard = pickle.load(
                open("index_data/index.meta"))

    def search(self, query):
        combined_results = None
        ret_results = None
        for i in range(0, len(query), 2):
            op = query[i]
            keyword = self._stemmer.stem(query[i + 1].strip(
                string.punctuation))
            keyword_results = self._search_keyword(keyword)
            if combined_results:
                if op == "AND":
                    combined_results = combined_results.intersection(
                        set(keyword_results.keys()))
                elif op == "OR":
                    combined_results = combined_results.union(
                        set(keyword_results.keys()))
                else:
                    return {"status": False, "message": "Malformed query"}
                for doc in list(ret_results.keys()):  # copy keys; entries may be deleted below
                    if doc not in combined_results:
                        del ret_results[doc]
                    elif keyword_results.get(doc):
                        ret_results[doc] = ret_results[doc].union(keyword_results[doc])
                for doc in keyword_results:
                    if doc not in ret_results:
                        ret_results[doc] = keyword_results[doc]
            else:
                combined_results = set(keyword_results.keys())
                ret_results = keyword_results
        result_counts = dict()
        for el in ret_results:
            result_counts[el] = len(ret_results[el])
        sorted_result_counts = sorted(result_counts.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
        sorted_results = []
        for key, _ in sorted_result_counts:
            sorted_results.append({"key": key, "positions": ret_results[key]})
        if len(sorted_results) > 0:
            ret = {"status": True, "results": sorted_results}
        else:
            ret = {"status": False, "message": "No hits"}
        return ret

    def _search_keyword(self, query):
        docs = self._index_cache.get(query)
        if not docs:
            return dict()
        return docs

    def add(self, key, text):
        self._num_documents_in_current_shard += 1
        if self._num_documents_in_current_shard > self._max_documents_per_shard:
            self._num_documents_in_current_shard = 0
            self._index_cache.create_new_shard()
        token_positions = self._tokenizer.span_tokenize(text)
        for pos in token_positions:
            start_pos = pos[0]
            end_pos = pos[1]
            token = text[start_pos:end_pos].lower()
            if token in self._stop_words:
                continue
            token = token.strip(string.punctuation)
            token = self._stemmer.stem(token)
            if len(token) > 0:
                self._index_cache.add(token, key, (start_pos, end_pos))

    def delete(self, key, text):
        pass

    def save(self):
        pickle.dump(self._num_documents_in_current_shard,
                    open("index_data/index.meta", "wb"))
        self._index_cache.flush()
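
add() normalizes every token the same way search() normalizes keywords: lowercase, strip surrounding punctuation, then Snowball-stem. A one-token sketch of that normalization (sample token made up):

import string
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
token = "Searching,".lower().strip(string.punctuation)
print(stemmer.stem(token))   # search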
Example #32
import nltk
from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer, WordPunctTokenizer

PHRASE = 'I hadn\'t taken my breakfast before I came to Sharan\'s class'

white_space = WhitespaceTokenizer()
tree_bank_word = TreebankWordTokenizer()
word_punct = WordPunctTokenizer()

print("WhitespaceTokenizer   : ", white_space.tokenize(PHRASE))
print("TreebankWordTokenizer : ", tree_bank_word.tokenize(PHRASE))
print("WordPunctTokenizer    : ", word_punct.tokenize(PHRASE))
Example #33
class Plagiarism_Checker:
    def __init__(self, algorithm="TFIDF"):

        self.algorithm = algorithm
        self.stopWords = stopwords.words('english')

        self.wsTok = WhitespaceTokenizer()
        self.stemmer = LancasterStemmer()
        self.countVect = CountVectorizer()
        self.tfidfVect = TfidfVectorizer()

        self.queryData = []
        self.srcData = []

    def preprocess(self, documents):

        processed = []
        for document in documents:

            #1 Removing Punctuations
            data = document.translate(
                str.maketrans("'", " ", string.punctuation))

            #2 Converting to Lowercase
            data = data.lower()

            #3 Tokenization
            data = self.wsTok.tokenize(data)

            #4 Removing Stop Words
            data = [word for word in data if not word in self.stopWords]

            #5 Stemming words
            data = [self.stemmer.stem(word) for word in data]

            processed.append(data)

        return processed

    def setQueryText(self, data, clearData=True):

        if type(data) != list:
            print("Error : Set Query - Datatype should be 'list'")

        if clearData:
            self.queryData = []

        for d in data:
            self.queryData.append(d)

    def setSourceText(self, data, clearData=True):

        if type(data) != list:
            print("Error : Set Source - Datatype should be 'list'")

        if clearData:
            self.srcData = []

        for d in data:
            self.srcData.append(d)

    def jaccardSimilarity(self, query, document):

        inter_l = list(set(query) & set(document))
        union_l = list(set(query) | set(document))

        return len(inter_l) / len(union_l)

    def getPlagMatrix(self, documents):

        if self.algorithm == "TFIDF":
            data = [
                ','.join(str(v) for v in document) for document in documents
            ]
            tfidf = self.tfidfVect.fit_transform(data)
            similarityMatrix = cosine_similarity(tfidf)

        elif self.algorithm == "TF":
            data = [
                ','.join(str(v) for v in document) for document in documents
            ]
            sparse_matrix = self.countVect.fit_transform(data)
            doc_term_matrix = sparse_matrix.todense()
            tf = pd.DataFrame(doc_term_matrix,
                              columns=self.countVect.get_feature_names())
            similarityMatrix = cosine_similarity(tf)

        else:
            similarityMatrix = np.zeros((len(documents), len(documents)))
            for i, doc1 in enumerate(documents):
                for j, doc2 in enumerate(documents):
                    similarityMatrix[i][j] = self.jaccardSimilarity(doc1, doc2)

        return similarityMatrix

    def getReport(self):

        query = self.preprocess(self.queryData)
        src = self.preprocess(self.srcData)
        similarity = []

        for q in query:

            documents = [q] + src
            sim = self.getPlagMatrix(documents)[0][1:]
            similarity.append(sim)

        return similarity
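
jaccardSimilarity is intersection-over-union on the token sets; a standalone check of the metric on toy token lists:

def jaccard(query, document):
    q, d = set(query), set(document)
    return len(q & d) / len(q | d)

print(jaccard("the cat sat".split(), "the cat ran".split()))   # 0.5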
Example #34
 def __init__(self, config):
     self.config = config
     self.sentence_tokenizer = PunktSentenceTokenizer()
     self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()
Example #35
for f in listdir('corpus/'):
 	if f[-4:] == ".txt" and not f in skipOver:
 		fileName = f

 		F = open('corpus/'+f)
 		text = F.read()
 		F.close()

		alphanum = letters+octdigits

		paragraphs = [s for s in text.split("\n\n") if s != "" ][:-1]
		numParagraphs = len(paragraphs)

		# average paragraph size
		wst = WhitespaceTokenizer()
		paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]

		# the approximate number of words in the document
		numWords = sum(paraWordCounts)

		# the average number of words per paragraph
		avgParagraphLen = mean(paraWordCounts)

		# rejoin the paragraphs
		text = ' '.join(paragraphs)

 		# part of speech word list for the text
 		text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]

 		# remove symbols from list by checking the first character of the word
Example #36
    def __init__(self, name, config):
        """
        The init method downloads the required files, loads the file associated with a given subset (train/valid/test), 
        concatenates all sentences and tokenizes them using NLTK's WhitespaceTokenizer.

        :param name: Name of the component.

        :param class_type: Class type of the component.

        :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
        """
        # Call constructor of parent classes.
        Task.__init__(self, name, TranslationPairs, config) 

        # Set streams key mappings.
        self.key_sources = self.stream_keys["sources"]
        self.key_targets = self.stream_keys["targets"]

        # Get absolute path to data folder.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get dataset.
        if (self.config['dataset'] is None) or (self.config['dataset'] not in ["eng-fra", "eng-pol"]):
            raise ConfigurationError("Task supports only 'dataset' options: 'eng-fra', 'eng-pol'")
        dataset = self.config['dataset']

        # Get (sub)set: train/valid/test.
        if (self.config['subset'] is None) or (self.config['subset'] not in ['train', 'valid', 'test']):
            raise ConfigurationError("Task supports one 'subset' options: 'train', 'valid', 'test' ")
        subset = self.config['subset']

        # Extract source and target language name
        self.lang_source = self.config['dataset'].split('-')[0]
        self.lang_target = self.config['dataset'].split('-')[1]


        # Names of files used by this task.
        filenames = [
            self.lang_source + ".train.txt",
            self.lang_target + ".train.txt", 
            self.lang_source + ".valid.txt", 
            self.lang_target + ".valid.txt", 
            self.lang_source + ".test.txt", 
            self.lang_target + ".test.txt"
            ]

        # Initialize dataset if files do not exist.
        if not io.check_files_existence(os.path.join(self.data_folder, dataset), filenames):
            # Set url and source filename depending on dataset.
            url = "https://www.manythings.org/anki/" + self.lang_target + "-" + self.lang_source + ".zip"
            zipfile_name = "translate_" + self.lang_target + "_" + self.lang_source + ".zip"

            with tempfile.TemporaryDirectory() as tmpdirname:
                # Download and extract wikitext zip.
                io.download_extract_zip_file(self.logger, tmpdirname, url, zipfile_name)

                # Create train, valid, test files from the downloaded file
                lines = io.load_string_list_from_txt_file(tmpdirname, self.lang_target + ".txt")

                # Shuffle the lines
                random.seed(42)
                random.shuffle(lines)

                # Split english and french pairs
                lines_source = [self.normalizeString(l.split('\t')[0]) for l in lines]
                lines_target = [self.normalizeString(l.split('\t')[1]) for l in lines]

                # Cut dataset into train (90%), valid (5%), test (5%) files
                test_index = len(lines) // 20
                valid_index = test_index + (len(lines) // 20)

                os.makedirs(os.path.join(self.data_folder, dataset), exist_ok=True)
                
                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".test.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[0:test_index]))
                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".test.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[0:test_index]))

                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".valid.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[test_index:valid_index]))
                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".valid.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[test_index:valid_index]))

                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".train.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[valid_index:]))
                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".train.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[valid_index:]))

        else:
            self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder))


        # Load the lines
        lines_source = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_source + "."+subset+".txt")
        lines_target = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_target + "."+subset+".txt")

        # Get the required sample length.
        self.sentence_length = self.config['sentence_length']

        # Separate into src - tgt sentence pairs + tokenize
        tokenizer = WhitespaceTokenizer()
        self.sentences_source = []
        self.sentences_target = []
        for s_src, s_tgt in zip(lines_source, lines_target):
            src = tokenizer.tokenize(s_src)
            tgt = tokenizer.tokenize(s_tgt)
            # Keep only the pairs that are shorter or equal to the requested length
            # If self.sentence_length < 0, then give all the pairs regardless of length
            if (len(src) <= self.sentence_length and len(tgt) <= self.sentence_length) \
                or self.sentence_length < 0:
                self.sentences_source += [src]
                self.sentences_target += [tgt]

        self.logger.info("Load text consisting of {} sentences".format(len(self.sentences_source)))

        # Calculate the size of dataset.
        self.dataset_length = len(self.sentences_source)

        # Display exemplary sample.
        self.logger.info("Exemplary sample:\n  source: {}\n  target: {}".format(self.sentences_source[0], self.sentences_target[0]))
Example #37
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
Example #38
 def clear(self):
     self.tok_num = 0
     self.byte_idx = 0
     self.line_idx = 0
     self.word_tokenizer = WhitespaceTokenizer()
Example #39
class Prototype:
    """Prototype system that searches for RDF pattern (aka Q-Calculus pattern) to find textsnippets."""

    def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0):
        """Initialize a prototype with a specified configurations.

        Parameters:
        mongo_db -- Mongo DB connection
        postgre_db -- PostGre DB connection
        sentence_mode -- whether or not to use sentence window mode (default True)
        punctuation_mode -- whether or not to use punctuation mode (default False)
        window_size -- the size of the sentence or word window (default 0)
        """
        self.__mongo_db = mongo_db
        self.__postgre_db = postgre_db
        self.__sentence_mode = sentence_mode
        self.___punctuation_mode = punctuation_mode
        self.__window_size = window_size
        self.tokenizer = WhitespaceTokenizer()
        self.parser = Parser()

    def exit(self):
        """Close down the prototype."""
        self.__mongo_db.close_connection()
        self.__postgre_db.close_connection()

    def create_new_collection(self, schema_name):
        self.__postgre_db.create_schema(schema_name)

    def get_window_size(self):
        """Gets the current window size."""
        return self.__window_size

    def get_sentence_mode(self):
        """Returns True if sentence window mode is activated, else False."""
        return self.__sentence_mode

    def change_window_size(self, size):
        """Change the current window size to a new size."""
        value = 0
        try:
            value = int(size)
        except ValueError:
            raise ValueError("Please type in a valid number.")

        if value >= 0:
            self.__window_size = value
        else:
            raise ValueError("Please type in a valid positive number.")

    def activate_sentence_window_mode(self):
        """Activate sentence window mode."""
        self.__sentence_mode = True

    def activate_word_window_mode(self):
        """De-activate sentence window mode."""
        self.__sentence_mode = False

    def activate_punctuation_mode(self):
        self.___punctuation_mode = True

    def deactivate_punctuation_mode(self):
        self.___punctuation_mode = False

    def get_punctuation_mode(self):
        return self.___punctuation_mode

    def get_word_window(self, pattern, tokens, constraints):
        """Get a word window list with a specific number of words.

        Parameters:
        pattern -- the pattern to search for
        tokens -- the tokens to search in
        constraints -- a constraint tuple list
        """
        split_pattern = pattern.split()
        if len(split_pattern) > 1:
            textsnippets = self.__get_word_window_more_words_help(split_pattern, tokens, constraints)
        else:
            textsnippets = self.__get_word_window_one_word_help(pattern, tokens, constraints)
        return textsnippets

    def __get_word_window_more_words_help(self, split_pattern, tokens, constraints):
        """Find pattern with more than one word.
        """
        textsnippets = []
        textlength = len(tokens)
        for ind, token in enumerate(tokens):
            p_index = 0
            end_index = ind
            while p_index < len(split_pattern):
                if self.check_pattern(split_pattern[p_index], tokens[end_index]):
                    p_index += 1
                    end_index += 1
                else:
                    break
            if p_index == len(split_pattern):
                if constraints is not None:
                    self.__check_constraints(constraints, (ind, end_index - 1), ind, split_pattern, None, None, textsnippets, tokens)
                else:
                    pattern = " ".join(item for item in split_pattern)
                    self.__get_word_window_help((ind, end_index - 1), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_one_word_help(self, pattern, tokens, constraints):
        """Find pattern with only one word."""
        textsnippets = []
        textlength = len(tokens)
        for ind, token in enumerate(tokens):
            if self.check_pattern(pattern, token):
                if constraints is not None:
                    self.__check_constraints(constraints, (ind, ind), ind, pattern, None, None, textsnippets, tokens)
                else:
                    self.__get_word_window_help((ind, ind), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_help(self, token_pos, textsnippets, textlength, tokens, pattern):
        snippet = self.__get_textsnippets(token_pos[0], token_pos[1], textlength, tokens)
        offset_start = re.search(pattern, snippet).span()[0]
        offset_end = offset_start + (len(pattern) - 1)
        SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end'])
        textsnippets.append(SentObj(snippet=snippet, offset_start=offset_start, offset_end=offset_end))

    def __get_textsnippets(self, indl, indr, textlength, tokens):
        if (indl - self.__window_size < 0) and (indr + self.__window_size > textlength):
            left_index = self.__window_size - 1
            while not (indl - left_index) == 0:
                left_index -= 1
            right_index = self.__window_size - 1
            while not (indr + right_index) == textlength:
                right_index -= 1
            return " ".join(tokens[indl - left_index:indr + right_index])

        elif indr + self.__window_size > textlength:
            right_index = self.__window_size - 1
            while not (indr + right_index) == textlength:
                right_index -= 1
            return " ".join(tokens[indl - self.__window_size:indr + right_index])

        elif indl - self.__window_size < 0:
            left_index = self.__window_size - 1
            while not (indl - left_index) == 0:
                left_index -= 1
            return " ".join(tokens[indl - left_index:indr + self.__window_size + 1])
        else:
            return " ".join(tokens[indl - self.__window_size:indr + (self.__window_size + 1)])

    def get_sentence_window(self, pattern, sentences, constraints):
        """Get a list with a specific number of sentences. size 0 will return the
        current sentence the pattern is found in. size n will return n sentences left and right
        from the initial sentence.

        Parameters:
        pattern -- the pattern to search for
        sentences -- the sentences to search in
        constraints -- the constraint tuple list
        """
        split_pattern = pattern.split()

        if len(split_pattern) > 1:
            textsnippets = self.__get_sentence_window_more_words(split_pattern, sentences, constraints)
        else:
            textsnippets = self.__get_sentence_window_one_word(pattern, sentences, constraints)
        return textsnippets

    def __get_sentence_window_one_word(self, pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of only one words according to window size."""
        textsnippets = []
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            for i, token in enumerate(tokens):
                if self.check_pattern(pattern, token):
                    if constraints is not None:
                        self.__check_constraints(constraints, (i, i), ind, pattern, sent, sentences, textsnippets, tokens)
                    else:
                        self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __check_constraints(self, constraints, token_pos, sent_num, pattern, sent, sentences, textsnippets, tokens):
        """Traverse the given list of constraints and find target words near the keyword. The number of word distance
        is given in the constraint list.
        add_info[0] is the keyword aka pattern.
        add_info[1] is the target_word aka the constraint.
        add_info[2] is the word distance from constraint to the pattern."""
        pos = 0
        more_words_flag = False
        if token_pos[0] == token_pos[1]:
            pos = token_pos[0]
        else:
            more_words_flag = True

        for add_info in constraints:
            # find pattern that matches target word
            index = add_info[2]
            found_constraint_flag = True
            if more_words_flag:
                constraint = add_info[0].split()
                i = 0
                while found_constraint_flag and i < len(pattern) and i < len(constraint):
                    if not self.check_pattern(pattern[i], constraint[i]):
                        found_constraint_flag = False
                        break
                    i += 1

            if found_constraint_flag or self.check_pattern(pattern, add_info[0]):
                # set token_pos depending if index is positive or negative
                if more_words_flag and index > 0:
                    pos = token_pos[1]
                elif more_words_flag and index < 0:
                    pos = token_pos[0]

                if self.__sentence_mode:
                    if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                        self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern)
                    else:
                        while index != 0:
                            if index > 0:
                                index -= 1
                            else:
                                index += 1
                            if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                                self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern)
                                break
                else:
                    if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                        self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern)
                    else:
                        while index != 0:
                            if index > 0:
                                index -= 1
                            else:
                                index += 1
                            if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                                self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern)
                                break

    def __get_sentence_window_help(self, ind, sentences, textsnippets, pattern):
        sentence = self.__get_sentences(ind, sentences)
        # get offsets
        offset_start = re.search(pattern, sentence).span()[0]
        offset_end = offset_start + (len(pattern) - 1)
        SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end'])
        textsnippets.append(SentObj(snippet=sentence, offset_start=offset_start, offset_end=offset_end))

    def __get_sentence_window_more_words(self, split_pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of more than 2 words according to window size."""
        textsnippets = []
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            p_index = 0
            begin_index = 0
            end_index = 0
            while p_index < len(split_pattern):
                if (end_index < len(tokens)) and self.check_pattern(split_pattern[p_index], tokens[end_index]):
                    if p_index == 0:
                        begin_index = end_index
                    p_index += 1
                    end_index += 1
                else:
                    break
            end_index -= 1
            if p_index == len(split_pattern):
                # search for constraints in sentence
                if constraints is not None:
                    self.__check_constraints(constraints, (begin_index, end_index), ind, split_pattern, sent, sentences,
                                             textsnippets, tokens)
                else:
                    pattern = " ".join(item for item in split_pattern)
                    self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __get_sentences(self, ind, sentences):
        if self.__window_size == 0:
            return sentences[ind]

        elif self.__window_size > 0:
            left_window_border = ind - self.__window_size
            right_window_border = ind + self.__window_size + 1
            if left_window_border < 0:
                left_window_border = 0
            if right_window_border >= len(sentences):
                right_window_border = len(sentences)
            return " ".join(sentences[left_window_border:right_window_border])

    def find_text_window(self, schema, text, text_id, constraints=None):
        """Finds text windows with variable size and pushes the found results in the PostGre database.

        Parameters:
        text -- text to search in
        text_id -- id of the text
        constraints -- the constraint tuple list"""

        # quick and dirty fix: replace unusual quotation marks with plain double quotes
        for ch in ['›', '‹', '»', '«']:
            if ch in text:
                text = text.replace(ch, '"')

        tokenized_text = self.tokenizer.tokenize(text)
        if self.___punctuation_mode:
            punctuation_text = re.split('[!?.,;:]', text)
            punctuation_text = [item for item in punctuation_text if item != '']
        for pattern in self.__postgre_db.get_data_from_table(schema, "single_pattern"):
            if self.___punctuation_mode and self.__sentence_mode:
                windows_objects = self.get_sentence_window(
                    pattern['single_pattern'], punctuation_text, constraints)
            elif self.__sentence_mode:
                windows_objects = self.get_sentence_window(
                    pattern['single_pattern'], sent_tokenize(text, language='german'), constraints)
            else:
                windows_objects = self.get_word_window(pattern['single_pattern'], tokenized_text, constraints)

            # push found snippets onto database
            if len(windows_objects) > 0:
                single_pattern_id = pattern['id']
                for sent_obj in windows_objects:
                    # push snippets
                    self.__push_snippets(schema, sent_obj.snippet)
                    snippet_id = self.__postgre_db.get_id(schema,"snippets", "snippet=" + add_quotes(
                        replace_special_characters(sent_obj.snippet)))
                    # push relations
                    self.__push_texts_snippets(schema, text_id, snippet_id)
                    self.__push_snippet_offsets(schema,
                        single_pattern_id, snippet_id, sent_obj.offset_start, sent_obj.offset_end)

    def __push_snippets(self, schema, snippet):
        """Push found snippets onto the snippets table in PostGre DB, if not already in the table.
        Afterwards push the single_pattern and snippets relation."""
        if not self.__postgre_db.is_in_table(schema, "snippets", "snippet=" + add_quotes(
                replace_special_characters(snippet))):
            self.__postgre_db.insert(schema,"snippets", {"snippet": snippet})

    def __push_texts_snippets(self, schema, text_id, snippet_id):
        """Get all saved snippets that occur in a text and push them onto PostGre DB."""
        self.__push_relation(schema, text_id, snippet_id, "text_id", "snippet_id", "texts_snippets")

    def __push_snippet_offsets(self, schema, single_pattern_id, snippet_id, offset_start, offset_end):
        """Push found single_pattern in snippets and their respective offset."""
        if not self.__postgre_db.is_in_table(
                schema, "snippet_offsets", "single_pattern_id=" + str(single_pattern_id) + " and snippet_id=" + str(
                    snippet_id)):
            self.__postgre_db.insert(schema, "snippet_offsets", {
                "single_pattern_id": single_pattern_id, "snippet_id": snippet_id, "offsets": [
                    [offset_start, offset_end]]})
        else:
            old_list = self.__postgre_db.get(schema, "snippet_offsets", "single_pattern_id=" + str(
                single_pattern_id) + " and snippet_id=" + str(snippet_id), "offsets")
            old_list.append([offset_start, offset_end])
            pid = self.__postgre_db.get_id(schema, "snippet_offsets", "single_pattern_id=" + str(
                single_pattern_id) + " and snippet_id=" + str(snippet_id))
            self.__postgre_db.update(schema, "snippet_offsets", "offsets=" + add_quotes(replace_brackets(str(
                old_list))), "id=" + str(pid))

    def __push_relation(self, schema, id1, id2, id1_name, id2_name, table):
        """Push a relation onto the PostGre DB. The relation has to have a primary key."""
        # case: No entry about relation is in DB yet
        if not self.__postgre_db.is_in_table(schema, table, id1_name + "=" + str(
                id1)):
            self.__postgre_db.insert(schema, table, {
                id1_name: id1, id2_name: [id2], "aggregation": 0})

        # case: Entry about single_pattern is in DB
        else:
            old_list = self.__postgre_db.get(schema, table, id1_name + "=" + str(
                id1), id2_name)
            new_list = list(set(old_list + [id2]))
            self.__postgre_db.update(schema, table, id2_name + "=" + add_quotes(replace_brackets(str(
                new_list))), id1_name + "=" + str(id1))

    def __push_aggregation_lowest_layer(self, schema, aggregation_object, aggregation_name, table, id_name):
        """Push the aggregated snippet numbers onto corresponding the lower layer tables."""
        for aggregation in aggregation_object:
            id = aggregation[aggregation_name][0]
            aggregation_value = aggregation[aggregation_name][1]
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation_value), id_name + "=" + str(id))

    def __push_aggregation(self, schema, table, sub_table, table_id, sub_table_id):
        """Calculate and push aggregation on the rest layer tables."""
        table_entries = self.__postgre_db.get_data_from_table(schema, table)
        for entry in table_entries:
            aggregation = 0
            entry_id = entry[table_id]
            entries_to_look_up = entry[sub_table_id]

            for look_up in entries_to_look_up:
                query = "SELECT SUM(aggregation) FROM " + schema + "." + sub_table + " WHERE " + sub_table_id + "=" + str(look_up)
                stored_value = self.__postgre_db.query(query)[0]['sum']
                if stored_value is None:
                    stored_value = 0
                aggregation += stored_value
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation), table_id + "=" + str(entry_id))

    def get_snippets(self, schema, constraints):
        """Get snippets for the whole corpus.

        Parameter:
        constraints -- the constraint tuple list"""
        for ind, text in enumerate(self.__mongo_db.get(schema, {})):
            self.__postgre_db.insert(schema, "texts", {"title": text['title']})
            self.find_text_window(schema, text['text'], text['id'], constraints)
            print("Finished extracting snippets from chapter " + str(text['id']) + ".")

    def aggregation(self, schema):
        """Calculate aggregation bottom-up and store the interim data onto the database."""
        aggregation_texts_snippets = self.__postgre_db.query("SELECT " + schema + ".aggregate_texts_snippets()")
        aggregation_snippet_offsets = self.__postgre_db.query("SELECT " + schema + ".aggregate_snippet_offsets()")

        # push 2 lowest levels of the hierarchy
        self.__push_aggregation_lowest_layer(schema,
            aggregation_texts_snippets, str('aggregate_texts_snippets'), "texts_snippets", "text_id")
        self.__push_aggregation_lowest_layer(schema,
            aggregation_snippet_offsets, str('aggregate_snippet_offsets'), "snippet_offsets", "id")

        # push rest of the hierarchy
        self.__push_aggregation(schema,
            "pattern_single_pattern", "snippet_offsets", str('pattern_id'), str('single_pattern_id'))
        self.__push_aggregation(schema, "has_object", "pattern_single_pattern", str('bscale_id'), str('pattern_id'))
        self.__push_aggregation(schema, "has_attribute", "has_object", str('bsort_id'), str('bscale_id'))

    def aggregate_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args)
        if pattern_info is not None:
            pattern_ids = pattern_info[0]
            new_bscale_id = pattern_info[1]
            new_pattern_list = list(set.union(*[set(item) for item in pattern_ids]))
            aggregation = 0
            for item in new_pattern_list:
                aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation")
            self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation})

    def intersect_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args)
        if pattern_info is not None:
            pattern_ids = pattern_info[0]
            new_bscale_id = pattern_info[1]
            new_pattern_list = list(set.intersection(*[set(item) for item in pattern_ids]))
            aggregation = 0
            for item in new_pattern_list:
                aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation")
            self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation})

    def __add_new_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        if args is not None:
            bscale_table = self.__postgre_db.get_data_from_table(schema, "bscale")
            bscale_ids = []
            for scale in args:
                scale_found = False
                for bscale in bscale_table:
                    if scale == bscale['bscale']:
                        bscale_ids.append(bscale['id'])
                        scale_found = True
                if not scale_found:
                    raise Exception("Chosen Bscale does not exist.")
            if not self.__postgre_db.is_in_table(schema, "bscale", "bscale=" + add_quotes(new_bscale)):
                self.__postgre_db.insert(schema, "bscale", {"bscale": new_bscale, "nominal": False, "ordinal": False, "interval": False})
            new_bscale_id = self.__postgre_db.get_id(schema, "bscale", "bscale=" + add_quotes(new_bscale))
            self.__postgre_db.update(schema, "bscale", scale_type + "=" + add_quotes('True'), "id=" + str(new_bscale_id))
            bsort_id = self.__postgre_db.get_id(schema, "bsort", "bsort=" + add_quotes(bsort))
            if self.__postgre_db.is_in_table(schema, "has_attribute", "bsort_id=" + str(bsort_id)):
                old_list = self.__postgre_db.get(schema, "has_attribute", "bsort_id=" + str(bsort_id), "bscale_id")
                old_list.append(new_bscale_id)
                self.__postgre_db.update(schema, "has_attribute", "bscale_id=" + add_quotes(
                    replace_brackets(str(old_list))), "bsort_id=" + str(bsort_id))
            else:
                self.__postgre_db.insert(schema, "has_attribute",
                                         {"bsort_id": bsort_id, "bscale_id": [new_bscale_id], "aggregation": 0})

            scale_obj = self.__postgre_db.get_data_from_table(schema, "has_object")
            pattern_ids = []
            for scale_id in bscale_ids:
                for item in scale_obj:
                    if scale_id == item['bscale_id']:
                        pattern_ids.append(item['pattern_id'])

            return (pattern_ids, new_bscale_id)

    def find_correlating_pattern(self, schema):
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        all_bscales_table = self.__postgre_db.get_data_from_table(schema, "bscale")
        all_bscales = [bscale['id'] for bscale in all_bscales_table]

        for bscale_id in all_bscales:
            pattern_list = self.__postgre_db.get(schema, "has_object", "bscale_id=" + str(bscale_id), "pattern_id")
            for pattern_id in pattern_list:
                single_pattern_id_list = self.__postgre_db.get(
                    schema, "pattern_single_pattern", "pattern_id=" + str(pattern_id), "single_pattern_id")
                for single_pattern_id in single_pattern_id_list:
                    single_pattern = self.__postgre_db.get(schema, "single_pattern", "id=" + str(single_pattern_id), "single_pattern")
                    self.__postgre_db.insert(schema, "bscale_single_pattern", {"bscale_id": bscale_id, "single_pattern_id": single_pattern_id, "single_pattern": single_pattern , "count": 0})
        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            correlating_pattern = self.parser.get_correlating_nouns_and_adjectives(snippet)
            for ind, item in enumerate(correlating_pattern):
                if self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                 "single_pattern=" + add_quotes(item)):
                    pattern_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(item)), "single_pattern_id")
                    index = ind + 1
                    while index < len(correlating_pattern):
                        next_item = correlating_pattern[index]
                        if self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                 "single_pattern=" + add_quotes(next_item)):
                            pattern_next_item_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(next_item)), "single_pattern_id")
                            if pattern_id != pattern_next_item_id:
                                first_combination_in_table = self.__postgre_db.is_in_table(
                                        schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id))
                                second_combination_in_table = self.__postgre_db.is_in_table(
                                    schema, "correlating_pattern",
                                    "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(pattern_id))

                                # update entry if already exists in table correlating_pattern
                                if first_combination_in_table:
                                    old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id), "count")
                                    new_count = old_count + 1
                                    self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count), "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id))
                                elif second_combination_in_table:
                                    old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str(
                                        pattern_next_item_id) + " and pattern_b=" + str(pattern_id), "count")
                                    new_count = old_count + 1
                                    self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count),
                                                             "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(
                                                                 pattern_id))
                                else:
                                    # create new entry for pattern pair if none exists
                                    self.__postgre_db.insert(schema, "correlating_pattern", {
                                        "pattern_a": pattern_id, "pattern_b": pattern_next_item_id, "count": 1})
                        index += 1

    def find_spo_and_adjectives(self, schema):
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            spo = self.parser.get_SVO(snippet)
            for item in spo:
                if item is not None:
                    # subject is pattern
                    if item.subject != "'":
                        if self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.subject)):
                            self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(
                                schema, item.subject, item.verb, "subject_verb_occ", "subject", "verb")
                            if item.object != '':
                                self.push_parser_items(schema, item.object, "object_occ", "object")
                                self.push_parser_item_relationship(schema,
                                        item.subject, item.object, "subject_object_occ", "subject", "object")
                        #object is pattern
                        elif self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.object)):
                            self.push_parser_items(schema, item.object, "object_occ", "object")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(schema,
                                                               item.object, item.verb, "object_verb_occ", "object", "verb")
                            if item.subject != '':
                                self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                                self.push_parser_item_relationship(schema,
                                        item.subject, item.object, "subject_object_occ", "subject", "object")

            noun_adjectives = self.parser.nouns_adj_spacy(snippet)
            for item in noun_adjectives:
                subject = item['noun']
                adjective = item['adj']
                if self.__postgre_db.is_in_table(
                        schema, "single_pattern", "single_pattern=" + add_quotes(item['noun'])):
                    self.push_parser_items(schema, subject, "subject_occ", "subject")
                    self.push_parser_items(schema, adjective, "adjective_occ", "adjective")
                    self.push_parser_item_relationship(
                        schema, subject, adjective, "subject_adjective_occ", "subject", "adjective")

    def push_parser_items(self, schema, word, table, word_type):
        if not self.__postgre_db.is_in_table(schema, table, word_type + "=" + add_quotes(word)):
            self.__postgre_db.insert(schema, table, {word_type: word, "count": 0})

    def push_parser_item_relationship(self, schema, word1, word2, table, word_type1, word_type2):
        word1_id = self.__postgre_db.get_id(schema, word_type1 + "_occ", word_type1 + "=" + add_quotes(word1))
        word2_id = self.__postgre_db.get_id(schema, word_type2 + "_occ", word_type2 + "=" + add_quotes(word2))

        if not self.__postgre_db.is_in_table(schema, table, word_type1 + "=" + str(
                word1_id) + " and " + word_type2 + "=" + str(word2_id)):
            self.__postgre_db.insert(schema, table, {word_type1: word1_id, word_type2: word2_id, "count": 1})
        else:
            table_id = self.__postgre_db.get_id(schema, table, word_type1 + "=" + str(word1_id) + " and " + word_type2 + "=" + str(word2_id))
            old_count = self.__postgre_db.get(schema, table, "id=" + str(table_id), "count")
            self.__postgre_db.update(schema, table, "count=" + str(old_count + 1), "id=" + str(table_id))

    def aggregate_occurences_help(self, text_counter, word):
        count = text_counter[word]
        if count == 0:
            return 1
        else:
            return count

    def calculate_pmi(self, schema):
        print("Calculating PMI for " + schema)
        corpus_count = 0
        for item in self.__mongo_db.get(schema, {}):
            corpus_count += len(word_tokenize(item['text']))
        print(corpus_count)
        print("Lemmatizing corpus.")
        lemmatized_text = []
        for ind, text in enumerate(self.__mongo_db.get(schema, {})):
            doc = text['text']
            for ch in ['›', '‹', '»', '«']:
                if ch in doc:
                    doc = doc.replace(ch, '"')
            lemmatized_text += self.parser.lemmatize_chunk(doc)
            print("Part " + str(ind) + " lemmatized.")
        self.aggregate_occurences(schema, "subject", lemmatized_text)
        self.aggregate_occurences(schema, "object", lemmatized_text)
        self.aggregate_occurences(schema, "adjective", lemmatized_text)
        self.aggregate_occurences(schema, "verb", lemmatized_text)
        print("Finished aggregating occurences.")
        self.calculate_pmi_helper(schema, corpus_count, "subject_adjective_occ", "subject", "adjective")
        self.calculate_pmi_helper(schema, corpus_count, "subject_verb_occ", "subject", "verb")
        self.calculate_pmi_helper(schema, corpus_count, "subject_object_occ", "subject", "object")
        self.calculate_pmi_helper(schema, corpus_count, "object_verb_occ", "object", "verb")

    def aggregate_occurences(self, schema, word_table, lemmatized_text):
        table = self.__postgre_db.get_data_from_table(schema, word_table + "_occ")
        for item in table:
            word = item[word_table]
            split_word = word.split(" ")
            length = len(split_word)
            if length > 1:
                if length == 2:
                    counter = list(bigrams(lemmatized_text))
                    word_tuple = (split_word[0], split_word[1])
                elif length == 3:
                    counter = list(trigrams(lemmatized_text))
                    word_tuple = (split_word[0], split_word[1], split_word[2])
                else:
                    counter = []
                count = counter.count(word_tuple)
            else:
                word = item[word_table]
                count = self.aggregate_occurences_help(Counter(lemmatized_text), word)
            print(word, str(count))
            self.__postgre_db.update(schema, word_table + "_occ", "count=" + str(count), "id=" + str(item['id']))

    def calculate_pmi_helper(self, schema, corpus_count, co_occurence, word1, word2):
        co_occ_table = self.__postgre_db.get_data_from_table(schema, co_occurence)
        for item in co_occ_table:
            item_id = item['id']
            co_occ_freq = float(item['count'] / corpus_count)
            word1_id = item[word1]
            word2_id = item[word2]
            word1_occ = self.__postgre_db.get(schema, word1 + "_occ", "id=" + str(word1_id), "count")
            word2_occ = self.__postgre_db.get(schema, word2 + "_occ", "id=" + str(word2_id), "count")
            pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
            self.__postgre_db.update(schema, co_occurence, "pmi=" + str(pmi), "id=" + str(item_id))

    def calculate_pmi_use_case2(self, schema):
        print("Calculating PMI for " + schema)
        corpus_count = 0
        text = []
        for item in self.__mongo_db.get(schema, {}):
            text += word_tokenize(item['text'], language='german')
            corpus_count += len(word_tokenize(item['text'], language='german'))
        print(corpus_count)
        counter = Counter(text)
        single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern")
        # counting single pattern occurrences
        for item in single_pattern_table:
            word = item['single_pattern']
            count = counter[word]
            self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word))

        # pmi calculation
        co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern")
        for item in co_occ_table:
            item_id = item['id']
            co_occ_freq = float(item['count'] / corpus_count)
            word1_id = item['pattern_a']
            word2_id = item['pattern_b']
            word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count")
            print(word1_occ)
            word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count")
            print(word2_occ)
            pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
            print(pmi)
            self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))

    def get_results_use_case2(self, schema):
        print("Colour + Nature")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 2 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Colour + Location")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Colour + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Nature + Location")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Nature + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Location + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 3 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))

    def check_pattern(self, pattern, token):
        """Strip token and check if the token matches the defined pattern.

        Parameter:
        pattern -- the pattern to search for
        token -- the token to match with the pattern
        """
        split_token = re.split(r'\W+', token)
        if split_token[0] == '':
            split_token = split_token[1]
        else:
            split_token = split_token[0]
        return split_token == pattern

    def get_result(self, schema):
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_verb_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.object_verb_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_object_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_adjective_occ SV"""))
        pprint(self.__postgre_db.query("""SELECT S.subject, V.verb, SV.pmi FROM """ + schema + """.subject_verb_occ SV, """ + schema + """.subject_occ S, """ + schema + """.verb_occ V WHERE SV.subject = S.id AND SV.verb = V.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT O.object, V.verb, OV.pmi FROM """ + schema + """.object_verb_occ OV, """ + schema + """.object_occ O, """ + schema + """.verb_occ V WHERE OV.object = O.id AND OV.verb = V.id ORDER BY object DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT O.object, S.subject, SO.pmi FROM """ + schema + """.subject_object_occ SO, """ + schema + """.subject_occ S, """ + schema + """.object_occ O WHERE SO.object = O.id AND SO.subject = S.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT S.subject, A.adjective, SA.pmi FROM """ + schema + """.subject_adjective_occ SA, """ + schema + """.subject_occ S, """ + schema + """.adjective_occ A WHERE SA.subject = S.id AND SA.adjective = A.id ORDER BY subject DESC, pmi DESC"""))
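The PMI computed in calculate_pmi_helper and calculate_pmi_use_case2 above is plain pointwise mutual information over relative corpus frequencies. A minimal standalone sketch of the same formula, using hypothetical counts instead of values read from the PostGre tables:

from math import log2

def pmi(co_occurrence_count, word1_count, word2_count, corpus_count):
    # PMI = log2( p(w1, w2) / (p(w1) * p(w2)) ), probabilities estimated as
    # raw counts divided by the corpus size, mirroring calculate_pmi_helper
    p_joint = co_occurrence_count / corpus_count
    p_w1 = word1_count / corpus_count
    p_w2 = word2_count / corpus_count
    return log2(p_joint / (p_w1 * p_w2))

# hypothetical counts: pair seen 4 times, the words 40 and 50 times, in a corpus of 2000 tokens
print(pmi(4, 40, 50, 2000))  # 2.0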
Example #40
0
def predict(**kwargs):
    import pandas as pd
    import numpy as np
    from model_bridging.helpers import (tokenize_pd_code, get_date_diff,
                                        add_emp_tenure_to_df,
                                        get_bucket_and_key_from_s3_uri,
                                        download_model_from_s3)
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import WhitespaceTokenizer
    import json
    import nltk
    nltk.download("stopwords")

    def stem_text(text):
        return " ".join([stemmer.stem(w) for w in w_tokenizer.tokenize(text)])

    COLS_REQD = [
        "ClaimNumber",
        "PrimaryDiagnosisCode",
        "SICCode",
        "InsuredGender",
        "InsuredSalaryIndicator",
        "DOTPrimaryExertionLevel",
        "CaseSize",
        "PrimaryDiagnosisDecription",
        "PrimaryDiagnosisCategory",
        "InsuredAgeatLoss",
        "InsuredAnnualizedSalary",
        "InsuredHireDate",
        "ReceivedDate",
        "LossDate",
    ]
    CATEGORICAL = [
        "pd_code_1",
        "pd_code_2",
        "SIC_category",
        "InsuredGender",
        "InsuredSalaryIndicator",
        "DOTPrimaryExertionLevel",
        "CaseSize",
        "PrimaryDiagnosisCategory",
    ]
    VALID_CLAIM_STATUS_DESC = ["Benefit Case Under Review"]

    for artifact in kwargs.get("artifact"):
        if artifact.get("dataName") == "combined_artifacts":
            model_bucket, model_key = get_bucket_and_key_from_s3_uri(
                artifact.get("dataValue"))

    artifacts = download_model_from_s3(model_bucket, model_key)

    # Unpacking artifacts from the joblib object
    model = artifacts.get("model")
    tfidf_model = artifacts.get("tfidf_model")
    categorical_grouper = artifacts.get("categorical_grouper")
    train_template = artifacts.get("train_template")

    input_data = pd.DataFrame([kwargs.get("inputs").get("claim")])

    date_cols = [x for x in input_data.columns if "date" in x.lower()]
    for col in date_cols:
        input_data.loc[:, col] = pd.to_datetime(input_data[col],
                                                errors="coerce")

    prediction_df = input_data[input_data["ClaimStatusDescription"].isin(
        VALID_CLAIM_STATUS_DESC)].copy()
    prediction_df = prediction_df.loc[~(
        (prediction_df["ClaimStatusDescription"] == "Benefit Case Under Review"
         )
        & (prediction_df["ClaimStatusCode"] == "Closed")), :, ].copy()
    pred_features = prediction_df[COLS_REQD].copy().drop_duplicates()

    # tabular data preprocessing part 1
    # Extract first 2 characters from SIC code
    pred_features.loc[:, "SIC_category"] = (
        pred_features["SICCode"].astype(str).str[:2])  # sic category feature
    # split primary diagnosis code into two sub-codes
    pred_features = tokenize_pd_code(pred_features)  # features from PD code
    # calculate employment tenure feature
    pred_features = add_emp_tenure_to_df(pred_features)  # emp tenure feature
    # string salary range to number conversion
    pred_features.loc[:, "InsuredAnnualizedSalary"] = [
        (float(op[0]) + float(op[1])) / 2
        for op in pred_features["InsuredAnnualizedSalary"].fillna(
            "0-0").str.split("-")
    ]  # salary feature
    # pivot operation around Claim Number and Approval date to get sequential info in single row
    prediction_df["approval_date_rank"] = (
        prediction_df.groupby("ClaimNumber")["ApprovalDate"].rank(
            ascending=True).fillna(-1).astype(int))
    # get the values from the earliest snapshot
    pivot_df = prediction_df.loc[
        prediction_df.groupby("ClaimNumber").approval_date_rank.idxmin(),
        ["ClaimNumber", "BenefitCaseType", "DurationDate"], ].copy()
    pivot_df.rename({"DurationDate": "first_duration_date"},
                    axis=1,
                    inplace=True)
    pivot_df = pivot_df.loc[pivot_df["BenefitCaseType"] == "STD"]
    # extract features for prediction
    pred_features = pivot_df.merge(pred_features,
                                   how="inner",
                                   on="ClaimNumber")
    # initial prognosis days feature
    pred_features.loc[:, "initial_prognosis_days"] = get_date_diff(
        pred_features["LossDate"], pred_features["first_duration_date"], "D")
    pred_features.loc[pred_features["initial_prognosis_days"] <= 0,
                      "initial_prognosis_days"] = np.nan

    # text preprocessing pipeline for extracting features from
    # Primary Diagnosis Desc feature
    # initialize tokenizer, stemmer and stopwords from NLTK
    w_tokenizer = WhitespaceTokenizer()
    # lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer(language="english")
    stop = stopwords.words("english")

    # stop word removal and clean up
    pred_features.loc[:, "PrimaryDiagnosisDecription"] = (
        pred_features["PrimaryDiagnosisDecription"].fillna("_na_").apply(
            lambda x: " ".join(
                [word for word in x.split(" ") if word not in (stop)])))
    pred_features.loc[:, "PrimaryDiagnosisDecription"] = pred_features[
        "PrimaryDiagnosisDecription"].str.replace("[^\w\s]", "")
    # stemming the cleaned text
    pred_features.loc[:, "pd_desc_stemmed"] = pred_features[
        "PrimaryDiagnosisDecription"].apply(stem_text)
    # feature extraction from tf-idf vectorizer
    vocab = tfidf_model.get_feature_names()
    pred_desc_feat = tfidf_model.transform(
        pred_features.loc[:, "pd_desc_stemmed"]).toarray()
    pred_desc_feat = pd.DataFrame(pred_desc_feat, columns=vocab)

    # adding text features to the tabular data
    x_pred = pd.concat([pred_features, pred_desc_feat], axis=1)
    # preserving training dataset feature ordering
    x_pred_sub = x_pred[train_template.columns].copy()
    x_pred_sub[CATEGORICAL] = x_pred_sub[CATEGORICAL].copy().astype(str)
    x_pred_sub[CATEGORICAL] = categorical_grouper.transform(
        x_pred_sub[CATEGORICAL].copy(), CATEGORICAL)
    pred_features.loc[:, "predicted_probability"] = model.predict_proba(
        x_pred_sub)[:, 1]
    pred_features.loc[:, "predicted_bridged_ind"] = model.predict(x_pred_sub)

    pred_payload = pred_features[[
        "ClaimNumber", "predicted_probability", "predicted_bridged_ind"
    ]]
    payload_json = json.loads(pred_payload.to_json(orient="records"))[0]
    claim_number = payload_json["ClaimNumber"]
    return [{
        "inputDataSource":
        f"{claim_number}:0",
        "entityId":
        claim_number,
        "predictedResult": [{
            'claimNumber':
            claim_number,
            'predictedProbability':
            payload_json['predicted_probability'],
            'predicted_bridged_ind':
            payload_json['predicted_bridged_ind']
        }]
    }]
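The text branch of the pipeline above (stop-word removal, punctuation stripping, whitespace tokenization, Snowball stemming) can be exercised on its own. A rough standalone sketch of that preprocessing with a made-up description string; the column names and fitted models of the example are not needed here:

import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

# assumes the NLTK stopword list has been downloaded via nltk.download("stopwords")
w_tokenizer = WhitespaceTokenizer()
stemmer = SnowballStemmer(language="english")
stop = set(stopwords.words("english"))

def preprocess_description(text):
    # drop stop words, strip punctuation, then stem each whitespace-separated token
    text = " ".join(word for word in text.split(" ") if word not in stop)
    text = re.sub(r"[^\w\s]", "", text)
    return " ".join(stemmer.stem(w) for w in w_tokenizer.tokenize(text))

print(preprocess_description("fracture of the left wrist, displaced"))
# -> 'fractur left wrist displac'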
Example #41
0
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()

def read(file_name):
	try:
		f_in = '%s.txt' % file_name
		file_in = open(f_in, 'r')
		f_out = '%s.csv' % file_name
		file_out = open(f_out, 'wb')
	except Exception, e:
		raise e

	data = ', '.join( [TOKENIZER.tokenize(line)[1] for line in file_in] )

	try:
		file_out.write(data)
	except Exception, e:
		raise e

#read()

if __name__ == "__main__":
    # Command line arguments
    import argparse
    parser = argparse.ArgumentParser(
        description='Converts a two-column, space-separated file into a CSV containing the second column'
    )
    parser.add_argument('file', help='The file to convert')
    args = parser.parse_args()
    read(args.file)
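Example #41 above is Python 2 code ('except Exception, e', a str written to a handle opened in 'wb' mode). A roughly equivalent Python 3 sketch using the csv module; the base file name is hypothetical and the input is assumed to have at least two whitespace-separated columns per line:

import csv
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()

def convert(file_name):
    # read <file_name>.txt, keep the second column of each line,
    # and write all collected values as a single CSV row to <file_name>.csv
    with open(file_name + ".txt", "r") as file_in, \
         open(file_name + ".csv", "w", newline="") as file_out:
        second_column = [TOKENIZER.tokenize(line)[1] for line in file_in]
        csv.writer(file_out).writerow(second_column)

convert("data")  # expects a two-column data.txt next to the script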
Example #42
0
import re
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams, trigrams
import math
from collections import Counter
import time
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer
tokenizer = WhitespaceTokenizer()
templates = [
	SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)),
	SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)),
	SymmetricProximateTokensTemplate(ProximateTagsRule, (1,2)),
	SymmetricProximateTokensTemplate(ProximateTagsRule, (1,3)),
	SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
	SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
	SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
	SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
	ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
	ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),	]

tagged_sentences = []
tokenizer = WhitespaceTokenizer()
with open("datascience_6.txt","r") as openfile:
Example #43
0
def splitToWords(text):
    return WhitespaceTokenizer().tokenize(text)
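For reference, WhitespaceTokenizer only splits on runs of whitespace and leaves punctuation attached to the tokens, unlike word_tokenize. A quick comparison, with the expected output shown as comments (word_tokenize additionally needs the punkt model, e.g. via nltk.download('punkt')):

from nltk.tokenize import WhitespaceTokenizer, word_tokenize

text = "Good muffins cost $3.88\nin New York."
print(WhitespaceTokenizer().tokenize(text))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']
print(word_tokenize(text))
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']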
Example #44
0



#Undirected Graph
#Boolean CoOccurrenceCounter 
with open("datascience545.txt","r") as openfile:
	Stopwords = nltk.corpus.stopwords.words('english')
	pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
	WordGramWordGram= Counter()
	WordGramBiGram = Counter()
	WordGramTriGram = Counter()
	BiGramBiGram = Counter()
	BiGramTriGram = Counter()
	TriGramTriGram = Counter()
	tokenizer = WhitespaceTokenizer()
	for line in openfile:
		words = line.lower().strip().replace('(',',').replace(')',',')
		words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\.|\>|\?|\/|\;|\:|\"|\'', '',words)
		words = pattern.sub('', words)
		words=words.split('\r')
		words = [s.lstrip() for s in words]
		ReservoirALL={}
		for word in words:
			CountWordGrams = Counter()
			CountBiGrams = Counter()
			CountTriGrams = Counter()
			
			wordsplit= tokenizer.tokenize(word)
			wordsplit = [s.lstrip() for s in wordsplit]
			NoDupes = list(set(wordsplit))
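The fragment above stops before the actual counting; the 'Boolean CoOccurrenceCounter' comment suggests that every line contributes at most one co-occurrence per pair of distinct words. A rough sketch of that idea, ignoring the separate word/bigram/trigram counters of the original:

from collections import Counter
from itertools import combinations
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
pair_counts = Counter()

lines = ["data science is fun", "science needs data", "fun with data science"]
for line in lines:
    # deduplicate tokens per line so each unordered pair is counted at most once per line
    unique_words = sorted(set(tokenizer.tokenize(line.lower())))
    pair_counts.update(combinations(unique_words, 2))

print(pair_counts[("data", "science")])  # 3: the pair co-occurs in all three lines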
Example #45
0
def tokenize(text):
	tknzr = WhitespaceTokenizer()
	tokens = tknzr.tokenize(text)
	# tokens = nltk.word_tokenize(text)
	return tokens
Example #46
0
    # If text is empty, return None.
    if not text: return None
    sentence_tokenizer = _SENTENCE_TOKENIZER_DICT.get(sentence_tokenizer_id)
    return sentence_tokenizer(text)


_word_tokenizer_default = word_tokenize

_word_tokenizer_treebank = TreebankWordTokenizer().tokenize

_word_tokenizer_regex = RegexpTokenizer(pattern=get_word_token_pattern(),
                                        gaps=False).tokenize

_word_tokenizer_punkt = WordPunctTokenizer().tokenize

_word_tokenizer_whitespace = WhitespaceTokenizer().tokenize

_WORD_TOKENIZER_DICT = {
    'default': _word_tokenizer_default,
    'treebank': _word_tokenizer_treebank,
    'regex': _word_tokenizer_regex,
    'punkt': _word_tokenizer_punkt,
    'whitespace': _word_tokenizer_whitespace
}


def word_tokenize(text, word_tokenizer_id='default'):
    """
    Word-tokenizes a given sentence, based on a defined tokenizer.
    Args:
        text: A string, corresponding to a sentence.
Example #47
0
class LingPipeParser(object):
    def __init__(self, config):
        self.clear()
        self.config = config

    def clear(self):
        self.tok_num = 0
        self.byte_idx = 0
        self.line_idx = 0
        self.word_tokenizer = WhitespaceTokenizer()

    def set(self, ner_dom):
        self.clear()
        ## nltk wants a unicode string, so decode it, and then we will
        ## re-encode it to carefully recover the byte offsets.  We
        ## must take care not to use any nltk components that insert
        ## new whitespace, such as
        ## nltk.tokenize.treebank.TreebankWordTokenizer
        self.ner_dom = ner_dom
        self.attributes = []
        self.relations = []

    def sentences(self):
        '''
        Iterate over <s> XML-like tags and tokenize with nltk
        '''
        for sentence_id, node in enumerate(self.ner_dom.childNodes):
            ## increment the char index with any text before the <s>
            ## tag.  Crucial assumption here is that the LingPipe XML
            ## tags are inserted into the original byte array without
            ## modifying the portions that are not inside the
            ## LingPipe-added tags themselves.
            if node.nodeType == node.TEXT_NODE:
                ## we expect to only see TEXT_NODE instances with whitespace
                assert only_whitespace.match(node.data), repr(node.data)

                ## must convert back to utf-8 to have expected byte offsets
                self.byte_idx += len(node.data.encode('utf-8'))

                ## count full lines, i.e. only those that end with a \n
                # 'True' here means keep the trailing newlines
                for line in node.data.splitlines(True):
                    if line.endswith('\n'):
                        self.line_idx += 1
            else:
                logger.debug('getting tokens for sentence_id=%d' % sentence_id)
                more_sentence_remains = True
                while more_sentence_remains:
                    ## always a sentence
                    sent = Sentence()

                    ## this "node" came from for loop above, and it's
                    ## childNodes list might have been popped by a
                    ## previous pass through this while loop
                    tokens = iter( self.tokens( node ) )

                    while 1:
                        try:
                            tok = tokens.next()
                            sent.tokens.append(tok)
                            #logger.debug('got token: %r  %d %d' % (tok.token, tok.mention_id, tok.sentence_pos))

                        except StopIteration:
                            yield sent
                            more_sentence_remains = False
                            break

    def _make_token(self, start, end):
        '''
        Instantiates a Token from self._input_string[start:end]
        '''
        ## all thrift strings must be encoded first
        tok_string = self._input_string[start:end].encode('utf-8')
        if only_whitespace.match(tok_string):
            ## drop any tokens with only whitespace
            return None
        tok = Token()
        tok.token = tok_string
        tok.token_num = self.tok_num
        if 'BYTES' in self.config['offset_types']:
            tok.offsets[OffsetType.BYTES] = Offset(
                type =  OffsetType.BYTES,
                first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
                length=len(tok_string),
                value=self.config['offset_debugging'] and tok_string or None,
                )
        if 'LINES' in self.config['offset_types']:
            tok.offsets[OffsetType.LINES] = Offset(
                type =  OffsetType.LINES,
                first=self.line_idx,
                length=1,
                value=self.config['offset_debugging'] and tok_string or None,
                )
        self.tok_num += 1
        ## keep track of position within a sentence
        tok.sentence_pos = self.sent_pos
        self.sent_pos += 1
        return tok

    def tokens(self, sentence_dom):
        '''
        Tokenize all the words and preserve NER labels from ENAMEX tags
        '''
        ## keep track of sentence position, which is reset for each
        ## sentence, and used above in _make_token
        self.sent_pos = 0
    
        ## keep track of mention_id, so we can distinguish adjacent
        ## multi-token mentions within the same coref chain
        mention_id = 0

        while len(sentence_dom.childNodes) > 0:
            ## shrink the sentence_dom's child nodes.  In v0_2_0 this
            ## was required to cope with HitMaxi16.  Now it is just to
            ## save memory.
            node = sentence_dom.childNodes.pop(0)

            if node.nodeType == node.TEXT_NODE:
                ## process portion before an ENAMEX tag
                for line in node.data.splitlines(True):
                    self._input_string = line
                    for start, end in self.word_tokenizer.span_tokenize(line):
                        tok = self._make_token(start, end)
                        if tok:
                            yield tok

                    if line.endswith('\n'):
                        ## maintain the index to the current line
                        self.line_idx += 1

                    ## increment index past the 'before' portion
                    self.byte_idx += len(line.encode('utf-8'))

            else:
                ## process text inside an ENAMEX tag
                assert node.nodeName == 'ENAMEX', node.nodeName
                chain_id = node.attributes.get('ID').value
                entity_type = node.attributes.get('TYPE').value
                for node in node.childNodes:
                    assert node.nodeType == node.TEXT_NODE, node.nodeType
                    for line in node.data.splitlines(True):
                        self._input_string = line
                        for start, end in self.word_tokenizer.span_tokenize(line):
                            tok = self._make_token(start, end)
                            if tok:
                                if entity_type in _PRONOUNS:
                                    tok.mention_type = MentionType.PRO
                                    tok.entity_type = _ENTITY_TYPES[entity_type]
                                    
                                    ## create an attribute
                                    attr = Attribute(
                                        attribute_type=AttributeType.PER_GENDER,
                                        value=str(_PRONOUNS[entity_type])
                                        )
                                    self.attributes.append(attr)

                                else:
                                    ## regular entity_type
                                    tok.mention_type = MentionType.NAME
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                tok.equiv_id = int(chain_id)
                                tok.mention_id = mention_id
                                yield tok

                        if line.endswith('\n'):
                            ## maintain the index to the current line
                            self.line_idx += 1

                        ## increment index past this portion of the text inside the ENAMEX tag
                        self.byte_idx += len(line.encode('utf-8'))

                ## increment mention_id within this sentence
                mention_id += 1
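The offset bookkeeping in _make_token above relies on WhitespaceTokenizer.span_tokenize, which yields character (start, end) spans into the decoded string; byte offsets are then recovered by re-encoding the prefix as UTF-8. A minimal sketch of that technique outside the thrift/Token machinery:

from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
line = "Zürich is lovely"

for start, end in tokenizer.span_tokenize(line):
    token = line[start:end]
    # the byte offset is the UTF-8 length of everything before the token;
    # it drifts away from the character offset once non-ASCII characters appear
    byte_first = len(line[:start].encode('utf-8'))
    byte_length = len(token.encode('utf-8'))
    print(token, start, byte_first, byte_length)
# Zürich 0 0 7
# is 7 8 2
# lovely 10 11 6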
Example #48
0
 def __init__(self, *args, **kwargs):
     super(nltk_tokenizer, self).__init__(*args, **kwargs)
     self.sentence_tokenizer = PunktSentenceTokenizer()
     self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer()
Example #49
0
def word_parser( input_str ):
    tokenizer = WhitespaceTokenizer()
    return tokenizer.tokenize( input_str )
Example #50
0
"""

import re
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

text = "This is a block of text. I'm writing a piece to explain the usage of nltk packages."

text = text.lower()  # change everything to lower case
nopunct_text = re.sub('[^a-z0-9]', ' ',
                      text)  # remove non-alphanumeric characters

#tokenize
tokens = WhitespaceTokenizer().tokenize(nopunct_text)

#remove stopwords
stop_words = set(stopwords.words('english'))

filtered_tokens = []
for token in tokens:
    if token not in stop_words:
        filtered_tokens.append(token)

#lemmatize and stem
ps = PorterStemmer()
lem = WordNetLemmatizer()

stemmed_tokens = []
for token in filtered_tokens: