Example #1
def regnltk_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    text_clean = clean_text(text)
    words = regexp_tokenize(text_clean, pattern=r'\s+', gaps=True)
    return [
        lemmatizer.lemmatize(word) for word in words if len(word) >= 3
    ]
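For reference, with gaps=True the pattern describes the separators rather than the tokens, so the call above splits on runs of whitespace. A minimal standalone illustration (only NLTK assumed):

from nltk.tokenize import regexp_tokenize

# gaps=True: the regex matches the gaps between tokens, i.e. split on whitespace
print(regexp_tokenize("the  quick\tbrown fox", pattern=r"\s+", gaps=True))
# ['the', 'quick', 'brown', 'fox']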
Example #2
    def format_sentence(self, sent):
        """Tokenize a sentence and return a featureset dict that works with nltk's NaiveBayesClassifier."""
        return {
            word: True
            for word in regexp_tokenize(sent, pattern=r'\w+')
            if word not in stopwords.words('english')
        }
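A minimal sketch of how this featureset format is typically fed to NLTK's NaiveBayesClassifier; the training sentences and labels below are made up, and format_sentence is restated as a standalone function:

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize

STOP_WORDS = set(stopwords.words('english'))

def format_sentence(sent):
    # word -> True feature dict, as in the example above
    return {word: True
            for word in regexp_tokenize(sent, pattern=r'\w+')
            if word not in STOP_WORDS}

# Hypothetical labelled data
train_set = [(format_sentence("great movie loved it"), 'pos'),
             (format_sentence("terrible movie hated it"), 'neg')]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(format_sentence("loved this great film")))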
Example #3
def sample_preprocess(
        sample: str,
        header: str) -> Tuple[SampleLabelled, SampleLabelledTokenised]:
    """Pre-process data and format them in structure that is easy to pass to 
    other methods

    Parameters
    ----------
    sample : str
        The sample you want to process
    header : str
        The label you want to apply to the sample
    """

    # """Clean sample"""
    temp = sample.lower()

    # Remove all occurences of square brackets and everything in between
    temp = re.sub("\[.*?\]", "", temp)

    # Focus on words, disregard numbers, etc.
    temp_tokens = regexp_tokenize(temp, r"[a-zA-z]+")

    # Remove stop words
    temp_tokens = [word for word in temp_tokens if not word in stopwords]

    # Create labelled sample
    sample_labelled = (' '.join(temp_tokens), header)

    # Create labelled tokenised sample
    sample_labelled_tokenised = (temp_tokens, header)

    return sample_labelled, sample_labelled_tokenised
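A hypothetical call, assuming stopwords here is NLTK's English stop-word set and the SampleLabelled / SampleLabelledTokenised aliases are defined elsewhere in the module:

sample_labelled, sample_labelled_tokenised = sample_preprocess(
    "Patient reports [redacted] no known drug allergies.", "allergies")
# sample_labelled           -> ('patient reports known drug allergies', 'allergies')
# sample_labelled_tokenised -> (['patient', 'reports', 'known', 'drug', 'allergies'], 'allergies')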
Example #4
def analyse(text):
    global NUM_SECS
    global NUM_SENTENCES
    global NUM_WORDS
    global NUM_STOP_WORD
    NUM_SECS += len(text.split("\n\n"))

    text = re.sub('@|#|:|\n|-|’', ' ', text)
    tokens = regexp_tokenize(text.lower().replace("'", " "),
                             pattern=r'\w+|\$[\d\.]+|\S+')

    terms = set()
    lexemes = set()
    for token in tokens:
        if token in punctuation or token == " ":
            if token in ['!', '.', '?']:
                NUM_SENTENCES += 1
            continue  # ignore punctuation
        elif token in stop:
            NUM_WORDS += 1
            NUM_STOP_WORD += 1
        else:
            NUM_WORDS += 1
            terms.add(token)
            lexemes.add(lemmatizer.lemmatize(token))
Example #5
def count_words(text):
    global num_words
    # Strip punctuation, quotes, zero-width spaces and other separators
    text = re.sub(r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token in regexp_tokenize(text.lower(), pattern=r'\w+|\$[\d\.]+|\S+'):
        if token in punctuation or token == " ":
            continue  # ignore punctuation
        # stop words and content words are counted alike
        num_words += 1
Example #6
    def __getitem__(self, idx):
        x = regexp_tokenize(self.x[idx], pattern=r"\s+", gaps=True)
        y = self.y_dict[self.y[idx]]
        x = [self.questions.index(word) for word in x]
        if len(x) > self.max_len:
            x = x[:self.max_len]
        else:
            x = x + [self.questions.index("padding")] * (self.max_len - len(x))

        return torch.tensor(x), torch.tensor(y)
Example #7
def tokenize_text(text, clean=False):
    """
    text: text to tokenize
    output: list of words
    """

    if clean:
        text = clean_text(text)

    words = regexp_tokenize(text, pattern=r"\s+", gaps=True)

    return words
Example #8
def Tokenize(s):
    global hash_stem
    # Match runs of lowercase letters or digits (the original pattern
    # "[a-z]+ | [0-9]+" included literal spaces, which broke stop-word filtering)
    words = regexp_tokenize(s, pattern=r"[a-z]+|[0-9]+")
    ret = []
    for word in words:
        if 2 < len(word) < 41 and word not in cachedStopWords:
            # Cache stems so each distinct word is stemmed only once
            if word not in hash_stem:
                hash_stem[word] = stemmer.stem(word)
            ret.append(hash_stem[word])
            # if(len(hash_stem) == 100000000):
            #     print(len(hash_stem))
            #     hash_stem = {}
    return ret
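Tokenize depends on several module-level globals; a minimal setup sketch, under the assumption that a Porter stemmer and NLTK's English stop words are what the original module used:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize

stemmer = PorterStemmer()                        # assumed stemmer
cachedStopWords = set(stopwords.words('english'))
hash_stem = {}                                   # shared stem cache

print(Tokenize("the quick brown foxes jumped over the lazy dogs"))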
Example #9
def process_data(fileName):
    category = fileName.replace('.txt', '').replace('data/', '')
    with open(fileName, 'r') as f:
        data = f.read().split("\n|||\n")
    data_original = []
    data_tokenized = []
    for text in data:
        text = text.lower()
        text = re.sub(r"\[.*?\]", "", text)
        text_tokens = regexp_tokenize(text, r"[a-zA-Z]+")
        text_tokens_ns = [
            word for word in text_tokens if word not in stopwords
        ]
        data_original.append((' '.join(text_tokens_ns), category))
        data_tokenized.append((text_tokens_ns, category))
    return data, data_original, data_tokenized
Example #10
def normalize_section_nltk_fast(text):
    offset = 1 # position value
    doc_lexemes = {}
    doc_offsets = defaultdict(list)
    
    text = re.sub(",|/|u'\u200b'|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token in regexp_tokenize(text.lower(), pattern='\w+|\$[\d\.]+|\S+'):
        if token in punctuation or token == " ":
            continue # ignore punctuation
        elif token in stop:
            offset += 1 # increase offset but don't save
        else:
            doc_lexemes[token] = lemmatizer.lemmatize(token)
            doc_offsets[token].append(offset)
            offset += 1
Example #11
    def custom_tokenizer(self, text, tokenizer='nltk_regexp'):
        text = self.clean_text(text)

        if tokenizer == 'nltk':
            tokens = word_tokenize(text)
        elif tokenizer == 'gensim':
            tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        else:
            tokens = regexp_tokenize(text, pattern=r'\s+', gaps=True)

        # Truncate any token longer than 512 characters in place
        for index, token in enumerate(tokens):
            if len(token) > 512:
                print('TOKEN > 512', token)
                tokens[index] = token[:512]

        return tokens
Example #12
    def process(self, item):
        """Process item - Reduce inflectional forms to a common base form.

        Args:
            item (dict): item
        Returns:
            dict: Returns the updated item
        """
        try:
            self._log.debug("Lemmatizer Step")
            text = regexp_tokenize(item["data"], pattern="\s+", gaps=True)
            text = " ".join([self._lemmatizer.lemmatize(w) for w in text])
            item["data"] = text
        except Exception as e:
            self._log.error("Error with lemmatizer on item id:{} - {}".format(
                item["id"], e))
        return item
Example #13
    def process(self, item):
        """Process Item - Tokenize text into tokens.

        Args:
            item (dict): item
        Returns:
            dict: Returns the updated item
        """
        try:
            self._log.debug("NLTK Regex Tokenize Step")
            text = regexp_tokenize(item["data"], pattern="\s+", gaps=True)
            item["data"] = text
        except Exception as e:
            self._log.error(
                "Error in NLTK Regex Tokenize from item id:{} - {}".format(
                    item["id"], e))
        return item
Example #14
def normalize_doc(doc):
    paragraphs = doc.split("\n\n")
    secs = []
    for para in paragraphs:
        words = set()
        para = re.sub(
            r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ",
            para)
        for token in regexp_tokenize(para.lower(),
                                     pattern=r'\w+|\$[\d\.]+|\S+'):
            if token in punctuation or token == " ":
                continue  # ignore punctuation
            elif token in stop:
                pass
            else:
                words.add(token)
        secs.append(words)  # one word set per paragraph
    return secs
Example #15
    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.
        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()

        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [
                word.lower()
                for word in regexp_tokenize(sentence, pattern=r'\w+|\$[\d\.]+')
            ]
            print(word_list)
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list
Example #16
    def process(self, item):
        """Process item - Expand Contractions step.

        Args:
            item (dict): item
        Returns:
            dict: Returns the updated item
        """
        try:
            self._log.debug("Expand Contractions Step")
            text = regexp_tokenize(item["data"], pattern="\s+",
                                   gaps=True)  # noqa

            for index, word in enumerate(text):
                if CONTRACTIONS.get(word):
                    text[index] = CONTRACTIONS[word]

            item["data"] = " ".join(text)
        except Exception as e:
            self._log.error("Error debugging (item id:{}) - {}".format(
                item["id"], e))
        return item
Example #17
def normalize_section_nltk_pos_tag(text):
    offset = 1 # position value
    doc_lexemes = {}
    doc_offsets = defaultdict(list)
    
    text = re.sub(",|/|u'\u200b'|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token, tag in pos_tag(regexp_tokenize(text.lower(), pattern='\w+|\$[\d\.]+|\S+')):
        if token in punctuation or token == " ":
            continue # ignore punctuation
        elif token in stop:
            offset += 1 # increase offset but don't save
        else:
            lemma = ""
            tag = tag[0].lower()
            tag = tag if tag in ['a', 'r', 'n', 'v'] else None
            if not tag:
                lemma = token
            else:
                lemma = lemmatizer.lemmatize(token, tag)
            doc_lexemes[token] = lemma
            doc_offsets[token].append(offset)
            offset += 1
    num_words = offset - 1
    return (doc_lexemes, doc_offsets, num_words)
Example #18
def nltk_regexp_tokenize(raw_corpus):
    # regular expression pattern keeps punctuation as separate tokens
    re_pattern = r'\w+|\$[\d\.]+|\S+'
    return [regexp_tokenize(doc, re_pattern) for doc in raw_corpus]
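For illustration, this pattern keeps words, dollar amounts and punctuation as separate tokens:

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("I paid $12.40, not $15!", r'\w+|\$[\d\.]+|\S+'))
# ['I', 'paid', '$12.40', ',', 'not', '$15', '!']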
Example #19
import re

from nltk.corpus import stopwords
from nltk.tokenize.regexp import regexp_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

# List of stopwords
all_stopwords = set(stopwords.words('english'))

pos_data = open("allergies.txt", "r").read().split("\n|||\n")
pos_document = []
pos_p = []
for text in pos_data:
    text = text.lower()
    text = re.sub("\[.*?\]", "", text)
    text_tokens = regexp_tokenize(text, r"[a-zA-z]+")
    text_tokens_ns = [word for word in text_tokens if not word in all_stopwords]
    pos_p.append((' '.join(text_tokens_ns), 'allergies'))
    pos_document.append((text_tokens_ns, 'allergies'))

neg_data = open("social_history.txt", "r").read().split("\n|||\n")
neg_document = []
neg_p = []
for text in neg_data:
    text = text.lower()
    text = re.sub("\[.*?\]", "", text)
    text_tokens = regexp_tokenize(text, r"[a-zA-z]+")
    text_tokens_ns = [word for word in text_tokens if not word in all_stopwords]
    neg_p.append((' '.join(text_tokens_ns), 'social_history'))
    neg_document.append((text_tokens_ns, 'social_history'))
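A possible continuation of this snippet, assuming a TfidfVectorizer (not shown in the original) is used to turn the cleaned strings into features before fitting the LogisticRegression:

from sklearn.feature_extraction.text import TfidfVectorizer

data = pos_p + neg_p
texts = [text for text, _ in data]
labels = [label for _, label in data]

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)
print(model.score(X_test_vec, y_test))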
Example #20
def renltk_tokenize(text):
    return regexp_tokenize(text, pattern=r'\s', gaps=True)
Example #21
#!/usr/bin/python
# coding=utf-8
# -*- encoding: utf-8 -*-

# Program (Python 2) that separates the letters of a word with spaces

#sys.setdefaultencoding('utf-8');

import sys
from nltk.tokenize.regexp import regexp_tokenize

from sys import stdin
from sys import stderr

from codecs import getreader
from codecs import getwriter

stdin = getreader('utf-8')(stdin)
sys.stdout = getwriter('utf-8')(sys.stdout)
stderr = getwriter('utf-8')(stderr)

s = stdin.read()
tokens = regexp_tokenize(s, r'\w+|\$[\d\.]+')

for token in tokens:
    print " ".join(regexp_tokenize(token, r'\w')) + '.'
Example #22
import sys
from nltk.probability import FreqDist
from nltk.tokenize.regexp import regexp_tokenize
import re

lines = sys.stdin.readlines()

N = int(lines[0])
data = lines[1:]

for d in data:
    if d.strip() == "":
        continue
    sent = FreqDist(word.lower()
                    for word in regexp_tokenize(d, pattern=r'[a-zA-Z0-9]+'))

    print(sent["a"])
    print(sent["an"])
    print(sent["the"])

    seperator = r'[ \\/,.-]+'
    datePattern = r'\d{1,2}(?:st|nd|rd|th)*'

    monthPattern = r'(?:\d\d|jan|feb|mar|apr|may|jun|july|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)+'

    yearPattern = r'\d{2,4}'
    pattern = datePattern + seperator + monthPattern + seperator + yearPattern
    #     print(re.findall(pattern, d, re.IGNORECASE));
    dates = len(re.findall(pattern, d, re.IGNORECASE))

    pattern = monthPattern + seperator + datePattern + seperator + yearPattern