Example #1
def strict_preprocessor(corpus):
    # drop tokens that consist entirely of non-word characters (punctuation, symbols)
    tokenized = [i for i in word_tokenize(corpus) if not match(r'(^\W*$)', i)]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(i) for i in tokenized]
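The snippet assumes word_tokenize, match and WordNetLemmatizer are already in scope; a short illustrative way to wire it up and call it (the sample sentence is made up, not from the original source):

from re import match
from nltk import word_tokenize, WordNetLemmatizer

# With those names imported, the function above can be called directly:
tokens = strict_preprocessor("The cats, sitting quietly, watched the dogs!")
print(tokens)  # punctuation-only tokens are dropped; plural nouns come back lemmatized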
Example #2
from qa_engine.score_answers import main as score_answers
import nltk, operator
from nltk.corpus import wordnet as wn
from word2vec_extractor import Word2vecExtractor
from nltk import WordNetLemmatizer
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
"""
GLOBALS
"""

STOPWORDS = set(nltk.corpus.stopwords.words("english"))
glove_w2v_file = "data/glove-w2v.txt"
DATA_DIR = "./wordnet"
W2vecextractor = Word2vecExtractor(glove_w2v_file)
lmtzr = WordNetLemmatizer()
"""
UTILITY FUNCTIONS
"""


# The standard NLTK pipeline for POS tagging a document
def get_sentences(text):

    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

    return sentences

Example #3
"""
@author: elliott
"""

from numpy import prod
from collections import Counter, Set
from nltk import sent_tokenize, ngrams, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import perceptron
import utils
import phrase_similarity
import numpy as np
import pickle

tagger = perceptron.PerceptronTagger()
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

thermometers = [
    'democrats', 'republicans', 'protestants', 'catholics', 'jews', 'blacks',
    'whites', 'southerners', 'big business', 'labor unions', 'liberals',
    'conservatives', 'military', 'policemen', 'black militants',
    'civil rights leaders', 'chicanos hispanics', 'democratic party',
    'middle class people', 'people on welfare', 'political independents',
    'political parties', 'poor people', 'republican party',
    'womens right activist', 'young people', 'asian-americans', 'congress',
    'environmentalists', 'anti abortionists', 'federal government',
    'illegal aliens', 'christian fundamentalists', 'radical students',
    'farmers', 'feminists', 'evangelical groups', 'elderly', 'supreme court',
    'women'
]
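As a side note (not part of the original script), the three normalizers instantiated above treat the same token quite differently; a quick illustrative comparison:

# Illustrative only; reuses the porter, snowball and lemmatizer objects created above.
for word in ['parties', 'activists', 'policemen']:
    print(word, '->', porter.stem(word), snowball.stem(word), lemmatizer.lemmatize(word))
# The stemmers cut suffixes heuristically, while the lemmatizer returns a
# dictionary form (and leaves a word unchanged if WordNet does not know it).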
Example #4
 def lemmatize_stemming(self, token):
     return SnowballStemmer("english").stem(WordNetLemmatizer().lemmatize(token, pos='v'))
Example #5
 def __init__(self):
   self.lemmatizer = WordNetLemmatizer()
Example #6
def clean_text(text):
    import re
    import nltk
    from nltk import WordNetLemmatizer

    nltk.download('stopwords')
    nltk.download('wordnet')

    # split into words by white space
    words = text.split()
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in words]
    text = " ".join(text)
    #print(stripped[:100])

    ## Remove punctuation
    #text = text.translate(string.punctuation)

    ########################################################################################
    # replace urls
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    # replace ips
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    # replace URLs
    text = re_url.sub("URL", text)

    # replace IPs
    text = re_ip.sub("IPADDRESS", text)
    ####################################################################

    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    #stops = set(stopwords.words("english"))
    #text = [w for w in text if not w in stops and len(w) >= 3]

    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    #stemmer = SnowballStemmer('english')
    #stemmed_words = [stemmer.stem(word) for word in text]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)

    return text
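A short illustrative call (not from the original source), assuming clean_text as defined above is in scope:

# Illustrative usage: per-word punctuation is stripped, the text is lower-cased,
# and each remaining token is passed through WordNetLemmatizer.
sample = "The striped bats were hanging on their feet."
print(clean_text(sample))  # plural nouns such as 'bats' and 'feet' come back in singular form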
Example #7
def lemmatize_stemming(text):
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))
Example #8
 def __init__(self):
     self.wnl = WordNetLemmatizer()
Example #9
 def __init__(self, model):
     self.model = model
     self.lemmatizer = WordNetLemmatizer()
     self.intents = json.loads(open('intents.json').read())
     self.words = pickle.load(open('words.pkl', 'rb'))
     self.classes = pickle.load(open('classes.pkl', 'rb'))
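This constructor follows the common NLTK/Keras chatbot tutorial layout; a hedged sketch of a companion method that turns a sentence into the bag-of-words vector such a model typically expects (the method name and logic are an assumption, not taken from the original class):

 # Hypothetical companion method; presumes module-level `import nltk` and
 # `import numpy as np`, and that self.model was trained on bag-of-words
 # vectors built over the vocabulary in self.words.
 def predict_intent(self, sentence):
     tokens = [self.lemmatizer.lemmatize(t.lower())
               for t in nltk.word_tokenize(sentence)]
     bag = np.array([[1 if w in tokens else 0 for w in self.words]])
     scores = self.model.predict(bag)[0]
     return self.classes[int(np.argmax(scores))]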
Example #10
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Takes a path to an embeddings file and a dataframe as input; by default
    (embed_signal='n') embeddings are not encoded.
    Returns an expanded dataframe with:
    a column of lemmatised words; a column of stemmed words; a column indicating
    capitalisation status; a column indicating capitalisation status of the previous
    token; and columns indicating shape, previous shape, short shape, previous
    short shape, and following token short shape.
    If kwarg embed_signal is 'y', a list of embeddings is also returned.
    '''

    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()
    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['caps']] = 'all_caps'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains(r'\W'), ['short_shape']] = '-'

    prev_short_shape_list = []
    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = []
    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:

        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub(r'\d', 'd', replace_lowers)

        shape_list.append(replace_digits)

    dataframe['shape'] = shape_list

    prev_shape_list = []
    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list

    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
Example #11
def lemmatize(word, pos):
    global lemmer
    if lemmer is None:
        lemmer = WordNetLemmatizer()

    return lemmer.lemmatize(word, get_wordnet_pos(pos))
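Both `lemmer` and `get_wordnet_pos` are defined elsewhere in that project; a typical shape for the missing pieces (an assumption, not the original author's code) is:

from nltk import WordNetLemmatizer
from nltk.corpus import wordnet

lemmer = None  # module-level cache used by lemmatize() above

# Hypothetical helper: map a Penn Treebank tag to the WordNet POS constant
# that lemmatize() accepts, defaulting to noun.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN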
Example #12
def test_main():

    test_data = pd.read_csv(
        r'C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\raw\test.csv'
    )
    test_data.head()

    test_data.isnull().sum()

    test_data.info()

    test_data.dropna(inplace=True)
    print("null count in data \n", test_data.isnull().sum())

    test_data.drop(['id', 'author'], axis=1, inplace=True)

    print(test_data.head())

    # # cleaning test.csv data

    def data_cleaning(raw_data):
        #print(raw_data)
        raw_data = str(raw_data)
        raw_data = re.sub(r'\W', " ", raw_data)  # Remove non-word characters
        raw_data = re.sub(r'[0-9]', " ", raw_data)  # Remove digits
        raw_data = re.sub(r'\s+', " ", raw_data)  # Remove extra whitespace
        raw_data = raw_data.lower()

        return raw_data

    test_data['text'] = test_data['text'].apply(lambda x: data_cleaning(x))

    test_data['title'] = test_data['title'].apply(lambda x: data_cleaning(x))

    # Text and title were combined in train.csv when training the model, so we do the same here to avoid data loss and keep the data format consistent

    test_data['text'] = test_data['title'] + test_data['text']

    lm = WordNetLemmatizer()

    data_collect1 = []
    stoplist = set(nltk.corpus.stopwords.words("english"))

    def data_preprocessing_test(raw_data):

        #raw_data=str(raw_data)
        words = word_tokenize(raw_data)
        words = [
            word for word in words
            if word not in stoplist and word not in string.punctuation
        ]
        words = [word for word in words if len(word) > 1]
        words = [lm.lemmatize(word) for word in words]
        words = " ".join(words)
        data_collect1.append(words)
        return words

    test_data['text'] = test_data['text'].apply(
        lambda x: data_preprocessing_test(x))
    test_data['text'].head()

    voc_words1 = 8000
    c = 0
    for word in data_collect1:
        c += len(set(word))
    print(c)

    # # Convert test.csv into the model's input format so that we can predict on it

    one_hot_repr1 = [one_hot(word, voc_words1) for word in data_collect1]
    one_hot_repr1[:5]

    sent_len = 50
    padded_data1 = pad_sequences(one_hot_repr1, padding='pre', maxlen=sent_len)
    padded_data1[0]

    x_test_data = np.array(padded_data1)
    print("priting x_test_data \n", x_test_data)

    # # Loading our trained model from the pickle file

    filename = r'C:\Users\Lovely\PycharmProjects\fake_news_classifier\src\models\model_pickle.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    model = loaded_model

    # # Model Prediction on test data

    y_pred1 = model.predict_classes(x_test_data)

    print("predicted data shape \n", y_pred1.shape)

    print("predicted data :\n ", y_pred1)

    from pandas.core.common import flatten
    y_pred1 = list(flatten(y_pred1))  #making it 2D to 1D

    result = pd.Series(y_pred1, name='label')
    result.unique()

    output = pd.concat([pd.Series(range(1, 4576), name="Id"), result], axis=1)
    print("saving output to submit file\n")
    output.to_csv(
        r"C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\processed\submit.csv",
        index=False)

    submit_data = pd.read_csv(
        r"C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\processed\submit.csv"
    )
    submit_data.tail(10)

    print(submit_data.info())

    print("Top 5 data of Submit file :\n")
    print(submit_data.head())
Example #13
 def __init__(self):
     self.w = WordNetLemmatizer()
     self.cache = MemoryCache()
Example #14
from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer

snl = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + snl.stem(words))

# lemmatizing

from nltk import WordNetLemmatizer
wordnet = WordNetLemmatizer()

for words in words_stem:
    print(words + " :" + wordnet.lemmatize(words))

# pos parts of speech

# stopwords
from nltk.corpus import stopwords

stopwords.words("english")
print(len(stopwords.words("english")))

import re
punctuation = re.compile(r'[-.?,:;()|{}|0-9]')
post_punctuation = []
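The listing is cut off here; the usual next step in this style of tutorial (an assumption, since the original continuation is not shown) is to strip the compiled punctuation pattern from each token and keep the non-empty results:

# Hypothetical continuation; `word_tokens` stands in for whatever token list
# the original tutorial built earlier.
word_tokens = ["hello", "world!", "-", "42", "nltk;"]
for token in word_tokens:
    cleaned = punctuation.sub("", token)
    if cleaned:
        post_punctuation.append(cleaned)
print(post_punctuation)  # characters matched by the pattern are removed; empty results are skipped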
Example #15
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(unicode(sentence, errors='ignore'))
    ]
Example #16
def generate_part2_dict(ibex_data, unique_id):
    """Given an ibex results file, returns a dictionary of the following format --
    mystery word: [target, highest rated guess, lowest rated guess]"""
    Lemmy = WordNetLemmatizer()
    with open(ibex_data, 'rb+') as ibex_data:
        ibex_data = csv.reader(
            filter(lambda data_row: data_row[0] != '#', ibex_data))
        ibex_data = list(ibex_data)

        subject_id = unique_id
        subject_age = ibex_data[1][8]
        subject_sex = ibex_data[2][8]

        ibex_data = filter(lambda row: row[5] != 'end', ibex_data)
        ibex_data = filter(lambda row: row[5] != 'intro3', ibex_data)
        ibex_data = [[x.lower() for x in y] for y in ibex_data]
        subj_dict = {}
        guess_and_confidence = []

        previous_line = ['', '', '', '', '', '', '', '', '']
        trial_identifier = 5
        mystery_word, target_word, guess, confidence = 0, 1, 2, 2

        for current_line in ibex_data:
            # print "Current line:" + str(current_line)

            if current_line[trial_identifier] == previous_line[
                    trial_identifier]:
                # print "match"
                current_line_info = current_line[trial_identifier].split("_")
                previous_line_info = previous_line[trial_identifier].split("_")

                current_line_info = [x.lower() for x in current_line_info]
                previous_line_info = [x.lower() for x in previous_line_info]

                if (current_line_info[target_word],
                        current_line_info[mystery_word]) not in subj_dict:
                    subj_dict[(current_line_info[target_word],
                               current_line_info[mystery_word])] = [
                                   (previous_line[8], current_line[8])
                               ]
                else:
                    if (current_line_info[target_word],
                            current_line_info[mystery_word]) in subj_dict:
                        subj_dict[(current_line_info[target_word],
                                   current_line_info[mystery_word])] += [
                                       (previous_line[8], current_line[8])
                                   ]

            previous_line = current_line

        if len(subj_dict) != 12:
            raise ValueError(
                "ERROR: subj_dict does not contain 12 entries. Check the input results file")

        part_2_dict = defaultdict(list)

        # initialize a new dictionary for tracking some stats about the subject responses
        response_stats = defaultdict(list)

        for target_w_mystery_w, g_c_list in subj_dict.iteritems():

            g_c_reversed = reversed(g_c_list)
            g_c_reversed = list(g_c_reversed)
            guesses = []

            correct_answer_alternate_form = False
            for gc in g_c_reversed:
                lemmatized_guess = Lemmy.lemmatize(
                    gc[0].strip().decode('unicode_escape').encode(
                        'ascii', 'ignore'),
                    pos='n')
                lemmatized_guess = lemmatized_guess.encode('utf-8')

                for k, v in correct_answers.iteritems():
                    if lemmatized_guess in v:
                        correct_answer_alternate_form = lemmatized_guess
                        lemmatized_guess = k
                guesses.append((lemmatized_guess, gc[1]))

            guesses = [(x[0], int(x[1])) for x in guesses]
            # find if the target word was guessed during learning
            # and, find the highest confidence for that guess
            # and, find the number of times it was guessed
            target_guessed = 0
            target_highest_confidence = 'NA'
            target_n_times_guessed = 'NA'
            if correct_answer_alternate_form:
                target_guessed = 1
                target_highest_confidence = max(x[1] for x in guesses
                                                if x[0] == lemmatized_guess)
                target_n_times_guessed = sum(x[0] == lemmatized_guess
                                             for x in guesses)
            elif target_w_mystery_w[0] in [x[0] for x in g_c_reversed]:
                target_guessed = 1
                target_highest_confidence = max(
                    x[1] for x in guesses if x[0] == target_w_mystery_w[0])
                target_n_times_guessed = sum(x[0] == target_w_mystery_w[0]
                                             for x in guesses)

            response_stats[target_w_mystery_w[0]] = [
                target_guessed, target_highest_confidence,
                target_n_times_guessed
            ]

            guesses = [gc for gc in guesses if gc[0] != target_w_mystery_w[0]]

            if not guesses:
                guesses = [
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5))
                ]

            highest_confidence = max(x[1] for x in guesses)
            lowest_confidence = min(x[1] for x in guesses)

            highest_guesses = map(
                lambda x: x if x[1] >= highest_confidence else None, guesses)
            lowest_guesses = map(
                lambda x: x if x[1] <= lowest_confidence else None, guesses)

            highest_guesses = (x for x in highest_guesses if x is not None)
            lowest_guesses = (x for x in lowest_guesses if x is not None)

            highest_guess = next(highest_guesses, None)
            lowest_guess = next(lowest_guesses, None)

            highest_guess = highest_guess[0]
            lowest_guess = lowest_guess[0]

            if highest_guess == lowest_guess:
                # print "high-low match"
                lowest_guess = next(lowest_guesses, None)
                lowest_guess = lowest_guess[0] if type(
                    lowest_guess) is tuple else None

            highest_guessed = 0
            highest_guess_highest_confidence = 'NA'
            highest_guess_n_times_guessed = 'NA'
            lowest_guessed = 0
            lowest_guess_highest_confidence = 'NA'
            lowest_guess_n_times_guessed = 'NA'

            if highest_guess in [x[0] for x in guesses]:
                highest_guessed = 1
                highest_guess_highest_confidence = max(
                    x[1] for x in guesses if x[0] == highest_guess)
                highest_guess_n_times_guessed = sum(x[0] == highest_guess
                                                    for x in guesses)

            if lowest_guess in [x[0] for x in guesses]:
                lowest_guessed = 1
                lowest_guess_highest_confidence = max(x[1] for x in guesses
                                                      if x[0] == lowest_guess)
                lowest_guess_n_times_guessed = sum(x[0] == lowest_guess
                                                   for x in guesses)

            response_stats[highest_guess] = [
                highest_guessed, highest_guess_highest_confidence,
                highest_guess_n_times_guessed
            ]
            response_stats[lowest_guess] = [
                lowest_guessed, lowest_guess_highest_confidence,
                lowest_guess_n_times_guessed
            ]
            response_stats['distractor'] = [0, 'NA', 'NA']

            target_word = correct_answer_alternate_form if correct_answer_alternate_form else target_w_mystery_w[
                0]
            part_2_dict[target_w_mystery_w[1]] = [
                target_word, highest_guess, lowest_guess
            ]
        # print subject_id

        return [part_2_dict, response_stats]
Example #17
#==========================================
# Author: Shierene Cervantes
#==========================================
# python script

import pandas as pd
import numpy as np
import pylab as pl
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk import WordNetLemmatizer

stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

training_data = pd.read_csv('oneyearcategorized.csv', encoding = "latin1")
training_data['Title'] = training_data['Title'].astype(str)
training_data['Description'] = training_data['Description'].astype(str)
training_data['title_and_description'] = training_data[['Title', 'Description']].apply(tuple, axis=1)
training_data = training_data.astype(str)

training_data['title_and_description'] = training_data['title_and_description'].astype(str)

train_data = training_data.to_dict('records')
train_data

corpus_words ={}
class_words = {}
unnecessary_words = ['please','it','we','hi','is',"'s",'?',',',':','..','.','|','#','-','<','>','(',')','{','}']
Example #18
# nltk.download() # To make sure all nltk site packages are up to date and installed, to get started with nltk
import nltk
from nltk import PorterStemmer
from nltk import WordNetLemmatizer

paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much."""

## Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)
# print(sentences)

## Tokenizing words
# wordz = nltk.word_tokenize(paragraph)
# print(wordz)

# stemmer = PorterStemmer() # Creating an object of PorterStemmer class
lemmatizer = WordNetLemmatizer()  # Creating an object of WordNetLemmatizer class

## Stemming
# for i in range(len(sentences)):
#     words = nltk.word_tokenize(sentences[i]) # Word Tokenization on sentences list.
#     stemmed_words = [stemmer.stem(word) for word in words] #List Comprehension usage and stemming each word of a single sentence at a time.
#     sentences[i] = ' '.join(stemmed_words) # Joining all stemmed words back into sentences using space delimiter and join function

# print(sentences)

## Lemmatization
for j in range(len(sentences)):
    words = nltk.word_tokenize(
        sentences[j])  # Word Tokenization on sentences list.
    lemmatized_words = [
        lemmatizer.lemmatize(word) for word in words
    ]
    sentences[j] = ' '.join(lemmatized_words)  # Joining lemmatized words back into the sentence
Example #19
 def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
     self.lower = lower
     self.strip = strip
     self.stopwords = stopwords or set(sw.words('english'))
     self.punct = punct or set(string.punctuation)
     self.lemmatizer = WordNetLemmatizer()
Example #20
import os
Example #21
def universal_check(words, tag, universal_tag, not_arranged_universal_tag):
    value = ASL_Structure_DB.get_one_where(universal_tag)['ASL']
    gloss = ""
    supergloss = []
    [second_value] = [value.split()]
    for tag2 in second_value:
        for value, tag1, tag3, tag4 in zip(words, tag, universal_tag,
                                           not_arranged_universal_tag):
            # if value == "a":
            #     continue
            if value.lower() == "this":
                value = "ix-that"
            if tag2 == tag3:
                lemmatizer = WordNetLemmatizer()
                lemmatized_tokens = lemmatizer.lemmatize(value.lower(),
                                                         pos='v')
                if lemmatized_tokens.lower() == "i":
                    lemmatized_tokens = "me"
                elif lemmatized_tokens.lower() == "n't":
                    lemmatized_tokens = "not"
                elif lemmatized_tokens.lower() == "'s":
                    lemmatized_tokens = "is"
                if value.lower() == "bit":
                    lemmatized_tokens = value
                supergloss.append((lemmatized_tokens, tag4))
                gloss += lemmatized_tokens + ","
    gloss = gloss.replace(",", " ").upper()
    gloss = gloss[:-1]
    value = []
    new_new_value = []
    rules = {
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine'
    }
    change = []
    cou = 0
    for word, tag in supergloss:
        if word != ".":
            new_value = Database.get_one_where(word.capitalize(), tag)
            if new_value is None:
                if cou < len(gloss.split()):
                    change.append(cou)
                if word.isdigit():
                    new_value = "None"
                    for letter in word:
                        new_letter = rules[letter]
                        for letter in new_letter:
                            new_value = Database.get_one_where(
                                letter.capitalize(), '.')
                            value.append(new_value)
                else:
                    for letter in word:
                        # print(letter)
                        if letter != "'":
                            new_value = Database.get_one_where(
                                letter.capitalize(), '.')
                            value.append(new_value)
                new_new_value.append(value)
                value = []
            else:
                new_new_value.append(new_value)
        cou = cou + 1
    new_new_new_value = []
    for i in enumerate(new_new_value):
        a = 0
        if isinstance(new_new_value[i[0]], str):
            new_new_new_value.append(new_new_value[i[0]])
        else:
            while a < len(new_new_value[i[0]]):
                new_new_new_value.append(new_new_value[i[0]][a])
                a += 1
    value = " ".join(new_new_new_value)
    gloss = gloss.split()

    for y in change:
        # print(y)
        word1 = ""
        for i, word in enumerate(gloss):
            if i == y:
                if word[:1] == "'":
                    word = word[1:]
                for i in word:
                    word1 += i + "-"
            # print(word1[:-1])
        gloss[y] = word1[:-1]
    gloss = "  ".join(gloss)
    # gloss = str(gloss).replace(",", " ").upper()

    return {'value': value, 'gloss': gloss}
Example #22
 def lemmatizing(self, text):
     wl = WordNetLemmatizer()
     return [wl.lemmatize(word) for word in text]
Example #23
 def __init__(self):
     self.stopwords = set(stopwords.words('english'))
     self.stemmer = PorterStemmer()
     self.lemmatizer = WordNetLemmatizer()
     self.sia = SentimentIntensityAnalyzer()
Example #24
def tokenizer(data: DataFrame, rows, columns):
    tokenDict = dict() #"<entry>": (tf(overall), df, [list of docs it appears in])
    tokenDocs = dict()
    tokPostings = dict() #"<entry>": {docid: [tf in that doc, max_tf, doclen], ...}
    docInfo = dict()
    lematizer = WordNetLemmatizer()
    stopWords = set(stopwords.words("english"))

    for i in range(0, rows):
        tf = 1
        max_tf = 1
        doclen = 0
        docNo = i
        tokens1 = word_tokenize(data["Title"][i])
        tokens = list()
        #print(data["Text"][i])
        sentenceList = sent_tokenize(data["Text"][i])
        for sentence in sentenceList:
            tmp = word_tokenize(sentence)
            for t in tmp:
                tokens.append(t)

        #tokens = word_tokenize(sent_tokenize(data["Text"]))

        for t in tokens1:
            tokens.append(t)

        for tok in tokens:
            doclen += 1
            if tok in stopWords:
                continue
            word = lematizer.lemmatize(tok)
            if word in tokenDict:
                tokenDict[word] = tokenDict.get(word) + 1
                tokenDocs[word].add(docNo)
                # tokPostings[word].
            else:
                tokenDict[word] = 1
                tokenDocs[word] = {docNo}
                # tokPostings[word] = {docNo:1}
            if word in tokPostings:
                if docNo in tokPostings[word].keys():
                    tokPostings[word][docNo][0] = tokPostings[word][docNo][0] + 1
                    tf = tokPostings[word][docNo][0]
                    if tf > max_tf:
                        max_tf = tf
                else:
                    tokPostings[word][docNo] = [1, 0, 0]
            else:
                tokPostings[word] = {docNo: [1, 0, 0]}  # {docid: (tf,max_tf, doclen)}

        docInfo[docNo] = [max_tf, doclen]
        for word in tokPostings.keys():
            for doc in tokPostings[word]:
                tokPostings[word][int(doc)][1] = docInfo[int(doc)][0]
                tokPostings[word][int(doc)][2] = docInfo[int(doc)][1]
    sumOfDoclens = 0
    for doc in docInfo:
        sumOfDoclens += docInfo[doc][1]
    avgDoclen = sumOfDoclens / rows
    fullTokenDict = combineDicts(tokenDict, tokenDocs)  # combine dictionaries with same key set


    if fullTokenDict == -1:
        print("Failed in combining dictionaries")
        return
    # else:
    #     print(fullTokenDict)
    # print(tokenDict)
    # stemmedTokenDict, stemmedTokenDocs = stemmer(tokenDict)
    return fullTokenDict, tokPostings, avgDoclen
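The `combineDicts` helper referenced above is not included in the excerpt; given the comment ("combine dictionaries with same key set") and the `-1` failure check, a minimal sketch could look like this (an assumption, not the original code):

# Hypothetical helper: merge overall term frequencies with per-token document
# sets into {token: (tf, df, sorted doc ids)}; return -1 if the key sets differ,
# which is the failure value the caller checks for.
def combineDicts(tokenDict, tokenDocs):
    if set(tokenDict) != set(tokenDocs):
        return -1
    return {
        tok: (tokenDict[tok], len(tokenDocs[tok]), sorted(tokenDocs[tok]))
        for tok in tokenDict
    }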
Example #25
 def __init__(self, stopwords=None):
     self.stopwords = stopwords or set(sw.words('english'))
     self.lemmatizer = WordNetLemmatizer()
     self.word_cache = {}
Example #26
 def build_analyzer(self):
     lemm = WordNetLemmatizer()
     analyzer = super(LemmaCountVectorizer, self).build_analyzer()
     return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc)
                         if (not w.isdigit()) and len(w) >= 3)
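This override only works as a method of a CountVectorizer subclass; a self-contained sketch (with the per-token filter as corrected above, and an illustrative corpus) might look like:

from sklearn.feature_extraction.text import CountVectorizer
from nltk import WordNetLemmatizer

# Assumed surrounding class for the method above: lemmatize every token the
# stock CountVectorizer analyzer produces, dropping digits and short tokens.
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        lemm = WordNetLemmatizer()
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc)
                            if (not w.isdigit()) and len(w) >= 3)

# Illustrative usage
vectorizer = LemmaCountVectorizer(stop_words='english')
X = vectorizer.fit_transform(["The cats are chasing mice", "Dogs chase cats"])
print(sorted(vectorizer.vocabulary_))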
Example #27
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(str(sentence))
    ]
Example #28
    def __init__(self):
        # Lemmatizer for shortening each word to a more-commonly-used form of the word
        self._lemmatizer = WordNetLemmatizer()
        # Scraper to get common keywords from response
        self._keyword_scraper = KeywordScraper
        # Maximum number of types
        self._max_types = 3
        # Obvious religious keywords. These must be lowercase
        self._religion_words = [
            'god', 'spiritual', 'religion', 'worship', 'church', 'prayer'
        ]
        # Regex for url of government websites
        self._government_detector = r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[Gg][Oo][Vv](\.[a-zA-Z]{2})?$'
        # Lowest (highest number) rank a keyword can have and still count towards determining organization type
        self._max_rank = 40
        # Keywords to look for for other types. These must be lowercase
        self._type_words = {
            OrgTypesEnum.EDUCATION: [
                'education',
                'school',
                'study',
                'teach',
            ],
            OrgTypesEnum.ADVOCACY: [
                'advocacy',
                'lobby',
                'policy',
            ],
            OrgTypesEnum.RESEARCH: [
                'research',
                'conduct',
                'document',
                'identify',
                'analyze',
                'correlate',
                'compile',
                'report',
                'data',
                'publication',
                'journal',
                'periodical',
                'newsletter',
            ],
            OrgTypesEnum.PREVENTION: [
                'prevention',
                'intervention',
                'education',
                'development',
                'community',
                'ownership',
                'avoidance',
                'blockage',
                'determent',
                'forestalling',
                'halt',
                'hindrance',
                'impediment',
                'inhibitor',
                'interception',
                'interruption',
                'obstacle',
                'obstruction',
                'prohibition',
                'stoppage',
                'thwarting',
                'deterence',
            ],
            OrgTypesEnum.PROTECTION: [
                'protection',
                'rescue',
                'rehabilitation',
                'reintegration',
                'repatriation',
                'empowerment',
                'repatriation',
                'fulfilment',
                'freedom',
                'opportunity',
                'women',
                'conservation',
                'insurance',
                'preservation',
                'safeguard',
                'safety',
                'security',
                'shelter',
                'stability',
                'assurance',
                'barrier',
                'cover',
                'custody',
                'defense',
                'fix',
                'guard',
                'invulnerability',
                'reassurance',
                'refuge',
                'safekeeping',
                'salvation',
                'screen',
                'self-defense',
                'shield',
                'strength',
                'surety',
                'guarding',
            ],
            OrgTypesEnum.PROSECUTION: [
                'prosecution',
                'compliance',
                'abolish',
                'law',
                'enforcement',
                'regulatory',
                'regulation',
                'justice',
                'case',
                'cause',
                'claim',
                'lawsuit',
                'litigation',
                'proceeding',
                'suit',
            ],
        }

        # Stem search words (religious, general)
        self._religion_words = [
            self._lemmatizer.lemmatize(word) for word in self._religion_words
        ]
        for key in self._type_words.iterkeys():
            self._type_words[key] = [
                self._lemmatizer.lemmatize(word)
                for word in self._type_words[key]
            ]
Example #29
 or 'Negative' sentiment but a continuous rating.
"""
import nltk as nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk import PorterStemmer
from nltk import word_tokenize, WordNetLemmatizer
from collections import Counter
from nltk import NaiveBayesClassifier, classify

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
stop = set(stopwords.words('english'))
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()


def perform_nlp(text):
    """
    remove English stopwords
    removes punctuations
    lemmatizes words
    stems words
    """

    text = ''.join(c for c in text if c not in punctuation)

    # tokenize the sentence
    text = word_tokenize(text)
Example #30
def get_processed_posting_list_operations(query_words_deque: deque,
                                          operations: deque) -> dict:
    left_word_query: str = query_words_deque.popleft()  # get first input word
    left_word_query = WordNetLemmatizer().lemmatize(left_word_query)
    left_dict_post_list: dict = get_posting_list_for_token(left_word_query)
    multiple_postings: list = list()
    flag_next: bool = False
    while True:
        try:
            right_word_query: str = query_words_deque.popleft(
            )  # get next word
            right_word_query = WordNetLemmatizer().lemmatize(right_word_query)
            right_dict_post_list: dict = get_posting_list_for_token(
                right_word_query)
            curr_operation: str = operations.popleft(
            )  # current operation match keywords

            if curr_operation != "and":  # and only process by multiple

                if multiple_postings:  # if reached end of and list (next operation for exp: or)
                    left_dict_post_list = intersect_many_posting_lists(
                        multiple_postings)
                    multiple_postings = list()  # clean for future

                if curr_operation == "or":
                    left_dict_post_list = union_posting_lists(
                        left_dict_post_list,
                        right_dict_post_list)  # update left postings with
                    # with intersect value
                elif curr_operation == "ornot":
                    left_dict_post_list = union_posting_lists(
                        left_dict_post_list,
                        not_postings_list(right_dict_post_list))
                elif curr_operation == "andnot":
                    if left_dict_post_list and right_dict_post_list:
                        left_dict_post_list = subtract_from_left_right_posting_lists(
                            left_dict_post_list, right_dict_post_list)
            else:
                if not flag_next:  # there are no more combined with and tokens
                    if multiple_postings:
                        multiple_postings.append(
                            right_dict_post_list
                        )  # store while not reached end
                    else:
                        multiple_postings = [
                            left_dict_post_list, right_dict_post_list
                        ]  # initialization

                if not (left_dict_post_list
                        or right_dict_post_list):  # only empty
                    flag_next = True
                    multiple_postings = list()

        except IndexError:
            # print("End of query")
            # if curr_operation == "not":  # empty right already checked
            #     left_dict_post_list = not_postings_list(left_dict_post_list)  # update left postings
            # with intersect value
            if multiple_postings:
                left_dict_post_list = intersect_many_posting_lists(
                    multiple_postings)  # all clearly and processed here
            break
    return left_dict_post_list
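The posting-list helpers used above (get_posting_list_for_token, union_posting_lists, intersect_many_posting_lists, and so on) come from the surrounding project; a minimal sketch of two of them, assuming a posting list is a plain dict keyed by document id, could be (an illustration only, not the original code):

# Hypothetical posting-list helpers, assuming a posting list is a dict
# mapping document id -> positions (or any per-document payload).
def union_posting_lists(left: dict, right: dict) -> dict:
    merged = dict(left)
    for doc_id, payload in right.items():
        merged.setdefault(doc_id, payload)
    return merged

def intersect_many_posting_lists(posting_lists: list) -> dict:
    common_ids = set(posting_lists[0])
    for postings in posting_lists[1:]:
        common_ids &= set(postings)
    return {doc_id: posting_lists[0][doc_id] for doc_id in common_ids}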