Example #1
 def stem(self, query_df: pd.DataFrame, col: str):
     porter = PorterStemmer()
     query_df[col] = query_df[col].apply(
         lambda query: [porter.stem(word) for word in query])
     return query_df
Example #2
def stemmer(text):
    """Applies stemming input text."""
    ps = PorterStemmer()
    return ps.stem(text)
Example #3
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

##for w in example_words:
##    print(ps.stem(w))

new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))
Example #4
def calc_word_freq(
    df: dd.DataFrame,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, and then
    compute the frequency distribution of words and the total
    number of words.

    Parameters
    ----------
    df
        Groupby-count on the categorical column as a dataframe
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    col = df.columns[0]
    if stopword:
        # use a regex to replace stop words and non-alphanumeric characters with empty string
        df[col] = df[col].str.replace(fr"\b(?:{'|'.join(ess)})\b|[^\w+ ]", "")
    else:
        df[col] = df[col].str.replace(r"[^\w+ ]", "")
    # convert to lowercase and split
    df[col] = df[col].str.lower().str.split()
    # "explode()" to "stack" all the words in a list into a new column
    df = df.explode(col)

    # lemmatize and stem
    if lemmatize or stem:
        df[col] = df[col].dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        df[col] = df[col].apply(lem.lemmatize, meta="object")
    if stem:
        porter = PorterStemmer()
        df[col] = df[col].apply(porter.stem, meta="object")

    # counts of words, excludes null values
    word_cnts = df.groupby(col)[df.columns[1]].sum()
    # total number of words
    nwords = word_cnts.sum()
    # total uniq words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {
        "word_cnts": fnl_word_cnts,
        "nwords": nwords,
        "nuniq_words": nuniq_words
    }
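
For reference, calc_word_freq expects a two-column dask frame produced by a groupby-count on the text column. The snippet below is a minimal, hypothetical usage sketch (the column names and sample data are made up, and stopword=False avoids depending on the module-level ess stopword set referenced above):

import dask.dataframe as dd
import pandas as pd

# Hypothetical input: a groupby-count of a text column.
pdf = pd.DataFrame({"review": ["good movie", "bad movie", "good plot"]})
counts = pdf.groupby("review").size().reset_index(name="count")
ddf = dd.from_pandas(counts, npartitions=1)

res = calc_word_freq(ddf, top_words=10, stopword=False)
# The returned series is a lazy dask object until .compute() is called.
print(res["word_cnts"].compute())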
Example #5
def preprocess_story(story,
                     stem=True,
                     remove_stop_words=True,
                     remove_punctuation=True,
                     metaparagraph_size=5):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Split into a list of paragraphs
    paragraphs = story.split("<newline>")
    simplified_paragraphs = []
    untokenized_paragraphs = []
    par_index = 0

    # Loop through paragraphs
    while par_index < len(paragraphs):
        meta_paragraph = []

        # Combine small paragraphs into meta_paragraphs with at least some minimum number of sentences
        while par_index < len(paragraphs) and len(
                meta_paragraph) < metaparagraph_size:
            paragraph = paragraphs[par_index]

            # Split paragraph into a list of sentences
            sentences = nltk.sent_tokenize(paragraph)
            meta_paragraph += sentences
            par_index += 1

        meta_paragraph_unprocessed = meta_paragraph

        if remove_stop_words:
            meta_paragraph = [
                sentence.replace("<num>", " ") for sentence in meta_paragraph
            ]

        # For the tokenized version, split each sentence into a list of words
        paragraph_tokenized = [
            nltk.word_tokenize(sentence) for sentence in meta_paragraph
        ]
        # Extra preprocessing
        if remove_stop_words:
            paragraph_tokenized = [[
                word for word in sentence if word not in stop_words
            ] for sentence in paragraph_tokenized]
        if remove_punctuation:
            paragraph_tokenized = [[
                regex.sub(r'[\p{P}\p{Sm}`]+', '', word) for word in sentence
            ] for sentence in paragraph_tokenized]
            paragraph_tokenized = [[word for word in sentence if word != ""]
                                   for sentence in paragraph_tokenized]
        if stem:
            paragraph_tokenized = [[stemmer.stem(word) for word in sentence]
                                   for sentence in paragraph_tokenized]

        if len(meta_paragraph) < metaparagraph_size and len(
                untokenized_paragraphs) > 0:
            untokenized_paragraphs[-1] += meta_paragraph_unprocessed
            simplified_paragraphs[-1] += paragraph_tokenized
        else:
            if len(meta_paragraph) != 0:
                untokenized_paragraphs.append(meta_paragraph_unprocessed)
                simplified_paragraphs.append(paragraph_tokenized)

    return untokenized_paragraphs, simplified_paragraphs
Example #6
def stemmingWords(sent):
    PS = PorterStemmer()
    stemmed_sent = []
    for w in sent:
        stemmed_sent.append(PS.stem(w))
    return stemmed_sent
Example #7
e.g. to read files, preprocess text, etc.
"""
import sys
import platform
from os import system
from os import listdir
from os.path import isfile, join
from string import punctuation
from nltk import pos_tag, sent_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer() # Initialize lemmatizer once.
stemmer = PorterStemmer() # Initialize Porter's stemmer once.

stop_words = set(stopwords.words('english')).union([ # Augment the stopwords set.
    'don','didn', 'doesn', 'aren', 'ain', 'hadn',
    'hasn', 'mightn', 'mustn', 'couldn', 'shouldn',
    'dont', 'didnt', 'doesnt', 'arent', 'aint',
    'hadnt', 'hasnt', 'may', 'mightve', 'couldnt',
    'shouldnt', 'shouldnot', 'shouldntve', 'mustnt',
    'would', 'woulda', 'wouldany', 'wouldnot', 'woudnt',
    'wouldve', 'must', 'could', 'can', 'have', 'has',
    'do', 'does', 'did', 'are', 'is', 'ive', 'cant', 'thats',
    'isnt', 'youre', 'wont', 'from', 'subject', 'hes', 'etc',
    'edu', 'com', 'org', 've', 'll', 'd', 're', 't', 's'])

def get_wordnet_tag(tag):
    """
Example #8
def process_email(filename):
        
    import re
    import numpy as np
    import pandas as pd
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer() 
    #==============================================================================
    #   Process email function
    #==============================================================================
    email_pattern = r'[A-Z0-9._%+-]+@[A-Z0-9._%+-]+\.[A-Z]{2,4}'
    email_regex = re.compile(email_pattern, flags = re.IGNORECASE)
    #url_pattern = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?'
    url_pattern = r'(http|https)://[^\s]*'
    url_regex = re.compile(url_pattern, flags = re.IGNORECASE)
    
    number_pattern = r'[0-9]+'
    number_regex = re.compile(number_pattern)
    
    dollar_pattern = r'[$]+'
    dollar_regex = re.compile(dollar_pattern)
    
    http_pattern = r'<[^<>]+>'
    http_regex = re.compile(http_pattern)
    
    nonword_pattern = r'[^a-zA-Z0-9]'
    nonword_regex = re.compile(nonword_pattern)
    
    email_list = []
    
    # Get the body of the email
    line = get_email_text(filename)
    
    # process the body of the email.
    line = line.lower()
    line = http_regex.sub(' ',line)
    line = email_regex.sub('emailaddr',line)
    line = url_regex.sub('httpaddr',line)
    line = number_regex.sub('number',line)
    line = dollar_regex.sub('dollar',line)
    line = nonword_regex.sub(' ',line)
    listline = line.split()    
    newline = []
    for word in listline:
        word = word.strip()
        word = stemmer.stem(word)
        newline.append(word)

#   print(line)    
    email_list.extend(newline)
#   print(email_list)    
    vocab_filename = '../vocab.txt' 
    b = pd.read_table(vocab_filename, header = None)
    vocab = pd.DataFrame(b) 
    vocab = pd.Series(vocab[1])
    
    invocab = np.array(vocab[vocab.isin(email_list)].index)
#   print(invocab)
    x = np.zeros(len(vocab))
    x[invocab] = 1
    
    # Sanity checks
#    print(invocab.shape)
#    print(x.shape)
#    print(x[x==1].shape)
    return x    
Example #9
def load_and_cache_examples(args, task, tokenizer, evaluate=False, mode=None):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(
            cached_features_file) and not args.overwrite_cache and False:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if mode == 'train':
            examples = processor.get_train_examples(
                os.path.join(args.data_dir, args.train_file))
        elif mode == 'eval':
            examples = processor.get_dev_examples(
                os.path.join(args.data_dir, args.dev_file))
        elif mode == 'predict':
            examples = processor.get_test_examples(
                os.path.join(args.data_dir, args.test_file))

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.use_matchings:
            assert len(features) == len(examples)
            if args.do_eval or args.do_train:
                with open(os.path.join(args.data_dir, 'train_instances_train'),
                          'rb') as f:
                    train_examples = pickle.load(f)
            if args.do_predict:
                with open(os.path.join(args.data_dir, 'train_instances'),
                          'rb') as f:
                    train_examples = pickle.load(f)
            ps = PorterStemmer()
            for i in range(len(features)):
                features[i].matchings = get_matchings(examples[i].text_a,
                                                      train_examples, ps)

        if args.local_rank in [-1, 0] and False:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)

    if args.use_matchings:
        all_matchings = torch.tensor([f.matchings for f in features],
                                     dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                all_token_type_ids, all_labels, all_matchings)
    else:
        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                all_token_type_ids, all_labels)
    return dataset
Example #10
 def stemming(self, bow):
     stemmer = PorterStemmer()
     return [stemmer.stem(word) for word in bow]
Example #11
def extract_cooccurence():
    global cfd
    if len(sys.argv) > 1:
        # Define the data path
        data_path = sys.argv[1]
    start_time = time.time()

    list_of_file = sorted(glob.glob(data_path))
    cfd = nltk.ConditionalFreqDist()
    list_freq = nltk.FreqDist()

    stop = set(stopwords.words('english'))
    if not STOP_FLAG:
        stop = []
    ps = PorterStemmer()

    for index, fname in enumerate(list_of_file):
        print("No.{} File: {}".format(index, fname))
        with open(fname, encoding='latin') as file:
            raw = file.read()
            # Extract all the <TEXT> field
            result = re.findall(r'<TEXT>(.*?)</TEXT>', raw, re.DOTALL)
            texts = ''.join(result)
            # Tokenize
            tokens = word_tokenize(texts)
            # Keep only alphabetical tokens, in lower case
            # Filter by stopwords
            tokens_norm = [t.lower() for t in tokens if t.isalpha() and (t.lower() not in stop)]

            # Count the Frequency for each word
            list_freq = nltk.FreqDist(tokens_norm)

            # Token neighbour window
            wnd = [''] * WND_SIZE  # sliding window of the last WND_SIZE tokens
            for t in tokens_norm:
                wnd.append(t)
                wnd = wnd[-WND_SIZE:]
                # Add to conditional frequency table
                add_conditional_frequence_table(wnd)

    print("Time1: {}".format(time.time() - start_time))

    cfd_filter = nltk.ConditionalFreqDist()
    # Filter the MIN_COOCC and Calculate the score

    # Calculate cfd.N()
    cfd_N = list_freq.N()*TERM_DISTANCE*2
    for term_i in cfd:
        cfd_filter[term_i] = nltk.FreqDist({term_j: score_term_in_term(term_j, term_i, cfd_N)
                                            for term_j in cfd[term_i] if cfd[term_i][term_j] > MIN_COOCC})
        cfd[term_i].pop(term_i, None)  # drop self co-occurrence
    print("Time2: {}".format(time.time() - start_time))
    cfd_topn = nltk.ConditionalFreqDist()
    # Get the TOP N
    for w in cfd_filter:
        cfd_topn[w] = nltk.FreqDist(dict(cfd_filter[w].most_common(DOUBLE_TOP_N)))
    print("Time3: {}".format(time.time() - start_time))

    print("Time4: {}".format(time.time() - start_time))

    file_tag = {
        'dist': '_dist'+str(TERM_DISTANCE),
        'min': '_min'+str(MIN_COOCC),
        'top': '_top'+str(TOP_N),
        'stop': '_stp' if STOP_FLAG else '',
        'pmi': '_pmi' if PMI_FLAG else ''
    }

    ujson.dump(cfd_topn, open("/Users/jeanneluo/Downloads/ap_cfd{dist}{min}{top}{stop}{pmi}.json".format(
        **file_tag), "w"), double_precision=3)

    print("Time5: {}".format(time.time() - start_time))
    pdb.set_trace()
    return cfd_topn
Example #12
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer

stem = PorterStemmer()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

file = open("articles/computer.txt", "rt")
contents = file.read()
file.close()

tokenized_text = sent_tokenize(contents)
# print("TOKENIZED TEXT ------------------------------------------------------------")
# print(tokenized_text)

tokenized_word = word_tokenize(contents)
print("TOKENIZED WORD ------------------------------------------------------------")
print(tokenized_word)
Example #13
def stemming(file):

    print("Performing Stemming ...")

    #Using PorterStemmer from nltk for stemming
    ps = PorterStemmer()

    #Path for output file
    output_file = 'Data/stemmed_Lemetized_comments.csv'

    #Reading data from input file
    data = pd.read_csv(file)
    date = list(data.Date)
    label = list(data.Tag)
    thread = list(data.Comment_thread_id)
    comment_list = list(data.comments)
    comment_pos = list(data.comment_position)

    tokenized_comments_list = []
    pos_tagged_comments = []
    stemmed_tokens = []
    stemmed_comments = []
    tokenized_comments = []

    #Iterating through each comment one by one and stemming
    for cmt in comment_list:
        comment = ''
        if (str(cmt) == 'nan'):
            tokenized_comments = []
        else:
            tokenized_comments = word_tokenize(cmt)

        for word in tokenized_comments:
            #Stemming the tokenized words and appending to a new string comment
            stemmed_word = ps.stem(word)
            comment = comment + " " + stemmed_word

        #Appending the stemmed comment to processed comments list
        stemmed_comments.append(comment)
        tokenized_comments.clear()

    i = 0
    #Output file creation to store the stemmed comments
    out = open(output_file, 'w', newline='', encoding='utf8')
    fieldnames = [
        'Date', 'Comment_thread_id', 'comment_position', 'Tag', 'comments'
    ]
    writer = csv.DictWriter(out, fieldnames)
    writer.writerow({
        'Date': 'Date',
        'Comment_thread_id': 'Comment_thread_id',
        'comment_position': 'comment_position',
        'Tag': 'Tag',
        'comments': 'comments'
    })

    #Iterating through the stemmed comments and storing them in the output csv file
    for cmt in stemmed_comments:

        cmt = cmt.replace('\\n', '')
        writer.writerow({
            'Date': date[i],
            'Comment_thread_id': thread[i],
            'comment_position': comment_pos[i],
            'Tag': label[i],
            'comments': cmt
        })
        i = i + 1

    print("Stemming successfully done")

    #Return the file_name of the output file
    return output_file
Example #14
def stemming(text):
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text]
    return text
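
Note that, despite the parameter name, stemming() above iterates over its argument element by element, so it expects a pre-tokenized list; passing a raw string would stem individual characters. A small, hypothetical usage sketch (the PorterStemmer import omitted from the excerpt is assumed):

from nltk.tokenize import word_tokenize

tokens = word_tokenize("The runners were running quickly")
print(stemming(tokens))  # e.g. ['the', 'runner', 'were', 'run', 'quickli']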
Example #15
        #Loop gather content from doc

        for i in f:

            #Convert to lower case

            s += ' '.join(i.split()).lower()

    return s


#Main Code
#Create object for PorterStemmer

PorterS = PorterStemmer()

#Stop words
stop_words = set(stopwords.words('english'))

#call method to read the file; pass your own path as the parameter instead of '/content/drive/My Drive/Colab Notebooks/result1.txt'
document = readFile('.txt')

#Token converter
word_tokens = word_token(document)

#Stem content
filter_sentence = [PorterS.stem(w) for w in word_tokens if w not in stop_words]

print("File content before stem:\n", word_tokens)
Example #16
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.stemmer = PorterStemmer()
Example #17
class Bayes_stemmed_porter(Bayes_stemmed):

    STEMMER = PorterStemmer()
Example #18
def stem_text(text):
    ps = PorterStemmer()
    stemmed_text = []
    for word in text.split(" "):
        stemmed_text.append(ps.stem(word))
    return " ".join(stemmed_text)
Example #19
    def __steeming(self):

        porter = PorterStemmer()
        return [porter.stem(word) for word in self.tokenized]
Example #20
def preprocess(tweets, text_options):

    #Translate emojis for all tweets (this is the default in parsing)
    emojis = {}
    with open('data/emojilist5.csv', 'r') as f:
        for line in f:
            unic = line.split(',')[0].lower()
            trans = line.split(',')[1]
            emojis[unic] = trans
    tweets = tweets.apply(emojify, args=(emojis, ))

    #Metadata information clean for all tweets
    pat1 = r'https?://[A-Za-z0-9./]+'
    pat2 = r'www\.[^ ]+'
    combined_pat = r'|'.join((pat1, pat2))
    url_pat = re.compile(combined_pat)

    pat3 = r'\\u[^ ]+'
    unicode_pat = re.compile(pat3)

    pat4 = r'@[A-Za-z0-9_]+'
    mention_pat = re.compile(pat4)
    tweets = tweets.apply(metadata_clean,
                          args=(url_pat, unicode_pat, mention_pat))

    #Expand negations
    if text_options['negation_expand'] is True:
        negations_dic = {
            "isn\'t": "is not",
            "aren\'t": "are not",
            "wasn\'t": "was not",
            "weren\'t": "were not",
            "haven\'t": "have not",
            "hasn\'t": "has not",
            "hadn\'t": "had not",
            "won\'t": "will not",
            "wouldn\'t": "would not",
            "don\'t": "do not",
            "doesn\'t": "does not",
            "didn\'t": "did not",
            "can\'t": "can not",
            "couldn\'t": "could not",
            "shouldn\'t": "should not",
            "mightn\'t": "might not",
            "mustn\'t": "must not",
            "shan\'t": "shall not",
            "ain\'t": "am not"
        }

        neg_expand_pattern = re.compile(r'\b(' +
                                        '|'.join(negations_dic.keys()) +
                                        r')\b')
        tweets = tweets.apply(negation_expand,
                              args=(neg_expand_pattern, negations_dic))

    #Remove punctuation
    if text_options['punctuation_remove'] is True:
        tweets = tweets.apply(punctuation_remove)

    #Remove metadata- hashtags, urls, mentions, unicode
    if text_options['metadata_remove'] is True:
        tweets = tweets.apply(metadata_remove)

    #Remove emojis from tweets
    if text_options['emoji_remove'] is True:
        tweets = tweets.apply(emoji_remove)

    #Remove digits
    if text_options['digits_remove'] is True:
        tweets = tweets.apply(digits_remove)

    #Mark negations
    if text_options['negation_mark'] is True:
        neg_words = [
            'not', 'never', 'no', 'nothing', 'noone', 'nowhere', 'none',
            'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
            'wont', 'wouldnt', 'dont', 'doesnt', 'didnt', 'cant', 'couldnt',
            'shouldnt', 'mightnt', 'mustnt', 'shant', 'aint'
        ]
        neg_mark_pattern = re.compile(r'\b(' + '|'.join(neg_words) + r')\b')
        tweets = tweets.apply(negation_marking, args=(neg_mark_pattern, ))

    #Normalization
    if text_options['normalize'] is True:
        repeat_pattern = re.compile(r"(.)\1{2,}")
        tweets = tweets.apply(normalize_text, args=(repeat_pattern, ))

    #Remove stopwords: done before stemming and after negation marking (stemmer stems stopwords, if done before negation marking, some negation words removed)
    if text_options['stopwords_remove'] is True:
        tweets = tweets.apply(stopwords_remove)

    #Stemming. Note: will convert to lowercase and also stem stopwords (such as 'was' to 'wa')
    if text_options['stemming'] is True:
        ps = PorterStemmer()
        tweets = tweets.apply(stemming_apply, args=(ps, ))

    #Lowercasing of text
    if text_options['lower'] is True:
        tweets = tweets.str.lower()

    return tweets
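
As the comments in preprocess() note, the Porter stemmer also mangles stopwords, which is why stopword removal runs before stemming here. A quick standalone check of that behaviour (not part of the original example):

from nltk.stem import PorterStemmer

ps = PorterStemmer()
print(ps.stem("was"), ps.stem("this"), ps.stem("Running"))
# -> wa thi run  (output is lowercased, and stopwords get clipped too)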
Example #21
 def __init__(self):
     self.porterStemmer = PorterStemmer()
Example #22
    def fillblanks(self, sent):
        """Méthode principale :
        
        Génère une phrase avec un verbe manquant depuis un sous-titre reçu dont l'élève doit trouver le bon verbe parmis d'autres.
        """

        ext = ["'s", "'re", "is", "are", "'ve", "'m", "am"]

        # find the verbs in the text by POS-tagging with nltk
        p = []
        for s in sent:
            token = nltk.word_tokenize(s)
            pos = nltk.pos_tag(token)
            p.append(pos)

        # build a list of POS-tagged verbs
        f = []
        for i in range(0, len(p)):
            for l in range(0, len(p[i])):
                if p[i][l][1] in list(M.modes.keys()):
                    f.append(p[i][l])

        # remove duplicates with a set()
        g = []
        for i in range(len(f)):
            if f[i][0] in ext:
                continue
            else:
                g.append(f[i])
        g = dict(set(g))

        # build the sentences with the blanks
        ans = []
        fill = []
        for s in sent:
            for w in g.keys():
                if w in s:
                    m = s.replace(w, ' ___________ ')
                    fill.append(m)
                    ans.append(w)
        prefinal = np.column_stack((fill, ans))
        prefinal = pd.DataFrame(prefinal, columns=['Phrase', 'Answer'])

        answers = pd.DataFrame(prefinal.iloc[:, 1])
        answers['2'] = None
        answers['3'] = None
        answers['4'] = None

        # iterate over all the answers,
        # look up the POS tag from the set g and
        # assign synonyms to each one
        stem = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        for i in range(len(answers)):
            synonyms = []
            antonyms = []

            word = answers.iloc[i, 0]
            word_tag = g[word]

            # find the synonyms and antonyms of each word
            for syn in wordnet.synsets(word):
                for l in syn.lemmas():
                    p1 = re.compile('[A-Za-z]+_[A-Za-z]+')
                    p2 = re.compile('[A-Za-z]+_[A-Za-z]+_[A-Za-z]+')
                    p3 = re.compile('[A-Za-z]+_[A-Za-z]+_[A-Za-z]+_[A-Za-z]+')
                    p4 = re.compile('[A-Za-z]+-[A-Za-z]+-[A-Za-z]+')

                    if any([
                            re.match(l.name(), word, re.IGNORECASE),
                            re.match(word, l.name(), re.IGNORECASE),
                            l.name() == stem.stem(word),
                            l.name() == lemmatizer.lemmatize(word, 'v'),
                            p1.match(l.name()),
                            p2.match(l.name()),
                            p3.match(l.name()),
                            p4.match(l.name())
                    ]):
                        continue
                    synonyms.append(l.name())

                    if l.antonyms():
                        antonyms.append(l.antonyms()[0].name())

            # check that the synonym list is not empty
            if len(synonyms) != 0:
                answers.iloc[i, 1] = self.fill(synonyms[0], word_tag)
                answers.iloc[i, 2] = self.fill(synonyms[1], word_tag)
            else:
                continue

        # concatenate the answers into a data frame
        final = np.column_stack((prefinal, answers))
        final = pd.DataFrame(final,
                             columns=['Phrase', 'Answer', '1', '2', '3', '4'])

        return final
Example #23
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    temp = text.replace("\\r", " ")
    temp = temp.replace("\\n", " ")
    return [stemmer.stem(w) for w in word_tokenize(temp)]
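
A text-in, token-list-out callable like stemming_tokenizer is the shape commonly passed as a custom tokenizer to scikit-learn vectorizers; the snippet below is a hypothetical illustration of that pattern, not part of the original example:

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(tokenizer=stemming_tokenizer)
X = vec.fit_transform(["The runners were running", "A runner runs fast"])
print(vec.get_feature_names_out())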
Example #24
               we must protect and nurture and build on. If we are not free, no one will respect us.
               My second vision for India’s development. For fifty years we have been a developing nation.
               It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
               in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
               Our achievements are being globally recognised today. Yet we lack the self-confidence to
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career
'''

sp = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(para)
corpus = []

for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [
        wordnet.lemmatize(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)
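
The PorterStemmer created above as sp is unused in this excerpt; a stemmed variant of the same cleaning loop could look like the following sketch (same structure as the original, with stemming in place of lemmatization):

stemmed_corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i]).lower().split()
    review = [sp.stem(word) for word in review
              if word not in set(stopwords.words('english'))]
    stemmed_corpus.append(' '.join(review))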
Example #25
def clean_tweets(data,
                 save_to_file=False,
                 stopwords_stemming=False,
                 path="resources/Sentiment140_clean.csv"):
    """
    This function prepares and cleans all tweets within the given data frame. Contractions handling, lowercase
    delete special signs, @s with username, https, word stemming, stopwords removal, ...

    :param data: pandas data frame containing raw Sentiment140 tweets
    :param save_to_file: option to save cleaned data
    :param stopwords_stemming: set True if stopwords removal and word stemming should be applied
    :param path: path to save cleaned data
    :return: cleaned/prepared pandas data frame containing tweets
    """

    tweets = []
    labels = []
    data['label'].replace([4], 1, inplace=True)
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    for i in range(len(data.text)):
        if (i + 1) % 100000 == 0:
            print(
                f"INFO:{i + 1} of {len(data.text)} tweets have been processed."
            )

        # apply different cleaning steps on each tweet
        tweet = data.text[i]
        label = data.label[i]
        tweet = tweet.lower()
        tweet = re.sub(r"@\S+", "", tweet)
        tweet = re.sub(r"http\S+", "", tweet)
        tweet = re.sub(r"www.\S+", "", tweet)
        tweet = " ".join(
            [contractions_mapping.get(i, i) for i in tweet.split()])
        tweet = re.sub("[^a-zA-Z]", " ", tweet)

        # tokenize tweet
        word_tokens = word_tokenize(tweet)

        # if parameter is set: apply stopwords removal and stemming
        if stopwords_stemming is True:
            words_temp = [w for w in word_tokens if w not in stop_words]
            word_tokens = [ps.stem(w) for w in words_temp]

        #  put words back together to tweet (eliminates 'double-spaces' which might have been created)
        tweet = []
        for w in word_tokens:
            tweet.append(w)
        tweet = " ".join(tweet)

        tweets.append(tweet)
        labels.append(label)

    print(
        f"INFO:{len(data.text)} of {len(data.text)} tweets have been processed."
    )

    # concat to data frame and drop empty entries
    data = pd.DataFrame({'label': labels, 'text': tweets})
    data_clean = drop_empty_entries(data)

    # save csv file if demanded
    if save_to_file is True:
        data_clean.to_csv(path, encoding="utf-8")
        print(f"INFO: cleaned tweets saved to path: {path}")

    return data_clean
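
A hypothetical call of clean_tweets on a tiny Sentiment140-style frame (the sample tweets are made up, and the module-level contractions_mapping and drop_empty_entries helpers from the example's source are assumed to be available):

import pandas as pd

raw = pd.DataFrame({
    "label": [0, 4],
    "text": ["@user I don't like this :( http://t.co/xyz", "Loving the new phone!!!"],
})
clean = clean_tweets(raw, save_to_file=False, stopwords_stemming=True)
print(clean)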
Example #26
    def form_full_questions(self, candidate, jsondata, tagged):
        """
        flag : distinguishes a blank question from an actual question
        0 = blank question    1 = actual question
        ans : keeps track of the modified answer
        0 = no change in answer
        ans = <other than 0> is the modified answer
        """
        full_ques = candidate['Question']
        sentence = candidate['Sentence']
        answer = candidate['Answer']
        flag = 0
        ans = 0
        new_full_ques = []

        pattern_strings = self.pattern_verb_noun(candidate['Sentence'],
                                                 jsondata)

        for word, pos in tagged:
            # check whether the word is in the answer
            if ((answer.find(word)) >= 0):
                # check whether the blank is at the start of the sentence
                if (flag == 0) and ((sentence.find(word)) == 0):
                    if ((('NN' == pos) or ('NNP' == pos) or
                         ('NNPS' == pos)) and "PERSON" in jsondata) and (
                             word in jsondata['PERSON']):
                        full_ques = full_ques.replace("_____", 'Who')
                        full_ques = full_ques + "?"
                        flag = 1
                    elif ("LOCATION" in jsondata and
                          (word in jsondata['LOCATION'])) or (
                              "GPE" in jsondata and
                              (word in jsondata['GPE'])):

                        full_ques = full_ques.replace("_____", 'Where')
                        full_ques = full_ques + "?"
                        flag = 1
                    elif ('NN' == pos) or ('NNP' == pos) or ('NNPS' == pos):
                        full_ques = full_ques.replace("_____", 'What')
                        full_ques = full_ques + "?"
                        flag = 1

                if (flag == 0 and (len(pattern_strings) > 0)):
                    for pattern_string_no in range(len(pattern_strings)):
                        if ((('NN' == pos) or ('NNP' == pos) or
                             ('NNPS' == pos)) and "PERSON" in jsondata
                            ) and (word in jsondata['PERSON']):
                            individual_words = pattern_strings[
                                pattern_string_no].split()
                            verb = [
                                word for word in individual_words
                                if word not in jsondata['PERSON']
                            ]
                            print "Verb : ", str(verb)
                            print "pattern_strings[pattern_string_no] : ", str(
                                pattern_strings[pattern_string_no])
                            full_ques = sentence.replace(
                                str(pattern_strings[pattern_string_no]), '')
                            full_ques = "What " + str(
                                verb[0]) + " " + str(full_ques).lower() + "?"
                            print "word : " + word + "  pos : " + pos
                            flag = 1

                if (flag == 0):
                    pattern_strings, verbs, nouns = self.pattern_verb_dt_adj_noun(
                        candidate['Sentence'], jsondata)
                    print "pattern_strings : "
                    print pattern_strings
                    if (len(pattern_strings) > 0):
                        if (jsondata.has_key("LOCATION") and
                            (word in jsondata['LOCATION'])) or (
                                jsondata.has_key("GPE") and
                                (word in jsondata['GPE'])):
                            for pattern_string_no in range(
                                    len(pattern_strings)):
                                if (pattern_strings[pattern_string_no].find(
                                        answer) >= 0):
                                    print "pattern_string_no : ", pattern_string_no
                                    individual_words = pattern_strings[
                                        pattern_string_no].split()
                                    print "individual_words :"
                                    print individual_words
                                    verb = [
                                        word for word in individual_words
                                        if word in verbs
                                    ]
                                    print "Verb : ", str(verb)
                                    print "pattern_strings[pattern_string_no] : ", pattern_strings[
                                        pattern_string_no]
                                    full_ques = sentence.replace(
                                        pattern_strings[pattern_string_no], '')
                                    full_ques = "Where " + str(
                                        verb[0]) + " " + str(
                                            full_ques).lower() + "?"

                                    noun = [
                                        word for word in individual_words
                                        if word in nouns
                                    ]
                                    ps = PorterStemmer()
                                    ans = ps.stem(word)
                                    # print "word : " + word + "  pos : " + pos
                                    flag = 1

        new_full_ques.append(full_ques)
        print(new_full_ques)
        return new_full_ques, ans, flag


# if __name__ == '__main__':
#     form_full_questions("Ahmedabad is walking run a very good city")
Example #27
# To find the frequency of top 10 words
fdist1 = fdist.most_common(10)
fdist1


#--------------------------------------------------
### Stemming
#--------------------------------------------------

# Stemming is the process of reducing words to
# their root form.

# Approach 1: Importing PorterStemmer from the nltk library
# Checking for the word 'waiting'
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem("waiting")

# Categorizing the list of words
stm = ["waited", "waiting", "waits"]
for word in stm :
   print(word+ ":" +pst.stem(word))


# Approach 2: Importing LancasterStemmer from nltk
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
stm = ["giving", "given", "given", "gave"]
for word in stm :
    print(word + ":" + lst.stem(word))
Example #28
    def stemming_tweets(self, tweet):
        ps = PorterStemmer()

        tweets_stemming = ps.stem(tweet)

        return tweets_stemming
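
Note that ps.stem(tweet) above treats the whole tweet as a single token, so only the tail of the string is affected. To stem every word, one would normally tokenize first; a minimal sketch (not the original method):

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

def stem_tweet_tokens(tweet):
    # Stem each token separately and join the results back into a string.
    return " ".join(ps.stem(tok) for tok in word_tokenize(tweet))

print(stem_tweet_tokens("The runners were running"))  # e.g. "the runner were run"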
Example #29
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""

#cleaning the text
import re  # for regular expressions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

sentences = nltk.sent_tokenize(paragraph)

cleaned_sentences = []

for i in range(len(sentences)):

    review = re.sub(
        '[^a-zA-Z]', " ", sentences[i]
    )  #replaces all the characters except a-z and A-Z letters with spaces
    review = review.lower()  # lowering uppercase characters
    review = review.split(
    )  # splitting on the basis of spaces creates list of words
Example #30
def processEmail(email_contents):
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]
    
    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and get rid of any punctuation
#    [str, email_contents] = ...
#       strtok(email_contents, ...
#              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s]+', email_contents)
#    print(email_contents)

    # Output the email to screen as well
    #print('\n==== Processed Email ====\n\n')
    # Process file
    l = 0
    for token in email_contents:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)
        # Stem the word 
        token = PorterStemmer().stem(token.strip())
        # Skip the word if it is too short
        if len(token) < 1:
            continue
        idx = vocabList[token] if token in vocabList else 0
        # only add entries which are in vocabList
        #   i.e. those with ind ~= 0, 
        #        given that ind is assigned 0 if str is not found in vocabList
        if idx > 0:
            word_indices.append(idx)
        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print(token)
        l = l + len(token) + 1

    # Print footer
    #print('\n\n=========================\n')
    
    return word_indices
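
A hypothetical call of processEmail (the getVocabList helper and the re/PorterStemmer imports from the exercise's module are assumed; the sample email text is made up):

sample = ("Anyone knows how much it costs to host a web portal? "
          "Visit http://www.example.com or email info@example.com, it is only $10.")
word_indices = processEmail(sample)
print(word_indices[:20])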