def clean_document(document):
    '''
    Takes in a document string.
    Returns the cleaned, stemmed string.
    '''
    # lowercase the string
    doc_lower = document.lower()

    #tokenize
    tokens = word_tokenize(doc_lower) 
    
    # remove punctuation
    punc = set(string.punctuation)
    tokens_no_punc = [word for word in tokens if word not in punc]
   
    # remove stopwords
    s_words = set(stopwords.words('english'))
    # also treat common measurement words as stopwords
    s_words.update(['tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'oz', 'lb', 'c.'])
    tokens_no_sw = [word for word in tokens_no_punc if word not in s_words]
    
    # stem the words to get rid of multiple forms of the same word
    porter = PS()
    tokens_stemmed = [porter.stem(word) for word in tokens_no_sw]
    
    # join all words into one string
    cleaned_doc = ' '.join(tokens_stemmed)
    
    return cleaned_doc
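A minimal usage sketch for clean_document(), assuming NLTK is installed and its tokenizer ('punkt') and 'stopwords' data have been downloaded; the imports are the names the function refers to, and the sample recipe string and expected output are illustrative only.

import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as PS
from nltk.tokenize import word_tokenize

# hypothetical recipe-style input; the measurement words are dropped as stopwords
print(clean_document("Add 1 tablespoon of chopped onion and 1 cup of rice."))
# expected to print something like: add 1 chop onion 1 rice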
Example #2
def nlp2line(f_name='sentiment.txt'):
    sentences = []
    sentence_word_list = []
    reg = re.compile(r'''
        (?<=[.;:?!])    # following a . ; : ? or !
        \s              # a whitespace character
        (?=[A-Z])       # match only when an uppercase letter follows
        ''',
                     flags=re.VERBOSE)
    with open(f_name, "r", encoding='latin-1') as f:
        for line in map(lambda x: x.rstrip(), f):
            if not line:
                continue
            for res_line in reg.split(line):
                sentences.append(res_line)
    for sentence in sentences:
        # collect this sentence's words into a fresh list
        word_list = re.findall("[a-zA-Z0-9]{2,}", sentence)
        sentence_word_list.append(word_list)
    for i, sentence in enumerate(sentence_word_list):
        # remove stopwords
        sentence_word_list[i] = [
            word for word in sentence if not is_stopword_takahashi(word)
        ]
    ps = PS()
    for i, sentence in enumerate(sentence_word_list):
        # stem each remaining word
        for j in range(len(sentence)):
            sentence_word_list[i][j] = ps.stem(sentence_word_list[i][j])
    return sentence_word_list
Example #3
def extract_features(file_path):
    ps = PS()
    labels, docs = [], []
    for line in open(file_path):
        label, *sentence = line.split()
        tokens = [stem for stem in map(ps.stem, sentence) if check(stem)]
        labels.append(int(label))
        docs.append(" ".join(tokens))
    return labels, docs
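A hedged usage sketch for this extract_features(): the unpacking above implies one whitespace-separated line per document with an integer label first. The file name and sample lines below are hypothetical, and check() is an external filter assumed to be defined elsewhere on this page.

# hypothetical input file, e.g. sentiment_features.txt:
#   +1 an engrossing and ultimately tragic portrait
#   -1 a tedious and overlong mess
labels, docs = extract_features("sentiment_features.txt")
print(labels[0], docs[0])  # 1, followed by the stems that pass check()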
Example #4
def remove_noise(words):
    from nltk.stem.porter import PorterStemmer as PS

    result = []
    ps = PS()
    for w in words:
        # skip stopwords and one-character tokens
        if not ex71.is_stop_word(w) and len(w) > 1:
            # stem each remaining word and append it to the result
            result += [ps.stem(w)]
    return result
Example #5
def data_preprocessing(data):
  tokens = word_tokenize(data)
  words = [token for token in tokens if token.isalpha()]
  no_integers = [x for x in words if not isinstance(x, int)]
  porter = PS()
  stemmed = [porter.stem(word) for word in no_integers]
  stop_words = stopwords.words('english')
  words_new = [word for word in stemmed if word not in stop_words]
  tokens = [w.lower() for w in words_new]
  cleaned_sentence = " "
  cleaned_sentence = cleaned_sentence.join(tokens)
  return cleaned_sentence
Example #6
def getFeatures(line):
    ps = PS()
    features = []
    sentiment = line[:2]
    for word in line[3:].split():
        word = word.strip()
        if is_stopword(word):
            continue
        else:
            if len(word) > 1:
                features.append(ps.stem(word))
    return features, sentiment
def doc_read(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    tokens = text.lower()
    tokens = re.sub(r'[^\w\s]', '', tokens)
    tokens = tokens.split()
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    ps = PS()
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
Example #8
def doc_read(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    tokens = text.lower()
    tokens = tokens.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    ps = PS()
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    tokens = " ".join(tokens)
    return tokens
Example #9
def stemming(no_stopword_features="no_stopword_features.txt",
             features="features.txt"):
    pbar = tqdm(total=10662)
    with open(no_stopword_features,
              "r", encoding="latin-1") as f, open(features,
                                                  "w",
                                                  encoding="latin-1") as fw:
        ps = PS()
        for line in f:
            label_words = line.rstrip().split(" ")
            label = label_words[0]
            words = label_words[1:]
            fw.write(label)
            for word in words:
                word = ps.stem(word)
                fw.write(" " + word)
            fw.write("\n")
            pbar.update(1)
    pbar.close()
def Result():
    print("Enter a string to judge sentiment:")
    S = input()
    S = S.lower()
    S = re.sub(r'[^\w\s]', '', S)
    S = S.split()
    S = [word for word in S if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    ps = PS()
    S = [ps.stem(w) for w in S if w not in stop_words]
    S = [word for word in S if len(word) > 1]
    good = 0
    bad = 0
    for i in S:
        if i in common:
            tmpvar1 = common.index(i)
            tmpvar2 = commonprob[tmpvar1]
            good = good + (S.count(i) * tmpvar2[1])
            bad = bad + (S.count(i) * tmpvar2[2])
        elif i in finalp:
            good = good + 1
        elif i in finaln:
            bad = bad + 1
    # intensifiers ('highli' is the stem of 'highly') double the weight of the next word
    for i in range(0, len(S) - 1):
        if S[i] == 'highli' or S[i] == 'much':
            if S[i + 1] in common:
                tmpvar1 = common.index(S[i + 1])
                tmpvar2 = commonprob[tmpvar1]
                good = good + (2 * S.count(S[i + 1]) * tmpvar2[1])
                bad = bad + (2 * S.count(S[i + 1]) * tmpvar2[2])
            elif S[i + 1] in finaln:
                bad = bad + 2
            elif S[i + 1] in finalp:
                good = good + 2
    if good > bad:
        print("\nPositive")
    else:
        print("\nNegative")
Example #11
def extract_features(title):
    tokens = title.split(" ")
    tokens = filter(check, map(PS().stem, tokens))
    return " ".join(tokens)
Example #12
def stemming(words_list):
    new_words_list = []
    ps = PS()
    for words in words_list:
        new_words_list.append(ps.stem(words))
    return new_words_list
    def __init__(self):
        self._porter_stemmer = PS()
Example #14
def stemming():
    # The stemming module referred to in the exercise apparently does not support Python 3,
    # so nltk's implementation of the Porter stemming algorithm is used instead
    ps = PS()
    for w in word_split():
        yield [w, ps.stem(w)]
Example #15
def stem(x):
    return PS().stem(x)
Example #16
stop_words.extend(
    [".", ",", ":", ";", "!", "?", "-", "--", "(", ")", "\"", "\'"])
# symbols for use with re
symbol = [".", ",", ":", ";", "!", "?", "\\-", "(", ")", "\"", "\'"]


def isStopWord(word: str) -> bool:
    # True if the word is a stopword
    return word.lower() in stop_words


def my_sub(word: str) -> str:
    # strip the symbols listed above
    return re.sub(f"[{''.join(symbol)}]", "", word)


if __name__ == "__main__":
    words = []
    ps = PS()
    with open("sentiment.txt") as f:
        for l in f:
            # remove stopwords
            # also drop one-character words
            words.extend([
                ps.stem(my_sub(w.strip())) for w in l.split(" ")[1:]
                if not isStopWord(w) and len(my_sub(w.strip())) > 1
            ])

    with open("stem_words.txt", "w") as f:
        f.write("\n".join([w for w in set(words) if w]))
"""
Created on Thu Aug  9 22:50:11 2018

@author: yohei
"""

import pyprind
import pandas as pd
import os
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer as CV
from nltk.stem.porter import PorterStemmer as PS
from nltk.corpus import stopwords

count = CV()
porter = PS()
stop = stopwords.words('english')


class Final_Assignment:
    def tokenizer_porter(self, text):
        return [porter.stem(word) for word in text.split()]

    def merge_dict_add_values(self, d1, d2):
        return dict(Counter(d1) + Counter(d2))

    def making_csv(self):
        basepath = 'aclImdb'
        labels = {'pos': 1, 'neg': 0}
        pbar = pyprind.ProgBar(50000)
        df = pd.DataFrame()