Example #1
import csv
import re


def stem_file(Inputcsv, Outcsv, delim):
    """
    :param Inputcsv: The input file with the original news articles
    :param Outcsv:   The output file that the stemmed articles are written to
    :param delim:    The delimiter used in the input file
    :return:         None
    """
    csv.register_dialect('perispwmeni', delimiter=delim)
    fw = open(Outcsv, 'wb')
    fw2 = csv.writer(fw, delimiter=delim)
    f = open(Inputcsv, 'r')
    try:
        reader = csv.reader(f, dialect='perispwmeni')
        cnt_row = 0
        for row in reader:  # read the file line by line
            cmp_two = []                                    # the two processed columns for this row
            cnt_row = cnt_row + 1                           # row counter
            if len(row) > 1:                                # if len(row) == 1, it is an empty line; skip it
                for elem in [5, 6]:                         # the article text is in columns 6 and 7 of the current format
                    line_out = ''                           # the line after processing
                    line1 = row[elem]
                    words = line1.split()
                    for word in words:                      # loop over each word
                        ww = stem.get_decoded_input(word)   # decode the word from 'str' to 'unicode'
                        last_char_spec = ''
                        last_char = ww[-1]
                        if last_char in (u',', u'.', u'!', u';'):   # trim a trailing punctuation mark,
                            ww = ww[:-1]                            # but remember it so it can be re-attached
                            last_char_spec = last_char.encode('utf-8')
                        ww = strip_accents(ww)
                        if len(ww) < 1:
                            continue
                        english_char = re.search('[a-zA-Z]', ww)    # check whether the word contains Latin characters
                        cont_dig = not contains_digits(ww)          # True when the word contains no digits
                        # words that start with a capital letter, contain Latin
                        # characters, or contain digits are left unstemmed
                        if (not ww[0].isupper()) and (english_char is None) and cont_dig:
                            ww = ww.upper()
                            stemmed = stem.stem(ww)
                            stemmed = stemmed.lower()
                            stemmed = stemmed.encode('utf-8')
                            line_out = line_out + ' ' + stemmed + last_char_spec
                        elif cont_dig:
                            line_out = line_out + ' ' + ww.encode('utf-8') + last_char_spec
                        else:
                            line_out = line_out + ' NUM' + last_char_spec
                    # note: building the pair as cmp_two = [cmp_two, line_out] for the first column as well breaks the encoding in the output
                    if elem == 5:
                        cmp_two = line_out
                    else:
                        cmp_two = [cmp_two, line_out]
                fw2.writerow(cmp_two)
            else:
                fw2.writerow([' '])
    finally:
        f.close()
        fw.close()
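
A minimal invocation sketch (the file names are hypothetical; the '~' delimiter matches the dialect registered in Example #3):

# Hypothetical usage: stem the articles in raw_articles.csv and write
# the result to stemmed_articles.csv, with '~' as the column delimiter.
stem_file('raw_articles.csv', 'stemmed_articles.csv', '~')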
Example #2
import os


def stem_emotions(dirpath, emotionlist):
    # for each emotion lexicon, stem every word and keep the highest
    # score among words that collapse to the same stem
    for emotion in emotionlist:
        filepath = os.path.join(dirpath, emotion + '.txt')
        words = load_csv_file(filepath)
        stemmed_words = {}
        for word, score in words.items():
            stemmed_word = stem(word)
            if stemmed_word in stemmed_words:
                stemmed_words[stemmed_word] = max(stemmed_words[stemmed_word], score)
            else:
                stemmed_words[stemmed_word] = score
        write_csv_file(filepath, stemmed_words)
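
A hypothetical call, assuming lexicon files such as lexicons/anger.txt and lexicons/joy.txt in the format expected by load_csv_file:

# Hypothetical usage: each lexicon file is rewritten in place with
# stemmed keys and the maximum score per stem.
stem_emotions('lexicons', ['anger', 'joy'])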
Example #3
import csv
import re


def search_and_create_data(fileIn, data):
    csv.register_dialect('perispwmeni', delimiter='~')
    f = open(fileIn, 'r')
    try:
        reader = csv.reader(f, dialect='perispwmeni')

        cnt_row = 0
        new_data = []
        for row in reader:  # read the file line by line
            cnt_row = cnt_row + 1
            if len(row) > 1:                                # if len(row) == 1, it is an empty line; skip it
                for elem in [5, 6]:                         # the article text is in columns 6 and 7 of the current format
                    line1 = row[elem]
                    words = line1.split()
                    for word in words:                      # loop over each word
                        ww = stem.get_decoded_input(word)   # decode the word from 'str' to 'unicode'
                        last_char = ww[-1]
                        if last_char in (u',', u'.', u'!', u';'):   # trim a trailing punctuation mark
                            ww = ww[:-1]
                        if len(ww) <= 4:                    # skip articles and other short words
                            continue
                        ww = strip_accents(ww)
                        english_char = re.search('[a-zA-Z]', ww)    # check whether the word contains Latin characters
                        cont_dig = not contains_digits(ww)          # True when the word contains no digits
                        # words that start with a capital letter, contain Latin
                        # characters, or contain digits are left unstemmed
                        if (not ww[0].isupper()) and (english_char is None) and cont_dig:
                            ww = ww.upper()
                            stemmed = stem.stem(ww)
                            stemmed = stemmed.lower()
                            stemmed = stemmed.encode('utf-8')
                            if stemmed not in data and stemmed not in new_data:
                                # the stem is in neither the existing nor the new list; keep it
                                new_data.append(stemmed)
                                print stemmed
    finally:
        f.close()
    return new_data
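
A hypothetical call, assuming articles.csv uses the '~'-delimited format read above:

vocabulary = []                                          # hypothetical list of already-known stems
new_terms = search_and_create_data('articles.csv', vocabulary)
vocabulary.extend(new_terms)                             # grow the vocabulary with the newly found stems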
Example #4
def test_stem(self):
    words = [
        'aufeinander', 'aufeinanderbiss', 'aufeinanderfolge',
        'aufeinanderfolgen', 'aufeinanderfolgend', 'aufeinanderfolgende',
        'aufeinanderfolgenden', 'aufeinanderfolgender', 'aufeinanderfolgt',
        'Käufer', 'Kätzchen', 'katholischer', 'auffallen', 'auffallend',
        'auffallenden', 'auffallender', 'auffällig', 'auffälligen',
        'auffälliges'
    ]
    stems = [
        'aufeinand', 'aufeinanderbiss', 'aufeinanderfolg',
        'aufeinanderfolg', 'aufeinanderfolg', 'aufeinanderfolg',
        'aufeinanderfolg', 'aufeinanderfolg', 'aufeinanderfolgt', 'kauf',
        'katzch', 'kathol', 'auffall', 'auffall', 'auffall', 'auffall',
        'auffall', 'auffall', 'auffall'
    ]
    results = [stem(word) for word in words]
    self.assertEqual(results, stems)
Example #5
    def document_terms():
        for filepath, content, date in documents():
            print(filepath)

            extension = path.splitext(filepath)[1]

            words = None
            title = filename(filepath)

            if extension in ['.html', '.htm', '.jspy']:
                html_title, words, links = tokenize_html(content)
                html_title = html_title.strip()
                if html_title:
                    title = html_title
            else:
                words = tokenize_text(content)

            words = remove_stopwords(words, stopword_list)

            words = (stem(word) for word in words)

            yield title, filepath, words, date
Example #6
google_tokenize = tk.tokenize(content_google)

# REMOVING STOPWORDS AND LOWERCASING

import remove_stopwords as stopwords

facebook_removed_stopwords = stopwords.remove(facebook_tokenize)

apple_removed_stopwords = stopwords.remove(apple_tokenize)

amazon_removed_stopwords = stopwords.remove(amazon_tokenize)

netflix_removed_stopwords = stopwords.remove(netflix_tokenize)

google_removed_stopwords = stopwords.remove(google_tokenize)

# STEMMING

import stemming

facebook_stemmed = stemming.stem(facebook_removed_stopwords)

apple_stemmed = stemming.stem(apple_removed_stopwords)

amazon_stemmed = stemming.stem(amazon_removed_stopwords)

netflix_stemmed = stemming.stem(netflix_removed_stopwords)

google_stemmed = stemming.stem(google_removed_stopwords)
Example #7
File: music.py Project: harshchaplot/IRS
import remove_stopwords as stopwords

imagine_dragons_removed_stopwords = stopwords.remove(imagine_dragons_tokenize)

maroon5_removed_stopwords = stopwords.remove(maroon5_tokenize)

one_republic_removed_stopwords = stopwords.remove(one_republic_tokenize)

coldplay_removed_stopwords = stopwords.remove(coldplay_tokenize)

the_beatles_removed_stopwords = stopwords.remove(the_beatles_tokenize)

# STEMMING

import stemming

imagine_dragons_stemmed = stemming.stem(imagine_dragons_removed_stopwords)

maroon5_stemmed = stemming.stem(maroon5_removed_stopwords)

one_republic_stemmed = stemming.stem(one_republic_removed_stopwords)

coldplay_stemmed = stemming.stem(coldplay_removed_stopwords)

the_beatles_stemmed = stemming.stem(the_beatles_removed_stopwords)

# import stemming

# ps = stemming.PorterStemmer()
# print(ps.stem('laughing'))
Example #8
File: Press.py Project: nestoor22/Press
def words_steam_cleaner(first_list):
    # count stem frequencies, skipping Ukrainian stopwords
    return Counter(
        [stem(word) for word in first_list if word not in UKRAINIAN])
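
A hypothetical usage with a made-up token list; UKRAINIAN is assumed to be the project's stopword set:

tokens = ['преса', 'новини', 'преса']                # hypothetical input tokens
print(words_steam_cleaner(tokens).most_common(2))    # the two most frequent stems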
Example #9
import itertools as it
import json
from pathlib import Path

import icu
from multidict import MultiDict  # assumption: the `multidict` package; `stem` is the project's own stemmer

capital_letters = 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'

decompose = icu.Transliterator.createInstance('any-NFD').transliterate
normalise = icu.Transliterator.createInstance(
    'any-NFD; '
    '[:nonspacing mark:] any-remove; '
    '[:punctuation:] any-remove; '
    'any-upper').transliterate

locations = [{
    **l, 'name': decompose(l['name'])
} for p in Path('data').glob('childrenJSON*')
             for l in json.load(p.open())['geonames']]
location_pairs = MultiDict(
    (' '.join(stem(w) for w in normalise(l['name']).split()), l)
    for l in locations)
location_stems = set(location_pairs)


dummy = object()  # sentinel for non-capitalised words; defined elsewhere in the original module


def prepare_text(text):
    text = (i if i[0] in capital_letters else dummy
            for i in text.strip(' «»').split())
    text = it.groupby(text, lambda i: i is dummy)
    text = {
        ' '.join(stem(normalise(i)) for i in v)
        for k, v in text if k is False
    }
    return text
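
A hypothetical call with a made-up Greek sentence; only runs of words starting with a capital letter survive, normalised and stemmed:

# Hypothetical usage: returns the set of stemmed capitalised-word runs,
# here the sentence-initial word plus the two city names.
candidates = prepare_text('Ταξίδι στη Θεσσαλονίκη και την Αθήνα')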

Example #10
def preprocessing(text):
    words = tokenize(text)
    words = stopword_removal(words)
    wordlist = stemming.stem(words)
    return wordlist
Example #11
# sm.makeSimilarityMatrixToFile(stemmed_newsgroups_train)
print nostop_newsgroups_train
print("finish stop word")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectors_noIDF_train = vectorizer.fit_transform(nostop_newsgroups_train)
vectors_noIDF_test = vectorizer.transform(nostop_newsgroups_test)
pprint(vectors_noIDF_train.shape) #2034 
pprint(vectors_noIDF_test.shape) #

########################  Stemming ###################


pprint("stemming")
import stemming
stemmed_newsgroups_train = stemming.stem(newsgroups_train.data)
stemmed_newsgroups_test = stemming.stem(newsgroups_test.data)
pprint("finish stemming")


########################  Vectorize  ###################
pprint("vectorize")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# vectors_noIDF_train = vectorizer.fit_transform(newsgroups_train.data)
# vectors_noIDF_test = vectorizer.transform(newsgroups_test.data)
# pprint(vectors_noIDF_train.shape) #2034 
# pprint(vectors_noIDF_test.shape) #1353 
vectors_noIDF_train = vectorizer.fit_transform(stemmed_newsgroups_train)
vectors_noIDF_test = vectorizer.transform(stemmed_newsgroups_test)
Example #12
from document_read_write import doc_read, doc_write
from stemming import stem
from tf_idf_score import scoring
import re
import operator

all_sentence_list = []
stemmed_sentence_list = []

document = doc_read("input1.txt")
all_sentence_list = document.all_sentence

stm = stem()
for s in all_sentence_list:
    word_list = re.split(r'\s+', s)  # split the sentence on whitespace
    new_snt = ""
    for w in word_list:
        if new_snt == "":
            new_snt = new_snt + stm.stemmed(w)
        else:
            new_snt = new_snt + " " + stm.stemmed(w)

    stemmed_sentence_list.append(new_snt)

snt_scr = scoring(stemmed_sentence_list)
snt_scr.update()
snt_scr_list = snt_scr.sentence_score_list

score = list(zip(all_sentence_list, snt_scr_list))

score.sort(key=operator.itemgetter(1), reverse=True)
Example #13
File: main.py Project: AdithyaJain/Courses
def stemmer(tokens, new_tokens):
    # append the stem of every non-stopword token to new_tokens
    for i in tokens:
        if i not in stopwords.dict:
            new_tokens.append(stemming.stem(i))
    return new_tokens
Example #14
def stem_words(filepath):
    # stem a one-word-per-line file in place
    with open(filepath, 'r') as f:
        words = f.read().splitlines()   # drop trailing newlines so they are not fed to stem()
    with open(filepath, 'w') as f:
        f.writelines("%s\n" % stem(w) for w in words)
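
A hypothetical usage on a one-word-per-line file:

stem_words('keywords.txt')   # hypothetical path; the file is rewritten in place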