import numpy as np  # for working with numbers
import pickle  # For working with .pkl files
from tqdm import tqdm  # Shows progress over iterations, including in pandas via "progress_apply"
import sys  # For terminal tricks
import _pickle as cPickle  # Optimized version of pickle
import gc  # For managing garbage collector
import timeit  # For counting time taken for a process
import datetime  # For working with dates & times

# Import packages for cleaning, tokenizing, and stemming text
import re  # For parsing text
from unicodedata import normalize  # for cleaning text by converting unicode character encodings into readable format
from nltk import word_tokenize, sent_tokenize  # widely used text tokenizer
from nltk.stem.porter import PorterStemmer  # approximate but effective (and common) method of normalizing words: stems words by applying a hierarchy of rules that transform or cut off word endings
stem = PorterStemmer().stem  # Makes stemming more accessible
from nltk.corpus import stopwords  # for eliminating stop words
import gensim  # For word embedding models
from gensim.models.phrases import Phrases  # Makes word2vec more robust: detects frequent multi-word phrases (e.g. bigrams) so they can be treated as single tokens

# Import packages for multiprocessing
import os  # For navigation
numcpus = len(os.sched_getaffinity(0))  # Detect and assign number of available CPUs
from multiprocessing import Pool  # key function for multiprocessing, to increase processing speed
pool = Pool(processes=numcpus)  # Pre-load number of CPUs into pool function
import Cython  # For parallelizing word2vec
mpdo = False  # Set to True to use multiprocessing: faster for building the words-by-sentence file, but more complicated
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
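# A minimal sketch (not part of the original) of what Phrases/Phraser does: frequent word
# pairs in a tokenized corpus get merged into single tokens before word2vec training.
# The tiny corpus and the low min_count/threshold values below are purely illustrative.
from gensim.models.phrases import Phraser

toy_sentences = [["new", "york", "is", "big"], ["i", "love", "new", "york"]]
bigram = Phraser(Phrases(toy_sentences, min_count=1, threshold=1))
print(bigram[["new", "york", "city"]])  # the frequent pair comes back joined, e.g. ['new_york', 'city']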
Example #2
import re
import sys
import os
import json
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# put in the path to the kaggle data
PATH_TO_JSON = "/user/alexeys/KaggleDato/Preprocessed/"
PATH_TO_TRAIN_LABELS = "/scratch/network/alexeys/KaggleDato/train.json"
PATH_TO_SUB_LABELS = "/scratch/network/alexeys/KaggleDato/sampleSubmission.json"

# Module-level global variables for the `tokenize` function below
#PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()


# Function to break text into "tokens"
def tokenize(text):
    tokens = word_tokenize(text)
    no_stopwords = [t for t in tokens if t not in STOPWORDS]
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    stemmed = list(set(stemmed))  # deduplicate (note: this drops token order)
    return [t for t in stemmed if t]  # return a list rather than a lazy filter object
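# A quick, hypothetical usage check (the sample sentence is not from the Kaggle data);
# because of the set() step above, the stems come back in arbitrary order.
sample_tokens = tokenize("Running cats are faster than sleeping dogs")
print(sample_tokens)  # e.g. ['run', 'cat', 'faster', 'sleep', 'dog'], order may vary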


# Load and parse the data
def parsePoint(label, beast):
    #This is the beast:
Example #3
import os
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

file = 'spache_easy.txt'
cur_path = os.path.dirname(os.path.realpath(__file__))
dale_chall_path = os.path.join(cur_path, file)
words = None
with open(dale_chall_path) as f:
    words = list(line.strip() for line in f)

for w in words:
    print(porter_stemmer.stem(w))
Example #4
import psycopg2
import pymongo
from nltk.stem.porter import PorterStemmer

DEFAULT_QUERY_WEIGHTS = {
    'fulltext': 0.4,
    'title': 0.2,
    'abstract': 0.2,
    'authors': 0.2,
}

pg_conn = psycopg2.connect(
    "dbname='sharesci' user='******' host='137.148.143.96' password='******'"
)
mongo_client = pymongo.MongoClient('137.148.143.48', 27017)

mongo_db = mongo_client['sharesci']
papers_collection = mongo_db['papers']

stemmer = PorterStemmer(mode=PorterStemmer.MARTIN_EXTENSIONS)


## Get the IDF values for the given terms
#
# @param terms (list-like)
# <br>	Format: A list of terms (each term as str)
#
# @return (dict)
# <br>	-- a dict with keys being terms (as str) and values being tuples
# 	of `(gram_id, IDF)`
def get_idfs(terms):
    cur = pg_conn.cursor()
    result = None
    num_docs = 1
    try:
Example #5
def show_entry_fields():
    url = 'http://api.hh.ru/vacancies?text=' + (
        e1.get()) + '&page=0&per_page=100'
    data = requests.get(url).json()
    print("Поиск вакансий")
    p = json.dumps(data)
    res2 = json.loads(p)
    i = 0
    texts = []
    total_word = []
    window = tk.Toplevel(root)
    window.minsize(1300, 1000)
    window.title(u"Вывод данных")
    #webbrowser.open("index.html")
    w00 = Label(window, text=u"ВАКАНСИИ", font="Times")
    w00.place(relx=0.2, rely=0.01)
    t1 = Text(window, height=60, width=75)
    t1.place(relx=0.01, rely=0.03)
    w11 = Label(window, text=u"НАПИСАТЬ СОПРОВОДИТЕЛЬНОЕ ПИСЬМО", font="Times")
    w11.place(relx=0.64, rely=0.57)
    t2 = Text(window, height=20, width=70)
    t2.place(relx=0.52, rely=0.6)
    while i < len(res2['items']):
        a = ((res2['items'][i]['id']))  #['requirement']
        #print (a)
        #print ((res2['items'][i]['name']))
        aa = ((res2['items'][i]['snippet']['requirement']))
        #aa=(res2['items'][i]['snippet']['requirement']).replace('<highlighttext>', '')
        #patt = re.compile('(\s*)aa(\s*)')
        print(aa)

        texts.append(aa)
        #wordpunct_tokenize(str(aa))
        tokenizer = RegexpTokenizer(r'\w+')
        #print (stopwords.words('english'))
        (total_word.extend(tokenizer.tokenize(str(aa))))

        aaa = str(i + 1) + ') ' + str(res2['items'][i]['name']) + ' | ' + str(
            res2['items'][i]['area']['name']) + '\n'
        t1.insert(END, (aaa))
        i = i + 1

    #---------------------------------------------------------------------- building the results output window
    stopwords = nltk.corpus.stopwords.words('english')
    en_stop = get_stop_words('en')
    stemmer = SnowballStemmer("english")
    #print stopwords[:10]

    #-------------------------------------------------------------------------- latent Dirichlet allocation
    #w8=Label(window,text=u"ОСНОВНЫЕ ТЕМЫ И СЛОВА", font = "Times")
    #w8.place(relx=0.17, rely=0.53)
    #t8=Text(window, height=24, width=75)
    #t8.place(relx=0.01, rely=0.57)
    texts = []
    stopped_tokens = [i for i in total_word if not i in en_stop]
    #print le(stopped_tokens)
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    #print len(stemmed_tokens), stemmed_tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.LdaModel(corpus,
                                      num_topics=100,
                                      id2word=dictionary,
                                      passes=20)
    a = ldamodel.print_topics(num_topics=10, num_words=7)
    #print ldamodel.print_topics(num_topics=4, num_words=7)[0][1]
    #print a
    num_topics = 5
    topic_words = []
    for i in range(num_topics):
        tt = ldamodel.get_topic_terms(i, 10)
        topic_words.append([dictionary[pair[0]] for pair in tt])
    #print topic_words[0]
    jj = 0
    while jj < len(topic_words):
        topic11 = ((u"Тема #%d:" % (jj + 1)) + "\n" +
                   "-".join(topic_words[jj]) + "\n")
        #t8.insert(END, topic11)
        #print(u"Тема #%d:" % (jj+1))
        #print("-".join(topic_words[jj]))
        jj = jj + 1
    #-------------------------------------------------------------------------- identifying the key competencies
    vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=.5)
    tfv = vec.fit_transform(stopped_tokens)
    terms = vec.get_feature_names()
    result = list(set(list_skills) & set(terms))
    print(result)
    text_file = open("Output.txt", "w")
    text_file.write(result[2])
    text_file.close()
    wc = WordCloud(height=1000, width=1000,
                   max_words=1000).generate(" ".join(terms))
    nmf = NMF(n_components=11).fit(tfv)
    #for idx, topic in enumerate(nmf.components_):
    #print(u"Тема #%d:" % (idx+1))
    #print(" ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
    #-------------------------------------------------------------------------- term distribution figure
    w8 = Label(window, text=u"РАСПРЕДЕЛЕНИЕ НАВЫКОВ", font="Times")
    w8.place(relx=0.66, rely=0.01)
    fig = plt.figure(figsize=(5, 5))
    im = plt.imshow(wc)
    canvas = FigureCanvasTkAgg(fig, master=window)
    canvas.draw()
    canvas.get_tk_widget().place(
        relx=0.54, rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    canvas._tkcanvas.place(relx=0.52,
                           rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    #-------------------------------------------------------------------------- sentiment scoring
    c = Button(window,
               text=u"Подтвердить квалификацию",
               font="Times 14 bold",
               command=scoring,
               bg="deep sky blue")
    c.place(relx=0.95, rely=0.97, anchor=SE)
    c1 = Button(window,
                text=u"Откликнуться",
                font="Times 14 bold",
                command=testing,
                bg="lime green")
    c1.place(relx=0.7, rely=0.97, anchor=SE)
    def stemmer(self):
        stemmed = []
        porter = PorterStemmer()
        for s in self.words_list:
            stemmed.append([porter.stem(word) for word in s])
        self.words_list = stemmed
# Importing the dataset
import pandas as pd
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re  #regular expressions
import nltk  #natural language toolkit
nltk.download('stopwords')  # stopwords are filler words like [is, are, the, this, etc.]
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # extracts the root word, e.g. loving, loved => love
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ',
                    dataset['Review'][i])  # remove everything except letters
    review = review.lower()  # convert everything to lowercase
    review = review.split()  # split the sentence into a list of words
    ps = PorterStemmer()  # instantiate the PorterStemmer class
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]  # drop stopwords, keep the stems
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()  # maps each review to its word counts
y = dataset.iloc[:, 1].values  # all rows, column 1 (the 'Liked' column)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
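# The actual split is cut off here; a plausible continuation (the 80/20 split and the
# Naive Bayes classifier are assumptions, following the usual shape of this tutorial):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)  # predictions on the held-out reviews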
nltk.download('stopwords')
from nltk.corpus import stopwords
'''this downloads and imports the stopwords that we will use to remove from the texts,which include the,a,is etc'''
from nltk.stem.porter import PorterStemmer
'''this will help us to take only the root of the word which indicates enough about the meaning eg loved->love
if we do not do this , there would be a separate feature generated for loved and love ..though they mean the same thing'''
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    ''' '[^a-zA-Z]' means which is not a-z or A-Z '''
    '''in the second qoute we put what we have replace the contents of the first qoute with'''
    '''here we are removing everything which is not a-z or A-Z by a space , sub function helps us to do that'''
    review = review.lower()  #to convert everything to lowercase
    review = review.split()  # to split the words of a particular review as elements of a list
    ps = PorterStemmer()  #creating an object of the Porter Stemmer Class
    all_stopwords = stopwords.words('english')  # storing all the English stopwords in a variable
    all_stopwords.remove('not')  # keep 'not' so negations are not removed from the reviews
    review = [
        ps.stem(word) for word in review if not word in set(all_stopwords)
    ]
    '''Here we remove all the stopwords from the particular review and add stemming to it by using a for loop
    that runs word by word in each review'''
    review = ' '.join(
        review)  # we join all the stemmed words with a space between them
    corpus.append(review)  # add the cleaned review to the corpus list

#Creating the Bag of Words Model
Example #9
import numpy as np  # for working with numbers
import pickle  # For working with .pkl files
from tqdm import tqdm  # Shows progress over iterations, including in pandas via "progress_apply"
import sys  # For terminal tricks
import _pickle as cPickle  # Optimized version of pickle
import gc  # For managing garbage collector
import timeit  # For counting time taken for a process
import datetime  # For working with dates & times

# Import packages for cleaning, tokenizing, and stemming text
import re  # For parsing text
from unicodedata import normalize  # for cleaning text by converting unicode character encodings into readable format
from nltk import word_tokenize, sent_tokenize  # widely used text tokenizer
from nltk.stem.porter import PorterStemmer  # approximate but effective (and common) method of normalizing words: stems words by applying a hierarchy of rules that transform or cut off word endings
stem = PorterStemmer().stem  # Makes stemming more accessible
from nltk.corpus import stopwords  # for eliminating stop words
import gensim  # For word embedding models
from gensim.models.phrases import Phrases, Phraser  # Makes word2vec more robust: detects frequent multi-word phrases (e.g. bigrams) so they can be treated as single tokens
from gensim.models.doc2vec import TaggedDocument  #for preparing data for doc2vec input

import string  # for one method of eliminating punctuation
from sklearn.feature_extraction import text
ps = PorterStemmer()  # approximate but effective (and common) method of stemming words

#setting up multiprocessing
import multiprocessing
from sklearn import utils
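# A minimal sketch (not from the original) of how TaggedDocument wraps tokenized texts
# for doc2vec; the toy documents and the vector_size choice are purely illustrative.
toy_docs = [["machine", "learning", "is", "fun"], ["text", "mining", "with", "gensim"]]
tagged = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(toy_docs)]
d2v = gensim.models.Doc2Vec(tagged, vector_size=50, min_count=1,
                            workers=multiprocessing.cpu_count())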
Example #10
	if pre_pre == kw_interpret or pre == kw_interpret or nxt == kw_interpret or nxt_nxt == kw_interpret:
		around_kw_interpret[tk] = around_kw_interpret.get(tk, 0) + 1
	if pre_pre == kw_difference or pre == kw_difference or nxt == kw_difference or nxt_nxt == kw_difference:
		around_kw_difference[tk] = around_kw_difference.get(tk, 0) + 1
	if pre_pre == kw_book or pre == kw_book or nxt == kw_book or nxt_nxt == kw_book:
		around_kw_book[tk] = around_kw_book.get(tk, 0) + 1
	if pre_pre == kw_knowledge or pre == kw_knowledge or nxt == kw_knowledge or nxt_nxt == kw_knowledge:
		around_kw_knowledge[tk] = around_kw_knowledge.get(tk, 0) + 1

## Temp data structure for basic statistics
capital_grams = {} # {word:number}
all_lower_grams = {} # {word:number}
all_upper_grams = {} # {word:number}
pattern_ques = re.compile(r"\s*([a-z0-9]+)\?") # 0 or more spaces + alphabet (or digit) + '?'
before_ques = {} # {word:number}
stemmer1 = PorterStemmer()
kw_tag = stemmer1.stem('tag')
around_kw_tag = {} # {word:number}
kw_understand = stemmer1.stem('understand')
around_kw_understand = {} # {word:number}
kw_study = stemmer1.stem('study')
around_kw_study = {} # {word:number}
kw_introduction = stemmer1.stem('introduction')
around_kw_introduction = {} # {word:number}
kw_explain = stemmer1.stem('explain')
around_kw_explain = {} # {word:number}
kw_principle = stemmer1.stem('principle')
around_kw_principle = {} # {word:number}
kw_interpret = stemmer1.stem('interpret')
around_kw_interpret = {} # {word:number}
kw_difference = stemmer1.stem('difference')
Example #11
import nltk
from nltk.stem.porter import PorterStemmer
import csv
from collections import defaultdict

columns = defaultdict(list)  # each value in each column is appended to a list

with open('text_sensibility.csv', newline='') as csvfile:
    spamreader = csv.DictReader(csvfile, delimiter=';')
    for row in spamreader:
        for (k, v) in row.items():
            columns[k].append(v)
# print(' '.join(columns['word']))
# (no explicit close needed: the 'with' block already closed the file)

filtered = nltk.word_tokenize(' '.join(columns['word']))
stemmed = []
for f in filtered:
    stemmed.append(PorterStemmer().stem(f))

print(stemmed)

with open("text_sensibility.csv", "w+", newline='') as to_file:
    writer = csv.writer(to_file)
    for new_row in stemmed:
        writer.writerow([new_row])  # wrap in a list so each stem is one cell, not one character per column
def count_word_overlap(sent1, sent2):
    porterStemmer = PorterStemmer()
    sent1 = [porterStemmer.stem(w) for w in preprocess(sent1)]
    sent2 = [porterStemmer.stem(w) for w in preprocess(sent2)]
    n = set(sent1).intersection(set(sent2))
    return n, len(n)
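# A hypothetical usage sketch: preprocess() is defined elsewhere in the original, so a
# simple stand-in tokenizer is assumed here purely for illustration.
def preprocess(sentence):  # stand-in, not the original implementation
    return sentence.lower().split()

common, count = count_word_overlap("The cat sat on the mat", "A cat lay on a mat")
print(common, count)  # e.g. {'cat', 'on', 'mat'} 3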
Example #13
def clean_stemmer(titles):
    stemmer = PorterStemmer()
    new_titles = []
    for item in titles:
        new_titles.append(stemmer.stem(item))
    return new_titles
Example #14
import feedparser
import nltk
import sys
from nltk.stem.porter import PorterStemmer
if int(sys.version[0]) >= 3:
    from bs4 import BeautifulSoup
    Parser = BeautifulSoup
else:
    import BeautifulSoup
    Parser = BeautifulSoup.RobustHTMLParser

nltk_ver = tuple([int(_) for _ in nltk.__version__.split('.')])
if nltk_ver >= (3, 2, 2):
    stem = PorterStemmer().stem
else:
    stem = PorterStemmer().word_stem

import numpy as np
import config


def get_keys():
    fp = open('keyword_list', 'r')
    keys = []
    origkeys = []
    for l in fp:
        fullstr = l.rstrip()
        if len(fullstr) == 0:
            continue
        words = nltk.wordpunct_tokenize(fullstr)
        #fullstr.split(' ')
Example #15
afinn = Afinn()
print(afinn.score(lines1))

neg_review = (glob.glob(""))[20]

with open(neg_review, 'r') as f:
    lines2 = f.readlines()[0]

afinn = Afinn()
print(afinn.score(lines2))

NRC = pd.read_csv()
NRC = NRC[(NRC != 0).all(1)]
NRC = NRC.reset_index(drop=True)
tokenizer = RegexpTokenizer(r'[\w]+')
stop_words = stopwords.words('english')
p_stemmer = PorterStemmer()

raw = lines1.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in stop_words]
match_words = [x for x in stopped_tokens if x in list(NRC[0])]
emotion = []
for i in match_words:
    temp = list(NRC.iloc[np.where(NRC[0] == i)[0], 1])
    for j in temp:
        emotion.append(j)

sentiment_result1 = pd.Series(emotion).value_counts()
sentiment_result1.plot.bar()
Example #16
def text_cleaning_titles():
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    # Add some ad-hoc stopwords often appearing in listings
    stop.update({
        'get', 'use', 'good', 'best', 'custom', 'list', 'free', 'send',
        'ship', 'onion', 'feedback', 'qualiti', 'quality', 'grams', 'mg',
        'gr', 'address', 'order', 'pleas', 'price', 'product', 'check',
        'discuss', 'name', 'shipping', 'one', 'track', 'day', 'time', 'packag'
    })
    frequency = defaultdict(int)
    for key in datasetMap:
        currentTitle = key
        # Performing cleaning on key (title of listing)
        # Tokenizing
        tokens = nltk.word_tokenize(currentTitle)
        tokens_nostop = []
        for token in tokens:
            #Removal of numerical tokens
            if token not in stop:
                if token.isalpha():
                    tokens_nostop.append(token)
        # Stemming
        stems = []
        for token in tokens_nostop:
            word = stemmer.stem(token)
            if word not in stop:
                stems.append(word)
            else:
                continue
            frequency[word] += 1
        datasetMap[key] = stems
    # Collect every stem seen so far for the Gephi export
    for word in frequency.keys():
        forGephi.append(word)
    # Now discard one-letter tokens and words that are too rare (likely typos)
    for key in datasetMap:
        title = datasetMap[key]
        newTitle = []
        for word in title:
            if len(word) > 1:  # A word must be at least 2 letters long
                if frequency[word] > 5:  # A word must appear at least 5 times
                    newTitle.append(word)
            else:
                frequency[word] = 0
                continue
        datasetMap[key] = newTitle
    print("Text cleaning completed.\n")
Example #17
def UrlCheck(e):
        links = []
        texts = []
        global stemmed
        global linksabc
        result = urlparse(e)
        a = all([result.scheme, result.netloc, result.path])
        if a:
            suffix = '/'
            if(e.endswith(suffix)):
                e = e[:len(e)-len(suffix)]
          
            page = requests.get(e)    
            data = page.text
            soup = BeautifulSoup(data , features="html.parser")

            for link in soup.find_all('a'):
                text = soup.find_all(text=True)
                links.append(link.get('href'))

            linksabc = list(dict.fromkeys(links))
            l2.config(text = len(linksabc))

            a = len(get_fld(e))
            l7.config(text = a)
            
            html_page = page.content
            soup = BeautifulSoup(html_page, 'html.parser')
            text = soup.find_all(text=True)
            output = ''
            blacklist = ['[document]','noscript','header','html','meta','head', 'input','script','style' ,'li' , 'b' , 'href' , 'div' , 'th']
            for t in text:
                if t.parent.name not in blacklist:
                    output += '{} '.format(t)
            res = len(output.split())
            l4.config(text = res)

            tokens = word_tokenize(output)  #splitting
            # convert to lower case
            tokens = [w.lower() for w in tokens]
            # remove punctuation from each word
            table = str.maketrans('', '', string.punctuation)
            stripped = [w.translate(table) for w in tokens]
            # remove remaining tokens that are not alphabetic
            words = [word for word in stripped if word.isalpha()]
            # filter out stop words
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if not w in stop_words]

            porter = PorterStemmer()
            stemmed = [porter.stem(word) for word in words]

            res = [key for key, value in Counter(stemmed).most_common()]
            key1.config(text = res[0])
            key2.config(text = res[1])
            key3.config(text = res[2])
            key4.config(text = res[3])
            key5.config(text = res[4])
            key6.config(text = res[5])
            key7.config(text = res[6])
            key8.config(text = res[7])
            key9.config(text = res[8])

            
        else:
            print(showwarning("Alert" , "No such website or url exists"))
Example #18
def text_cleaning_descriptions():
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    # Add some ad-hoc stopwords often appearing in listings
    stop.update({
        'get', 'use', 'aaa', 'good', 'best', 'custom', 'list', 'free',
        'send', 'ship', 'onion', 'feedback', 'qualiti', 'quality', 'grams',
        'address', 'order', 'pleas', 'price', 'product', 'check', 'discuss',
        'name', 'shipping', 'one', 'track', 'day', 'time', 'packag'
    })
    frequency = defaultdict(int)
    for key in datasetMap:
        currentDescription = datasetMap[key]
        # Performing cleaning on currentDescription
        # Tokenizing
        tokens = nltk.word_tokenize(currentDescription)
        tokens_nostop = []
        for token in tokens:
            if token not in stop:
                tokens_nostop.append(token)
        # Stemming
        stems = []
        for token in tokens_nostop:
            word = stemmer.stem(token)
            if word not in stop:
                stems.append(word)
            else:
                continue
            frequency[word] += 1
        datasetMap[key] = stems
    # Now discard words shorter than three letters and words that are too rare (likely typos)
    for key in datasetMap:
        description = datasetMap[key]
        newDescription = []
        for word in description:
            if len(word) > 2:  # A word must be at least 3 letters long
                if frequency[word] > 5:  # A word must appear at least 5 times
                    newDescription.append(word)

                # else:
                # print "Rare token found: " + word
            else:
                frequency[word] = 0
                continue
        datasetMap[key] = newDescription
    print("Text cleaning completed.\n")
Example #19
def create(request):
    if request.method == 'POST':
        data = request.POST['parag']
        paragraph = data
        text = data.replace('\n', '')
        data = text
        for k in text.split("\n"):
            text2 = re.sub(r"[^a-zA-Z0-9&]+", ' ', k)
        text = text2
        tokens = [t for t in text.split()]
        sr = stopwords.words('english')
        clean_tokens = tokens[:]
        for token in tokens:
            if token in stopwords.words('english'):

                clean_tokens.remove(token)
        freq = nltk.FreqDist(clean_tokens)

        s = [(k, freq[k]) for k in sorted(freq, key=freq.get, reverse=True)]
        title = s[0][0]
        search_queries = [
            sorted(freq.items(), key=lambda kv:
                   (kv[1], kv[0]), reverse=True)[0][0] + "  " +
            sorted(freq.items(), key=lambda kv:
                   (kv[1], kv[0]), reverse=True)[1][0]
        ]
        for query in search_queries:
            downloadimages(query, title)

        stop_words = stopwords.words('english')
        summarize_text = []
        # Step 1 - Read text and split it
        article = data.split(". ")
        sentences = []
        sentences_list = ''
        count_sentence = 0
        for sentence in article:
            count_sentence = count_sentence + 1
            sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
        sentences.pop()
        top_n = int(count_sentence / 3)
        # Step 2 - Generate Similarity Matrix across sentences
        sentence_similarity_martix = build_similarity_matrix(
            sentences, stop_words)
        # Step 3 - Rank sentences in similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)
        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))
        # Step 5 - Of course, output the summarized text
        m = 1
        # Driver Code
        with open("visualizer/input/op.tsv", "w") as text_file:
            text_file.write("content" + "\t" + "val" + '\n')
            for i in summarize_text:
                sentences_list = sentences_list + i
                search_queries.append(i)
                text_file.write(i + "\t" + str(m) + '\n')
                m = m + 1
        emotion = predict()
        for query in search_queries:
            review = re.sub('[^a-zA-Z]', ' ', query)
            review = review.lower()
            review = review.split()
            ps = PorterStemmer()
            review = [
                ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))
            ]
            review = ' '.join(review)
            downloadimages(review, title)

        fps = 0.2

        file_list = glob.glob(
            'visualizer/images/' + title +
            '/*.jpg')  # Get all the jpgs in the title's image directory
        file_list_sorted = natsorted(file_list,
                                     reverse=False)  # Sort the images

        clips = [ImageClip(m).set_duration(5) for m in file_list_sorted]

        concat_clip = concatenate(clips, method="compose")
        concat_clip.write_videofile("visualizer/output/project.mp4", fps=fps)

        folder = 'visualizer/images/' + title + '/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                #elif os.path.isdir(file_path): shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        textClip = gTTS(text=sentences_list, lang=language, slow=False)
        textClip.save("visualizer/output/voice.mp3")
        audioclip = AudioFileClip("visualizer/output/voice.mp3")
        my_clip = VideoFileClip('visualizer/output/project.mp4')
        audio_background = AudioFileClip('visualizer/emotions/' + emotion +
                                         '.mp3')
        new_audioclip = CompositeAudioClip(
            [audio_background.volumex(0.08),
             audioclip.volumex(1)])

        final_audio = CompositeAudioClip([new_audioclip])
        audio = afx.audio_loop(final_audio, duration=audioclip.duration)
        final_clip = my_clip.set_audio(audio)
        final_clip.write_videofile("visualizer/output/" + title + '.mp4')
        data = title
        file_path = 'visualizer/output/' + data + '.mp4'
        video = Video()
        video.data = paragraph
        video.name = data
        video.videofile = file_path
        video.save()
        return redirect(video.videofile.url)

    if request.method == 'GET':
        return render(request, 'index.html')
Example #20
def single_meteor_score(
    reference,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet,
    alpha=0.9,
    beta=3,
    gamma=0.5,
):
    """
    Calculates METEOR score for single hypothesis and reference as per
    "Meteor: An Automatic Metric for MT Evaluation with High Levels of
    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
    in Proceedings of ACL.
    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'

    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'


    >>> round(single_meteor_score(reference1, hypothesis1),4)
    0.7398

        If there are no word matches during the alignment, the method returns a
        score of 0. We can safely return a zero instead of raising a
        division by zero error, as no match usually implies a bad translation.

    >>> round(single_meteor_score('this is a cat', 'non matching hypothesis'),4)
    0.0

    :param reference: reference sentence
    :type reference: str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: str
    :param preprocess: preprocessing function (default str.lower)
    :type preprocess: method
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :type wordnet: WordNetCorpusReader
    :param alpha: parameter for controlling relative weights of precision and recall.
    :type alpha: float
    :param beta: parameter for controlling shape of penalty as a
                 function of fragmentation.
    :type beta: float
    :param gamma: relative weight assigned to fragmentation penalty.
    :type gamma: float
    :return: The sentence-level METEOR score.
    :rtype: float
    """
    enum_hypothesis, enum_reference = _generate_enums(
        hypothesis, reference, preprocess=preprocess
    )
    translation_length = len(enum_hypothesis)
    reference_length = len(enum_reference)
    matches, _, _ = _enum_align_words(
        enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet
    )
    matches_count = len(matches)
    try:
        precision = float(matches_count) / translation_length
        recall = float(matches_count) / reference_length
        fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
        chunk_count = float(_count_chunks(matches))
        frag_frac = chunk_count / matches_count
    except ZeroDivisionError:
        return 0.0
    penalty = gamma * frag_frac ** beta
    return (1 - penalty) * fmean
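# As a quick sanity check (the counts below are made up, not from any real alignment),
# this is exactly the arithmetic the try-block above performs:
m, t, r, ch = 10, 14, 12, 3  # hypothetical matches, hypothesis length, reference length, chunk count
alpha, beta, gamma = 0.9, 3, 0.5  # the default parameters above
P, R = m / t, m / r
fmean = (P * R) / (alpha * P + (1 - alpha) * R)
penalty = gamma * (ch / m) ** beta
print((1 - penalty) * fmean)  # the METEOR score for these toy counts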
Example #21
    names=["label", "message"])
# The file above has 2 parts: the 1st column represents the label, spam or ham,
# and the dependent and independent variables are separated by one tab, hence '\t'

# There are no column names, so I'm explicitly specifying 2 headings: 1st is label and 2nd is message

# Now data cleaning and preprocessing

import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
#nltk.download('stopwords')
from nltk.corpus import stopwords

ps = PorterStemmer()  # stemming purpose
lem = WordNetLemmatizer()
corpus = []

for i in range(0, len(mail)):

    review = re.sub(
        '[^a-zA-Z]', ' ',
        mail['message'][i])  # space is given as the 2nd parameter of sub; note the capital Z so punctuation is not kept
    review = review.lower()
    review = review.split()

    review = [
        lem.lemmatize(word) for word in review
        if not word in stopwords.words('english')
    ]
Example #22
def meteor_score(
    references,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet,
    alpha=0.9,
    beta=3,
    gamma=0.5,
):
    """
    Calculates METEOR score for hypothesis with multiple references as
    described in "Meteor: An Automatic Metric for MT Evaluation with
    High Levels of Correlation with Human Judgments" by Alon Lavie and
    Abhaya Agarwal, in Proceedings of ACL.
    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    In case of multiple references the best score is chosen. This method
    iterates over single_meteor_score and picks the best pair among all
    the references for a given hypothesis

    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
    >>> hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'

    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
    >>> reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
    >>> reference3 = 'It is the practical guide for the army always to heed the directions of the party'

    >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
    0.7398

        If there are no word matches during the alignment, the method returns a
        score of 0. We can safely return a zero instead of raising a
        division by zero error, as no match usually implies a bad translation.

    >>> round(meteor_score(['this is a cat'], 'non matching hypothesis'),4)
    0.0

    :param references: reference sentences
    :type references: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: str
    :param preprocess: preprocessing function (default str.lower)
    :type preprocess: method
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :type wordnet: WordNetCorpusReader
    :param alpha: parameter for controlling relative weights of precision and recall.
    :type alpha: float
    :param beta: parameter for controlling shape of penalty as a function
                 of fragmentation.
    :type beta: float
    :param gamma: relative weight assigned to fragmentation penalty.
    :type gamma: float
    :return: The sentence-level METEOR score.
    :rtype: float
    """
    return max(
        single_meteor_score(
            reference,
            hypothesis,
            preprocess=preprocess,
            stemmer=stemmer,
            wordnet=wordnet,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
        )
        for reference in references
    )
nltk.download('treebank')
nltk.download('wordnet')
nltk.download('punkt')

documentsPath = 'text'


class IndexBuilder:
    def __init__(self, path: str, preprocessor: TokenPreprocessor):
        self.__tokenPreprocessor = preprocessor
        self.__documentsPath = path

    def buildIndex(self, name):
        indexer = Indexer(name)
        documentsFileNames = os.listdir(self.__documentsPath)
        for docId, documentFileName in enumerate(documentsFileNames):
            with open(self.__documentsPath + '/' + documentFileName,
                      'r+') as fileHandler:
                content = fileHandler.read()
                tokens = self.__tokenPreprocessor.preprocess(
                    nltk.word_tokenize(content))
                for token in tokens:
                    indexer.add_word_to_document(token, docId + 1)
        print(indexer)
        indexer.save_indexer_to_disk()


if __name__ == '__main__':
    tokenProcessor = TokenPreprocessor(PorterStemmer(mode='NLTK_EXTENSIONS'),
                                       stopwords.words('english'))
    indexBuilder = IndexBuilder(documentsPath, tokenProcessor)
    indexBuilder.buildIndex('myIndex')
Example #24
    def stemming_words(self):
        porter = PorterStemmer()
        self.stemmed_words = [porter.stem(word) for word in self.words]
Example #25
train_nans = ds_train['keyword'].isnull().sum()
print(train_nans)
train_nans = ds_train['location'].isnull().sum()
print(train_nans)

# In[7]:

print(ds_train.shape[0])

# In[8]:

# Creating Corpus after preprocessing the training data
corpus = []
pstem = PorterStemmer()
for i in range(ds_train['text'].shape[0]):
    text = re.sub("[^a-zA-Z]", ' ', ds_train['text'][i])
    text = text.lower()
    text = text.split()
    text = [
        pstem.stem(word) for word in text
        if not word in set(stopwords.words('english'))
    ]
    text = ' '.join(text)
    corpus.append(text)

# In[9]:

#print((corpus))
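# A plausible next step (an assumption, not shown in this excerpt): turn the cleaned
# corpus into features, e.g. a TF-IDF matrix, before fitting a classifier.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)  # the feature cap is an arbitrary choice
X = vectorizer.fit_transform(corpus).toarray()
y = ds_train['target'].values  # assumes the label column in the training data is named 'target'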
Example #26
    def stem_all(self, sentence):
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in sentence]
Example #27
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def rm_stopwords(tokens):
    '''
    Returns a list of the elements from the given list that aren't English stopwords
    '''
    english_stopwords = set(stopwords.words('english'))  # build the set once instead of once per token
    good_tokens = []
    for token in tokens:
        if token not in english_stopwords:
            good_tokens.append(token)
    return good_tokens


### modified from csi4106 notebook 5
port = PorterStemmer()


def stemmer(tokens):  #sometimes case-folds
    '''
    Returns a list of stemmed elements based off the given list
    '''

    return [port.stem(t) for t in tokens]
    ###end


###modified from https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
lemmatizer = WordNetLemmatizer()
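# The lemmatizer is instantiated above but its helper function is cut off; a minimal
# sketch of what it presumably fed (the name and signature are assumptions, mirroring
# stemmer() above):
def lemmatize(tokens):
    '''
    Returns a list of lemmatized elements based off the given list
    '''
    return [lemmatizer.lemmatize(t) for t in tokens]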

Example #28
    def __init__(self):
        self._porterStemmer = PorterStemmer()
# -*- coding: utf-8 -*-
from nltk.stem.porter import PorterStemmer
import json
import re
import sys
ps = PorterStemmer()
#path_to_vectors = '/path/to/numberbatch-en.txt'
path_to_vectors = sys.argv[1]
import numpy as np


def dump_stemmed_vectors(filepath):
    vectors = []
    with open(filepath, 'r', encoding="utf8") as myfile:
        vectors = myfile.readlines()
        vectors = [vector.strip() for vector in vectors]
    word_vector_dict = {}
    for word_vector in vectors:
        word = word_vector.split()[0].encode('ascii', 'ignore').decode("utf8")
        vector = word_vector.split()[1:]
        if '#' not in word and '_' not in word:
            word = ps.stem(word)
            if word in word_vector_dict:
                pass
            else:
                word_vector_dict[word] = vector
    with open('stemmed_vectors', 'w') as myfile:
        json.dump(word_vector_dict, myfile)


def generate_in_correct_format(filename):
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems