# Module setup: general-purpose imports, text-processing helpers, the
# multiprocessing pool, and the NLTK corpora this script depends on.
import numpy as np  # For working with numbers
import pickle  # For working with .pkl files
from tqdm import tqdm  # Shows progress over iterations, including in pandas via "progress_apply"
import sys  # For terminal tricks
import _pickle as cPickle  # Optimized version of pickle
import gc  # For managing garbage collector
import timeit  # For counting time taken for a process
import datetime  # For working with dates & times

# Import packages for cleaning, tokenizing, and stemming text
import re  # For parsing text
from unicodedata import normalize  # For cleaning text by converting unicode character encodings into readable format
import nltk  # Required for the nltk.download() calls at the bottom of this block
from nltk import word_tokenize, sent_tokenize  # Widely used text tokenizer
# Approximate but effective (and common) method of normalizing words: stems
# words by implementing a hierarchy of linguistic rules that transform or cut
# off word endings.
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer().stem  # Makes stemming more accessible
from nltk.corpus import stopwords  # For eliminating stop words
import gensim  # For word embedding models
from gensim.models.phrases import Phrases  # Makes word2vec more robust: looks for multi-word phrases, not just single words

# Import packages for multiprocessing
import os  # For navigation
try:
    # Detect and assign number of CPUs available to this process
    numcpus = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity is not available on every platform (e.g. macOS,
    # Windows); fall back to the machine-wide CPU count.
    numcpus = os.cpu_count() or 1
from multiprocessing import Pool  # Key function for multiprocessing, to increase processing speed
pool = Pool(processes=numcpus)  # Pre-load number of CPUs into pool function
import Cython  # For parallelizing word2vec
mpdo = False  # Set to 'True' if using multiprocessing--faster for creating words by sentence file, but more complicated

# Fetch the NLTK resources used by the tokenizer and stop-word filter
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
import re import sys import os import json import numpy as np # put in the path to the kaggle data PATH_TO_JSON = "/user/alexeys/KaggleDato/Preprocessed/" PATH_TO_TRAIN_LABELS = "/scratch/network/alexeys/KaggleDato/train.json" PATH_TO_SUB_LABELS = "/scratch/network/alexeys/KaggleDato/sampleSubmission.json" # Module-level global variables for the `tokenize` function below #PUNCTUATION = set(string.punctuation) STOPWORDS = set(stopwords.words('english')) STEMMER = PorterStemmer() # Function to break text into "tokens" def tokenize(text): tokens = word_tokenize(text) no_stopwords = filter(lambda x: x not in STOPWORDS, tokens) stemmed = map(lambda w: STEMMER.stem(w), no_stopwords) s = set(stemmed) stemmed = list(s) return filter(None, stemmed) # Load and parse the data def parsePoint(label, beast): #This is the beast:
import os
from nltk.stem.porter import PorterStemmer

# Script: print the Porter stem of every word listed in 'spache_easy.txt',
# which is looked up next to this source file.
porter_stemmer = PorterStemmer()

file = 'spache_easy.txt'
cur_path = os.path.dirname(os.path.realpath(__file__))
dale_chall_path = os.path.join(cur_path, file)

words = None
with open(dale_chall_path) as f:
    # One word per line; surrounding whitespace (incl. the newline) is dropped
    words = [line.strip() for line in f]

for w in words:
    print(porter_stemmer.stem(w))
DEFAULT_QUERY_WEIGHTS = { 'fulltext': 0.4, 'title': 0.2, 'abstract': 0.2, 'authors': 0.2, } pg_conn = psycopg2.connect( "dbname='sharesci' user='******' host='137.148.143.96' password='******'" ) mongo_client = pymongo.MongoClient('137.148.143.48', 27017) mongo_db = mongo_client['sharesci'] papers_collection = mongo_db['papers'] stemmer = PorterStemmer(mode=PorterStemmer.MARTIN_EXTENSIONS) ## Get the IDF values for the given terms # # @param terms (list-like) # <br> Format: A list of terms (each term as str) # # @return (dict) # <br> -- a dict with keys being terms (as str) and values being tuples # of `(gram_id, IDF)` def get_idfs(terms): cur = pg_conn.cursor() result = None num_docs = 1 try:
def show_entry_fields():
    """Query hh.ru for vacancies and show the results in a new Tk window.

    The search text is read from the ``e1`` widget. The function then:

    1. fetches up to 100 vacancies from the hh.ru API and lists them
       (name | area) in a text box;
    2. tokenizes every vacancy's ``snippet.requirement`` text, removes
       English stop words, Porter-stems the tokens and fits an LDA model;
    3. runs a TF-IDF vectorizer over the stop-word-filtered tokens,
       intersects its vocabulary with ``list_skills``, writes one matched
       term to ``Output.txt`` and renders a word cloud of the vocabulary;
    4. adds two buttons wired to ``scoring`` and ``testing``.

    Relies on names defined outside this function: ``e1``, ``root``,
    ``list_skills``, ``scoring``, ``testing`` and the imported GUI/NLP
    libraries.
    """
    # NOTE(review): the query text is concatenated into the URL as-is
    # (no URL-encoding is applied here).
    url = 'http://api.hh.ru/vacancies?text=' + (
        e1.get()) + '&page=0&per_page=100'
    data = requests.get(url).json()
    print("Поиск вакансий")
    # Round-trips the decoded JSON through dumps/loads; res2 equals data.
    p = json.dumps(data)
    res2 = json.loads(p)
    i = 0
    texts = []
    total_word = []
    window = tk.Toplevel(root)
    window.minsize(1300, 1000)
    window.title(u"Вывод данных")
    #webbrowser.open("index.html")
    w00 = Label(window, text=u"ВАКАНСИИ", font="Times")
    w00.place(relx=0.2, rely=0.01)
    t1 = Text(window, height=60, width=75)
    t1.place(relx=0.01, rely=0.03)
    w11 = Label(window, text=u"НАПИСАТЬ СОПРОВОДИТЕЛЬНОЕ ПИСЬМО", font="Times")
    w11.place(relx=0.64, rely=0.57)
    t2 = Text(window, height=20, width=70)
    t2.place(relx=0.52, rely=0.6)
    # Walk over every returned vacancy: collect requirement tokens and
    # append a "<n>) <name> | <area>" line to the vacancy list.
    while i < len(res2['items']):
        a = ((res2['items'][i]['id']))  #['requirement']
        #print (a)
        #print ((res2['items'][i]['name']))
        aa = ((res2['items'][i]['snippet']['requirement']))
        #aa=(res2['items'][i]['snippet']['requirement']).replace('<highlighttext>', '')
        #patt = re.compile('(\s*)aa(\s*)')
        print(aa)
        texts.append(aa)
        #wordpunct_tokenize(str(aa))
        tokenizer = RegexpTokenizer(r'\w+')
        #print (stopwords.words('english'))
        # str(aa) is tokenized, so a non-string value is tokenized via its
        # string representation.
        (total_word.extend(tokenizer.tokenize(str(aa))))
        aaa = str(i + 1) + ') ' + str(res2['items'][i]['name']) + ' | ' + str(
            res2['items'][i]['area']['name']) + '\n'
        t1.insert(END, (aaa))
        i = i + 1
    #----------------------------------------------------------------------building the results output window
    # Local name; from here on `stopwords` is a plain list inside this function.
    stopwords = nltk.corpus.stopwords.words('english')
    en_stop = get_stop_words('en')
    stemmer = SnowballStemmer("english")
    #print stopwords[:10]
    #--------------------------------------------------------------------------latent Dirichlet allocation
    #w8=Label(window,text=u"ОСНОВНЫЕ ТЕМЫ И СЛОВА", font = "Times")
    #w8.place(relx=0.17, rely=0.53)
    #t8=Text(window, height=24, width=75)
    #t8.place(relx=0.01, rely=0.57)
    # `texts` is reset here, so the raw snippets collected in the loop above
    # are discarded; the LDA corpus holds a single document of stemmed tokens.
    texts = []
    stopped_tokens = [i for i in total_word if not i in en_stop]
    #print le(stopped_tokens)
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    #print len(stemmed_tokens), stemmed_tokens
    texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.LdaModel(corpus,
                                      num_topics=100,
                                      id2word=dictionary,
                                      passes=20)
    a = ldamodel.print_topics(num_topics=10, num_words=7)
    #print ldamodel.print_topics(num_topics=4, num_words=7)[0][1]
    #print a
    num_topics = 5
    topic_words = []
    # Top 10 terms of the first five topics, mapped from ids back to words
    for i in range(num_topics):
        tt = ldamodel.get_topic_terms(i, 10)
        topic_words.append([dictionary[pair[0]] for pair in tt])
    #print topic_words[0]
    jj = 0
    # Builds a display string per topic; the string is not shown anywhere
    # because the insert/print calls below are commented out.
    while jj < len(topic_words):
        topic11 = ((u"Тема #%d:" % (jj + 1)) + "\n" +
                   "-".join(topic_words[jj]) + "\n")
        #t8.insert(END, topic11)
        #print(u"Тема #%d:" % (jj+1))
        #print("-".join(topic_words[jj]))
        jj = jj + 1
    #--------------------------------------------------------------------------identifying the key competencies
    vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=.5)
    # Each element of stopped_tokens (a single token) is passed as one document.
    tfv = vec.fit_transform(stopped_tokens)
    # NOTE(review): get_feature_names() may not exist in newer scikit-learn
    # releases (get_feature_names_out is the replacement) - confirm the
    # pinned version.
    terms = vec.get_feature_names()
    # Skills that appear both in the external list_skills and in the vocabulary
    result = list(set(list_skills) & set(terms))
    print(result)
    text_file = open("Output.txt", "w")
    # Indexes the third match: raises IndexError when fewer than three
    # skills matched. Set ordering decides which skill that is.
    text_file.write(result[2])
    text_file.close()
    wc = WordCloud(height=1000, width=1000,
                   max_words=1000).generate(" ".join(terms))
    nmf = NMF(n_components=11).fit(tfv)
    #for idx, topic in enumerate(nmf.components_):
    #print(u"Тема #%d:" % (idx+1))
    #print(" ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
    #--------------------------------------------------------------------------term distribution figure
    w8 = Label(window, text=u"РАСПРЕДЕЛЕНИЕ НАВЫКОВ", font="Times")
    w8.place(relx=0.66, rely=0.01)
    fig = plt.figure(figsize=(5, 5))
    im = plt.imshow(wc)
    canvas = FigureCanvasTkAgg(fig, master=window)
    # NOTE(review): newer matplotlib versions may expect canvas.draw()
    # instead of canvas.show() - confirm against the installed version.
    canvas.show()
    canvas.get_tk_widget().place(
        relx=0.54, rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    canvas._tkcanvas.place(relx=0.52,
                           rely=0.03)  #pack(side=TOP, fill=BOTH, expand=1)
    #--------------------------------------------------------------------------sentiment assessment
    c = Button(window,
               text=u"Подтвердить квалификацию",
               font="Times 14 bold",
               command=scoring,
               bg="deep sky blue")
    c.place(relx=0.95, rely=0.97, anchor=SE)
    c1 = Button(window,
                text=u"Откликнуться",
                font="Times 14 bold",
                command=testing,
                bg="lime green")
    c1.place(relx=0.7, rely=0.97, anchor=SE)
def stemmer(self):
    """Replace ``self.words_list`` with its Porter-stemmed counterpart.

    ``self.words_list`` is iterated as a sequence of word sequences; the
    nesting is kept and every word is swapped for its stem.
    """
    to_stem = PorterStemmer().stem
    self.words_list = [[to_stem(token) for token in sentence]
                       for sentence in self.words_list]
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re  # regular expressions
import nltk  # natural language toolkit
nltk.download('stopwords')  # filler words such as "is", "are", "the", "this"
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces words to their root, e.g. loving, loved => love

# Build the stemmer and the stop-word set once instead of once per review /
# once per word: both are loop-invariant, and set membership is O(1).
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []
for i in range(0, 1000):
    # Replace everything that is not a letter with a space
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()  # normalize to lower case
    review = review.split()  # split the sentence into a list of words
    # Drop stop words and keep only the stem of each remaining word
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()  # review-by-word count matrix
y = dataset.iloc[:, 1].values  # every row of column 1 (the "liked" column)

# Splitting the dataset into the Training set and Test set
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
nltk.download('stopwords')
from nltk.corpus import stopwords
# This downloads and imports the stop words that we remove from the texts
# ("the", "a", "is", etc.).
from nltk.stem.porter import PorterStemmer
# Stemming keeps only the root of a word, which says enough about its meaning
# (e.g. loved -> love). Without it, "loved" and "love" would become two
# separate features although they mean the same thing.

# Loop-invariant setup, done once rather than once per review / per word.
ps = PorterStemmer()  # creating an object of the Porter Stemmer class
all_stopwords = stopwords.words('english')  # all English stop words
# Keep 'not' in the reviews: it is removed from the stop-word list so that it
# is not filtered out of the text.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # O(1) membership tests in the loop below

corpus = []
for i in range(0, 1000):
    # '[^a-zA-Z]' matches anything that is not a-z or A-Z; every such
    # character is replaced by a space.
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()  # convert everything to lowercase
    review = review.split()  # split the review into a list of words
    # Remove the stop words from this review and stem what is left
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)  # join the stemmed words with single spaces
    corpus.append(review)  # add the cleaned review to the corpus list

#Creating the Bag OF Words Model
import numpy as np # for working with numbers import pickle # For working with .pkl files from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply" import sys # For terminal tricks import _pickle as cPickle # Optimized version of pickle import gc # For managing garbage collector import timeit # For counting time taken for a process import datetime # For workin g with dates & times # Import packages for cleaning, tokenizing, and stemming text import re # For parsing text from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format from nltk import word_tokenize, sent_tokenize # widely used text tokenizer from nltk.stem.porter import PorterStemmer # an approximate method of stemming words (it just cuts off the ends) from nltk.stem.porter import PorterStemmer # approximate but effective (and common) method of normalizing words: stems words by implementing a hierarchy of linguistic rules that transform or cut off word endings stem = PorterStemmer().stem # Makes stemming more accessible from nltk.corpus import stopwords # for eliminating stop words import gensim # For word embedding models from gensim.models.phrases import Phrases, Phraser # Makes word2vec more robust: Looks not just at To look for multi-word phrases within word2vec from gensim.models.doc2vec import TaggedDocument #for preparing data for doc2vec input import string # for one method of eliminating punctuation from nltk.corpus import stopwords # for eliminating stop words from sklearn.feature_extraction import text from nltk.stem.porter import PorterStemmer ps = PorterStemmer( ) # approximate but effective (and common) method of stemming words #setting up multiprocessing import multiprocessing from sklearn import utils
if pre_pre == kw_interpret or pre == kw_interpret or nxt == kw_interpret or nxt_nxt == kw_interpret: around_kw_interpret[tk] = around_kw_interpret.get(tk, 0) + 1 if pre_pre == kw_difference or pre == kw_difference or nxt == kw_difference or nxt_nxt == kw_difference: around_kw_difference[tk] = around_kw_difference.get(tk, 0) + 1 if pre_pre == kw_book or pre == kw_book or nxt == kw_book or nxt_nxt == kw_book: around_kw_book[tk] = around_kw_book.get(tk, 0) + 1 if pre_pre == kw_knowledge or pre == kw_knowledge or nxt == kw_knowledge or nxt_nxt == kw_knowledge: around_kw_knowledge[tk] = around_kw_knowledge.get(tk, 0) + 1 ## Temp data structure for basic statistics capital_grams = {} # {word:number} all_lower_grams = {} # {word:number} all_upper_grams = {} # {word:number} pattern_ques = re.compile("\s*([a-z0-9]+)\?") # 0 or more spaces + alphabet (or digit) + '?' before_ques = {} # {word:number} stemmer1 = PorterStemmer() kw_tag = stemmer1.stem('tag') around_kw_tag = {} # {word:number} kw_understand = stemmer1.stem('understand') around_kw_understand = {} # {word:number} kw_study = stemmer1.stem('study') around_kw_study = {} # {word:number} kw_introduction = stemmer1.stem('introduction') around_kw_introduction = {} # {word:number} kw_explain = stemmer1.stem('explain') around_kw_explain = {} # {word:number} kw_principle = stemmer1.stem('principle') around_kw_principle = {} # {word:number} kw_interpret = stemmer1.stem('interpret') around_kw_interpret = {} # {word:number} kw_difference = stemmer1.stem('difference')
import nltk
from nltk.stem.porter import PorterStemmer
import csv
from collections import defaultdict

# Script: read the 'word' column of text_sensibility.csv, Porter-stem every
# token in it, print the stems and write them back to the same file.

columns = defaultdict(list)  # each value in each column is appended to a list

with open('text_sensibility.csv', newline='') as csvfile:
    spamreader = csv.DictReader(csvfile, delimiter=';')
    for row in spamreader:
        for (k, v) in row.items():
            columns[k].append(v)
# print(' '.join(columns['word']))
# (the `with` block closes the file; no explicit close() is needed)

filtered = nltk.word_tokenize(' '.join(columns['word']))

porter = PorterStemmer()  # one stemmer for all tokens
stemmed = [porter.stem(f) for f in filtered]

print(stemmed)

# NOTE(review): this overwrites the input file, without a header row and with
# the csv module's default ',' delimiter (the file was read with ';').
# Confirm that this is intended.
with open("text_sensibility.csv", "w", newline='') as to_file:
    writer = csv.writer(to_file)
    for new_row in stemmed:
        # writerow() expects a sequence of fields. Passing the bare string
        # would split the stem into one column per character.
        writer.writerow([new_row])
def count_word_overlap(sent1, sent2):
    """Return the stems shared by two sentences and how many there are.

    Both sentences are passed through ``preprocess`` and every resulting
    word is Porter-stemmed before the comparison.

    :param sent1: first sentence, in whatever form ``preprocess`` accepts
    :param sent2: second sentence, in whatever form ``preprocess`` accepts
    :return: tuple ``(shared_stems, count)`` where ``shared_stems`` is a set
    """
    to_stem = PorterStemmer().stem
    first_stems = {to_stem(token) for token in preprocess(sent1)}
    second_stems = {to_stem(token) for token in preprocess(sent2)}
    shared = first_stems & second_stems
    return shared, len(shared)
def clean_stemmer(titles):
    """Return a new list holding the Porter stem of every item in *titles*.

    Each item is handed to ``PorterStemmer.stem`` as a whole; the input
    iterable is not modified.
    """
    porter = PorterStemmer()
    return [porter.stem(title) for title in titles]
import feedparser import nltk import sys from nltk.stem.porter import PorterStemmer if int(sys.version[0]) >= 3: from bs4 import BeautifulSoup Parser = BeautifulSoup else: import BeautifulSoup Parser = BeautifulSoup.RobustHTMLParser nltk_ver = tuple([int(_) for _ in nltk.__version__.split('.')]) if nltk_ver >= (3, 2, 2): stem = PorterStemmer().stem else: stem = PorterStemmer().word_stem import numpy as np import config def get_keys(): fp = open('keyword_list', 'r') keys = [] origkeys = [] for l in fp: fullstr = l.rstrip() if len(fullstr) == 0: continue words = nltk.wordpunct_tokenize(fullstr) #fullstr.split(' ')
# AFINN score of the first review
afinn = Afinn()
print(afinn.score(lines1))

# AFINN score of a second review
neg_review = (glob.glob(""))[20]
# Read the file that was just selected (the original re-opened `pos_review`
# and left `neg_review` unused).
with open(neg_review, 'r') as f:
    lines2 = f.readlines()[0]
afinn = Afinn()
print(afinn.score(lines2))

# NRC lexicon: keep only rows without any zero entry
# NOTE(review): read_csv() is called without a path here and glob.glob("")
# above has an empty pattern - both look like elided placeholders; fill in.
NRC = pd.read_csv()
NRC = NRC[(NRC != 0).all(axis=1)]
NRC = NRC.reset_index(drop=True)

tokenizer = RegexpTokenizer(r'[\w]+')  # raw string: '\w' is a regex escape
stop_words = stopwords.words('english')
p_stremmer = PorterStemmer()

# NOTE(review): `line1` is not defined in the visible code (only `lines1`
# is) - confirm which variable is meant.
raw = line1.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in stop_words]

# Build the lexicon vocabulary once instead of once per token
nrc_words = set(NRC[0])
match_words = [x for x in stopped_tokens if x in nrc_words]

# Collect the column-1 entries of every lexicon row whose word matched
emotion = []
for i in match_words:
    temp = list(NRC.iloc[np.where(NRC[0] == i)[0], 1])
    emotion.extend(temp)

# Series has no `value_count` method; `value_counts` tallies each emotion
sentiment_result1 = pd.Series(emotion).value_counts()
sentiment_result1.plot.bar()
def text_cleaning_titles():
    """Tokenize, filter and stem every listing title in ``datasetMap``.

    For each key (the listing title) of the module-level ``datasetMap`` the
    title is tokenized, stop words and non-alphabetic tokens are removed and
    the remaining tokens are Porter-stemmed. Every distinct stem is appended
    to the module-level ``forGephi`` list. Finally each title keeps only the
    stems that are at least 2 characters long and occur more than 5 times
    across all titles; the result is stored back into ``datasetMap[key]``.

    Returns nothing; ``datasetMap`` and ``forGephi`` are modified in place.
    """
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    # Ad-hoc stopwords often appearing in listings (raw and stemmed forms)
    stop.update([
        'get', 'use', 'good', 'best', 'custom', 'list', 'free', 'send',
        'ship', 'onion', 'feedback', 'qualiti', 'quality', 'grams', 'mg',
        'gr', 'address', 'order', 'pleas', 'price', 'product', 'check',
        'discuss', 'name', 'shipping', 'one', 'track', 'day', 'time',
        'packag',
    ])

    frequency = defaultdict(int)
    for key in datasetMap:
        currentTitle = key  # the cleaning is performed on the title itself

        # Tokenizing; numerical / non-alphabetic tokens are dropped
        tokens = nltk.word_tokenize(currentTitle)
        tokens_nostop = [
            token for token in tokens
            if token not in stop and token.isalpha()
        ]

        # Stemming; a stem can itself be a stop word, so filter again
        stems = []
        for token in tokens_nostop:
            word = stemmer.stem(token)
            if word in stop:
                continue
            stems.append(word)
            frequency[word] += 1
        datasetMap[key] = stems

    forGephi.extend(frequency)

    # Now discard too-rare stems (unique tokens, typos) and 1-letter stems
    for key in datasetMap:
        datasetMap[key] = [
            word for word in datasetMap[key]
            # at least 2 letters long and appearing more than 5 times
            if len(word) > 1 and frequency[word] > 5
        ]

    # Parenthesized form prints the same text under Python 2 and Python 3
    print("Text cleaning completed.\n")
def UrlCheck(e):
    """Fetch the page at URL *e* and show basic statistics in the GUI labels.

    The URL must have a scheme, a network location and a path; otherwise a
    warning dialog is shown and nothing else happens. For a valid URL the
    function updates:

    * ``l2``  - number of distinct ``href`` values of the page's ``<a>`` tags
    * ``l7``  - length of ``get_fld(e)``
    * ``l4``  - number of whitespace-separated words of the visible text
    * ``key1`` .. ``key9`` - the most frequent word stems, most common first

    Side effects: sets the globals ``linksabc`` (distinct links) and
    ``stemmed`` (list of stems of all kept words).

    :param e: URL string entered by the user
    """
    global stemmed
    global linksabc

    result = urlparse(e)
    if not all([result.scheme, result.netloc, result.path]):
        print(showwarning("Alert" , "No such website or url exists"))
        return

    # Drop a single trailing slash before requesting the page
    if e.endswith('/'):
        e = e[:-1]
    page = requests.get(e)

    # --- links -----------------------------------------------------------
    soup = BeautifulSoup(page.text, features="html.parser")
    links = [link.get('href') for link in soup.find_all('a')]
    linksabc = list(dict.fromkeys(links))  # de-duplicate, keep first-seen order
    l2.config(text=len(linksabc))
    l7.config(text=len(get_fld(e)))

    # --- visible text ----------------------------------------------------
    soup = BeautifulSoup(page.content, 'html.parser')
    text = soup.find_all(text=True)
    # Text nodes whose parent tag is listed here are not counted as content
    blacklist = {'[document]', 'noscript', 'header', 'html', 'meta', 'head',
                 'input', 'script', 'style', 'li', 'b', 'href', 'div', 'th'}
    output = ''.join('{} '.format(t) for t in text
                     if t.parent.name not in blacklist)
    l4.config(text=len(output.split()))

    # --- keyword extraction ----------------------------------------------
    tokens = word_tokenize(output)
    # lower-case, then strip punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.lower().translate(table) for w in tokens]
    # keep purely alphabetic tokens that are not English stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    res = [key for key, value in Counter(stemmed).most_common()]

    # zip() stops at the shorter sequence, so a page with fewer than nine
    # distinct stems fills only the labels it can instead of raising
    # IndexError.
    key_labels = (key1, key2, key3, key4, key5, key6, key7, key8, key9)
    for label, keyword in zip(key_labels, res):
        label.config(text=keyword)
def text_cleaning_descriptions():
    """Tokenize, filter and stem every listing description in ``datasetMap``.

    For each key of the module-level ``datasetMap`` the stored description is
    tokenized, stop words are removed and the remaining tokens are
    Porter-stemmed. Each description then keeps only the stems that are at
    least 3 characters long and occur more than 5 times across all
    descriptions; the result replaces ``datasetMap[key]``.

    Returns nothing; ``datasetMap`` is modified in place.
    """
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    # Ad-hoc stopwords often appearing in listings (raw and stemmed forms)
    stop.update([
        'get', 'use', 'aaa', 'good', 'best', 'custom', 'list', 'free',
        'send', 'ship', 'onion', 'feedback', 'qualiti', 'quality', 'grams',
        'address', 'order', 'pleas', 'price', 'product', 'check', 'discuss',
        'name', 'shipping', 'one', 'track', 'day', 'time', 'packag',
    ])

    frequency = defaultdict(int)
    for key in datasetMap:
        currentDescription = datasetMap[key]

        # Tokenizing and stop-word removal
        tokens = nltk.word_tokenize(currentDescription)
        tokens_nostop = [token for token in tokens if token not in stop]

        # Stemming; a stem can itself be a stop word, so filter again
        stems = []
        for token in tokens_nostop:
            word = stemmer.stem(token)
            if word in stop:
                continue
            stems.append(word)
            frequency[word] += 1
        datasetMap[key] = stems

    # Now discard too-rare stems (unique tokens, typos) and very short stems
    for key in datasetMap:
        datasetMap[key] = [
            word for word in datasetMap[key]
            # at least 3 letters long and appearing more than 5 times
            if len(word) > 2 and frequency[word] > 5
        ]

    # Parenthesized form prints the same text under Python 2 and Python 3
    print("Text cleaning completed.\n")
def create(request):
    """Django view: turn a posted paragraph into a narrated slideshow video.

    On POST the view reads ``request.POST['parag']`` and then:

    1. picks the most frequent non-stop-word token as ``title`` and downloads
       images for the two most frequent tokens;
    2. builds an extractive summary by ranking sentences with PageRank over
       a sentence-similarity matrix and writes it to
       ``visualizer/input/op.tsv``;
    3. downloads more images for each (cleaned, stemmed) search query;
    4. assembles the downloaded ``.jpg`` files into a video, adds a
       text-to-speech voice track plus a background track chosen by
       ``predict()``, and saves the result as a ``Video`` record;
    5. redirects to the stored video file's URL.

    On GET it renders ``index.html``.

    Relies on helpers defined outside this function (``downloadimages``,
    ``build_similarity_matrix``, ``predict``, ``language``, ``Video``).
    """
    if request.method == 'POST':
        data = request.POST['parag']
        paragraph = data  # untouched copy, stored on the Video record below
        text = data.replace('\n', '')
        data = text
        # Newlines were removed above, so this split yields a single element.
        for k in text.split("\n"):
            text2 = re.sub(r"[^a-zA-Z0-9&]+", ' ', k)
        text = text2
        tokens = [t for t in text.split()]
        sr = stopwords.words('english')  # not used again in this function
        clean_tokens = tokens[:]
        # Removes one occurrence from the copy per stop-word occurrence; the
        # stop-word list is re-read for every token.
        for token in tokens:
            if token in stopwords.words('english'):
                clean_tokens.remove(token)
        freq = nltk.FreqDist(clean_tokens)
        s = [(k, freq[k]) for k in sorted(freq, key=freq.get, reverse=True)]
        # Indexing below needs at least one (title) and two (search query)
        # distinct tokens; fewer raise IndexError.
        title = s[0][0]
        search_queries = [
            sorted(freq.items(), key=lambda kv:
                   (kv[1], kv[0]), reverse=True)[0][0] + " " +
            sorted(freq.items(), key=lambda kv:
                   (kv[1], kv[0]), reverse=True)[1][0]
        ]
        for query in search_queries:
            downloadimages(query, title)
        stop_words = stopwords.words('english')
        summarize_text = []
        # Step 1 - Read text and split it
        article = data.split(". ")
        sentences = []
        sentences_list = ''
        count_sentence = 0
        for sentence in article:
            count_sentence = count_sentence + 1
            # str.replace treats the pattern literally (it is not a regex),
            # so this only changes text containing "[^a-zA-Z]" verbatim.
            sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
        sentences.pop()  # drops the last split segment
        top_n = int(count_sentence / 3)  # keep about a third of the sentences
        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_martix = build_similarity_matrix(
            sentences, stop_words)
        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)
        # Step 4 - Sort by rank and pick the top sentences
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))
        # Step 5 - Output the summarized text
        m = 1
        # Driver Code
        with open("visualizer/input/op.tsv", "w") as text_file:
            text_file.write("content" + "\t" + "val" + '\n')
            for i in summarize_text:
                # Sentences are concatenated without a separator.
                sentences_list = sentences_list + i
                search_queries.append(i)
                text_file.write(i + "\t" + str(m) + '\n')
                m = m + 1
        emotion = predict()
        # Clean every query (letters only, lower case, no stop words,
        # stemmed) and download images for it.
        for query in search_queries:
            review = re.sub('[^a-zA-Z]', ' ', query)
            review = review.lower()
            review = review.split()
            ps = PorterStemmer()
            review = [
                ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))
            ]
            review = ' '.join(review)
            downloadimages(review, title)
        fps = 0.2
        file_list = glob.glob(
            'visualizer/images/' + title +
            '/*.jpg')  # Get all the jpgs in the title's image directory
        file_list_sorted = natsorted(file_list,
                                     reverse=False)  # Sort the images
        clips = [ImageClip(m).set_duration(5) for m in file_list_sorted]
        concat_clip = concatenate(clips, method="compose")
        concat_clip.write_videofile("visualizer/output/project.mp4", fps=fps)
        # Remove the downloaded image files once the video has been written;
        # errors are printed and otherwise ignored.
        folder = 'visualizer/images/' + title + '/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                #elif os.path.isdir(file_path): shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        textClip = gTTS(text=sentences_list, lang=language, slow=False)
        textClip.save("visualizer/output/voice.mp3")
        audioclip = AudioFileClip("visualizer/output/voice.mp3")
        my_clip = VideoFileClip('visualizer/output/project.mp4')
        # Background track is selected by the predicted emotion's name
        audio_background = AudioFileClip('visualizer/emotions/' + emotion +
                                         '.mp3')
        # Mix quiet background (0.08) under the full-volume voice track
        new_audioclip = CompositeAudioClip(
            [audio_background.volumex(0.08),
             audioclip.volumex(1)])
        final_audio = CompositeAudioClip([new_audioclip])
        audio = afx.audio_loop(final_audio, duration=audioclip.duration)
        final_clip = my_clip.set_audio(audio)
        # NOTE(review): `title` comes from user text and is used in file
        # paths here and above - confirm it cannot contain path separators.
        final_clip.write_videofile("visualizer/output/" + title + '.mp4')
        data = title
        file_path = 'visualizer/output/' + data + '.mp4'
        video = Video()
        video.data = paragraph
        video.name = data
        video.videofile = file_path
        video.save()
        return redirect(video.videofile.url)
    if request.method == 'GET':
        return render(request, 'index.html')
def single_meteor_score(
    reference,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet,
    alpha=0.9,
    beta=3,
    gamma=0.5,
):
    """
    Calculates METEOR score for single hypothesis and reference as per
    "Meteor: An Automatic Metric for MT Evaluation with HighLevels of
    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
    in Proceedings of ACL.
    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'

    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'


    >>> round(single_meteor_score(reference1, hypothesis1),4)
    0.7398

    If no words match during the alignment the method returns a score of 0.
    A zero is returned instead of raising a division by zero error, as no
    match usually implies a bad translation.

    >>> round(single_meteor_score('this is a cat', 'non matching hypothesis'),4)
    0.0

    :param reference: reference sentence
    :type reference: str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: str
    :param preprocess: preprocessing function (default str.lower)
    :type preprocess: method
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :type wordnet: WordNetCorpusReader
    :param alpha: parameter for controlling relative weights of precision and recall.
    :type alpha: float
    :param beta: parameter for controlling shape of penalty as a function
                 of fragmentation.
    :type beta: float
    :param gamma: relative weight assigned to fragmentation penalty.
    :type gamma: float
    :return: The sentence-level METEOR score.
    :rtype: float
    """
    enum_hypothesis, enum_reference = _generate_enums(
        hypothesis, reference, preprocess=preprocess
    )
    hypothesis_len = len(enum_hypothesis)
    reference_len = len(enum_reference)

    matches, _unmatched_hyp, _unmatched_ref = _enum_align_words(
        enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet
    )
    n_matches = len(matches)

    # Any zero denominator (empty sentence, no matches, degenerate alpha)
    # means there is nothing to score.
    try:
        precision = float(n_matches) / hypothesis_len
        recall = float(n_matches) / reference_len
        weighted = alpha * precision + (1 - alpha) * recall
        fmean = (precision * recall) / weighted
        n_chunks = float(_count_chunks(matches))
        fragmentation = n_chunks / n_matches
    except ZeroDivisionError:
        return 0.0

    # Fragmentation penalty grows with the share of chunks per match
    return (1 - gamma * fragmentation ** beta) * fmean
names=["label", "message"]) # the above note pad is 2 parts 1st column represent the lable spam or ham # then the dependent var and independent var is is sepereated by one tab so /t #and there is no column name so im forcingly specifying 2 heading . 1st is lable and 2nd is message # now data cleaning and pre processssssssssngs import nltk import re from nltk.stem.porter import PorterStemmer from nltk.stem import WordNetLemmatizer #nltk.download('stopwords') from nltk.corpus import stopwords ps = PorterStemmer() # stemming purpose lem = WordNetLemmatizer() corpus = [] for i in range(0, len(mail)): review = re.sub( '[^a-zA-z]', ' ', mail['message'][i]) # space is given in 2nd parameter of sub review = review.lower() review = review.split() review = [ lem.lemmatize(word) for word in review if not word in stopwords.words('english') ]
def meteor_score(
    references,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet,
    alpha=0.9,
    beta=3,
    gamma=0.5,
):
    """
    Calculates METEOR score for hypothesis with multiple references as
    described in "Meteor: An Automatic Metric for MT Evaluation with
    HighLevels of Correlation with Human Judgments" by Alon Lavie and
    Abhaya Agarwal, in Proceedings of ACL.
    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    In case of multiple references the best score is chosen. This method
    iterates over single_meteor_score and picks the best pair among all
    the references for a given hypothesis

    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
    >>> hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'

    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
    >>> reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
    >>> reference3 = 'It is the practical guide for the army always to heed the directions of the party'

    >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
    0.7398

    If no words match during the alignment the method returns a score of 0.
    A zero is returned instead of raising a division by zero error, as no
    match usually implies a bad translation.

    >>> round(meteor_score(['this is a cat'], 'non matching hypothesis'),4)
    0.0

    :param references: reference sentences
    :type references: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: str
    :param preprocess: preprocessing function (default str.lower)
    :type preprocess: method
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :type wordnet: WordNetCorpusReader
    :param alpha: parameter for controlling relative weights of precision and recall.
    :type alpha: float
    :param beta: parameter for controlling shape of penalty as a function
                 of fragmentation.
    :type beta: float
    :param gamma: relative weight assigned to fragmentation penalty.
    :type gamma: float
    :return: The sentence-level METEOR score.
    :rtype: float
    """
    # Score the hypothesis against every reference, then keep the best one
    per_reference_scores = [
        single_meteor_score(
            candidate_reference,
            hypothesis,
            preprocess=preprocess,
            stemmer=stemmer,
            wordnet=wordnet,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
        )
        for candidate_reference in references
    ]
    return max(per_reference_scores)
nltk.download('treebank')
nltk.download('wordnet')
nltk.download('punkt')

documentsPath = 'text'


class IndexBuilder:
    """Builds an index over every document found in a directory."""

    def __init__(self, path: str, preprocessor: TokenPreprocessor):
        """Remember where the documents live and how to preprocess tokens.

        :param path: directory containing the documents to index
        :param preprocessor: object whose ``preprocess(tokens)`` returns the
            tokens that should be indexed
        """
        self.__tokenPreprocessor = preprocessor
        # Plain assignment: the previous `self.__documentsPath: path` was a
        # bare annotation and never stored the path.
        self.__documentsPath = path

    def buildIndex(self, name):
        """Index every file in the documents directory and save the result.

        Each file is read, tokenized with ``nltk.word_tokenize`` and run
        through the preprocessor; every resulting token is registered for the
        document's 1-based id (its position in the directory listing).

        :param name: name handed to the ``Indexer``
        """
        indexer = Indexer(name)
        # Use this instance's path and preprocessor rather than module
        # globals, so the class also works when imported from elsewhere.
        documentsFileNames = os.listdir(self.__documentsPath)
        for docId, documentFileName in enumerate(documentsFileNames, start=1):
            documentFilePath = os.path.join(self.__documentsPath,
                                            documentFileName)
            # Documents are only read, so read-only mode is sufficient
            with open(documentFilePath, 'r') as fileHandler:
                content = fileHandler.read()
            tokens = self.__tokenPreprocessor.preprocess(
                nltk.word_tokenize(content))
            for token in tokens:
                indexer.add_word_to_document(token, docId)
        print(indexer)
        indexer.save_indexer_to_disk()


if __name__ == '__main__':
    tokenProcessor = TokenPreprocessor(PorterStemmer(mode='NLTK_EXTENSIONS'),
                                       stopwords.words('english'))
    # Argument order follows the constructor signature: (path, preprocessor)
    indexBuilder = IndexBuilder(documentsPath, tokenProcessor)
    indexBuilder.buildIndex('myIndex')
def stemming_words(self):
    """Store the Porter stem of every word in ``self.words``.

    The stems are written to ``self.stemmed_words`` as a list, in the same
    order as ``self.words``; ``self.words`` itself is left untouched.
    """
    to_stem = PorterStemmer().stem
    self.stemmed_words = list(map(to_stem, self.words))
# Count missing values in the 'keyword' and 'location' columns
train_nans = ds_train['keyword'].isnull().sum()
print(train_nans)
train_nans = ds_train['location'].isnull().sum()
print(train_nans)

# In[7]:

print(ds_train.shape[0])

# In[ ]:

# In[8]:

# Creating Corpus after preprocessing the training data
corpus = []
pstem = PorterStemmer()
# Load the stop words once; rebuilding the set for every word of every row
# is loop-invariant work.
english_stopwords = set(stopwords.words('english'))
for i in range(ds_train['text'].shape[0]):
    # Keep letters only, lower-case, split into words
    text = re.sub("[^a-zA-Z]", ' ', ds_train['text'][i])
    text = text.lower()
    text = text.split()
    # Drop stop words and stem what is left
    text = [pstem.stem(word) for word in text if word not in english_stopwords]
    text = ' '.join(text)
    corpus.append(text)

# In[9]:

#print((corpus))
def stem_all(self, sentence):
    """Return a list with the Porter stem of each word in *sentence*.

    :param sentence: iterable of words
    :return: list of stems, in input order
    """
    porter = PorterStemmer()
    stems = []
    for token in sentence:
        stems.append(porter.stem(token))
    return stems
import re


def rm_stopwords(tokens):
    '''
    Returns a list of the elements from the given list
    that aren't an english stopword
    '''
    # Load the stop-word list once per call (not once per token) and use a
    # set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]


### modified from csi4106 notebook 5
port = PorterStemmer()


def stemmer(tokens):  #sometimes case-folds
    '''
    Returns a list of stemmed elements based off the given list
    '''
    return [port.stem(t) for t in tokens]


###end

###modified from https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
lemmatizer = WordNetLemmatizer()
def __init__(self):
    """Create the instance and give it its own ``PorterStemmer``."""
    # Stored on a single-underscore (non-public) attribute
    self._porterStemmer = PorterStemmer()
# -*- coding: utf-8 -*- from nltk.stem.porter import PorterStemmer import json import re import sys ps = PorterStemmer() #path_to_vectors = '/path/to/numberbatch-en.txt' path_to_vectors = sys.argv[1] import numpy as np def dump_stemmed_vectors(filepath): vectors = [] with open(filepath, 'r', encoding="utf8") as myfile: vectors = myfile.readlines() vectors = [vector.strip() for vector in vectors] word_vector_dict = {} for word_vector in vectors: word = word_vector.split()[0].encode('ascii', 'ignore').decode("utf8") vector = word_vector.split()[1:] if '#' not in word and '_' not in word: word = ps.stem(word) if word in word_vector_dict: pass else: word_vector_dict[word] = vector with open('stemmed_vectors', 'w') as myfile: json.dump(word_vector_dict, myfile) def generate_in_correct_format(filename):
def tokenize(text):
    """Tokenize *text* with NLTK and return the Porter stem of each token.

    :param text: string to tokenize
    :return: list of stems, one per token, in token order
    """
    tokens = nltk.word_tokenize(text)
    # One stemmer for the whole text instead of a new instance per token
    stemmer = PorterStemmer()
    return [stemmer.stem(item) for item in tokens]