def find_npnp_patterns(self, wordpair, doc):
    """Collect NP-NP surface patterns linking the two words of ``wordpair``.

    Every sentence of ``doc`` whose word tokens include both
    ``wordpair.anaphor.token`` (X) and ``wordpair.antecedent.token`` (Y) is
    joined with single spaces and searched twice: once for X followed by Y
    and once for Y followed by X. Each hit yields an ``NPNP`` built from
    ``'<X>(' + between + ')<Y>'`` (or the ``<Y>...<X>`` mirror), where
    ``between`` is the text found between the two tokens (empty when adjacent).

    Args:
        wordpair: object exposing ``anaphor.token`` and ``antecedent.token``.
        doc: object exposing ``sentences``; each sentence exposes ``words``,
            and each word exposes ``token``.

    Returns:
        list of ``NPNP`` objects, in sentence order (X..Y before Y..X).
    """
    # TODO Make an easier switch from head and full NP?
    anaphor = wordpair.anaphor.token
    antecedent = wordpair.antecedent.token

    # Tokens are literal text, not regular expressions: escape them so that
    # tokens such as "(" or "U.S." neither crash re.compile nor over-match.
    # The patterns do not depend on the sentence, so build them once.
    x_then_y = re.compile(
        '(' + re.escape(anaphor) + ')(.*)(' + re.escape(antecedent) + ')')
    y_then_x = re.compile(
        '(' + re.escape(antecedent) + ')(.*)(' + re.escape(anaphor) + ')')

    ret = []
    for s in doc.sentences:
        tokens = [tok.token for tok in s.words]
        if anaphor not in tokens or antecedent not in tokens:
            continue

        sent_str = ' '.join(tokens)
        match1 = x_then_y.search(sent_str)
        match2 = y_then_x.search(sent_str)
        if match1:
            print(match1.group(1, 2, 3))
            ret.append(NPNP('<X>(' + match1.group(2) + ')<Y>'))
        if match2:
            print(match2.group(1, 2, 3))
            ret.append(NPNP('<Y>(' + match2.group(2) + ')<X>'))
    return ret
def findCurrency(text):
    """Print the currency and amount of every money expression in ``text``.

    Two shapes are recognised (case-insensitively):
      * a ``$`` or ``£`` sign followed by digits, with an optional ``bn``/``m``
        suffix, e.g. ``$50bn`` or ``£1,000``;
      * digits (optional ``bn``/``m``) followed by ``euro(s)``, ``dollar(s)``,
        ``pound(s)`` or ``p``, e.g. ``20 pounds``.

    Args:
        text: iterable of strings to scan.

    Returns:
        None. One report is printed per match.
    """
    money_re = re.compile(
        r'((?:(?:\$|£)(?:\d+)(?:\.?\d*,?\d{1,3})(?:bn|m)?)|'
        r'(?:(?:\d+)(?:\.?,?\d)*(?:bn|m)?(?: ?euros?| ?dollars?| ?pounds?| ?p)))',
        re.IGNORECASE)
    # The currency marker is either a leading sign or a trailing word. Remove
    # exactly that marker: str.strip() with a character set also ate the "n"
    # of "bn" and left a dangling space before the currency word.
    marker_re = re.compile(r'^[$£]| ?(?:euros?|dollars?|pounds?|p)$',
                           re.IGNORECASE)

    for i in text:
        for m in money_re.findall(i):
            # Same precedence as the original chain of ifs (last test wins).
            if re.search(r'euros?', m, re.IGNORECASE):
                currency = "Euro"
            elif re.search(r'£|pounds?|p', m, re.IGNORECASE):
                currency = "Pound"
            else:
                # money_re guarantees a marker, so only "$"/"dollar(s)" remain.
                currency = "Dollar"
            amount = marker_re.sub('', m)
            print("Found a match!" + "\nCurrency:", currency, "\nAmount:",
                  amount, "\n")
def prob(w1, w2):
    """Estimate how often ``w2`` directly follows ``w1`` in ``sents``.

    The estimate is sentence-based: (number of sentences in the module-level
    ``sents`` that contain the text ``"w1 w2"``) divided by (number of
    sentences that contain the text ``w1``). Matching is plain substring
    matching, so ``"cat"`` also counts inside ``"category"``.

    Args:
        w1: first word (converted with ``str``).
        w2: second word (converted with ``str``).

    Returns:
        float ratio, or ``0.0`` when no sentence contains ``w1``.
    """
    first = str(w1)
    bigram = first + " " + str(w2)

    # The words are literal text, so use substring tests rather than feeding
    # them to the regex engine (where "c++" or "U.S." would misbehave).
    count_w1 = sum(1 for s in sents if first in s)
    if count_w1 == 0:
        # No evidence for w1 at all: avoid dividing by zero.
        return 0.0
    count_w1_w2 = sum(1 for s in sents if bigram in s)
    return count_w1_w2 / float(count_w1)
def write_word_comment_position(data_p, input_word, input_re, register):
    """Write a CSV of where sentences matching ``input_re`` occur in comments.

    Output goes to ``data/positions/<input_word>_<register>.csv`` (spaces in
    ``input_word`` become underscores). For every comment in
    ``thread[register]`` of every thread, each paragraph in
    ``comment['text_paragraphs']`` is split with ``sent_tokenize``; every
    sentence matching ``input_re`` produces one row.

    Args:
        data_p: iterable of threads, each indexable by ``register``.
        input_word: word used to build the output file name.
        input_re: pattern (string or compiled) passed to ``re.search``.
        register: key selecting the comments of a thread; also part of the
            file name.

    Returns:
        None.
    """
    out_path = ("data/positions/" + input_word.replace(" ", "_") + "_" +
                register + ".csv")
    # newline="" is what the csv module requires of its file objects;
    # without it every row is followed by a blank line on Windows.
    with open(out_path, "w+", newline="") as raw:
        writer = csv.DictWriter(raw, fieldnames=fieldnames)
        writer.writeheader()
        for thread in data_p:
            for comment in thread[register]:
                # 1-based position, within the comment, of the first
                # sentence of the current paragraph.
                last_sentence = 1
                sentence_count = len(comment['text_sentences'])
                for paragraph in comment['text_paragraphs']:
                    sentence_tokenized = sent_tokenize(paragraph)
                    for sentence_i, sentence in enumerate(sentence_tokenized):
                        if not re.search(input_re, sentence):
                            continue
                        # Per-paragraph tokenisation may yield more sentences
                        # than 'text_sentences' holds; clamp to the total.
                        sent_pos = min(last_sentence + sentence_i,
                                       sentence_count)
                        writer.writerow({
                            'sentence_position': sent_pos,
                            'sentence_count': sentence_count,
                            'paragraph_position': sentence_i + 1,
                            'paragraph_length': len(sentence_tokenized)
                        })
                    last_sentence += len(sentence_tokenized)
def __call__(self, articles):
    """Tokenize ``articles`` and return the lemmas worth keeping.

    Each token from ``word_tokenize`` is lemmatized with ``self.wnl``. A lemma
    is kept only when it is not in ``stop_words.ENGLISH_STOP_WORDS`` and
    consists solely of ASCII letters.

    Args:
        articles: text to tokenize.

    Returns:
        list of kept lemmas, in token order, one entry per token.
    """
    lemmas = []
    for t in word_tokenize(articles):
        lemma = self.wnl.lemmatize(t)
        if lemma in stop_words.ENGLISH_STOP_WORDS:
            continue
        if re.search(r'^[a-zA-Z]+$', lemma) is None:
            continue
        # Append exactly once: the former second append either doubled every
        # lemma or let filtered tokens through.
        lemmas.append(lemma)
    return lemmas
def tokenize(text):
    """Return the NLTK word tokens of ``text`` that contain a word character.

    Tokens made up only of punctuation (e.g. ``","`` or ``"..."``) are
    dropped. Punctuation inside a kept token (e.g. ``"n't"``) is left as is.

    Args:
        text: string to tokenize.

    Returns:
        list of token strings, in their original order.
    """
    # Use the standard-library module directly; ``from nltk import re`` only
    # works because NLTK happens to leak that name from its own imports.
    import re

    from nltk import word_tokenize

    return [word for word in word_tokenize(text) if re.search(r"\w", word)]
def get_text(self, url):
    """Download ``url`` and return the text of its ``<div id="content">``.

    Text nodes of every matching div are emitted in document order, each
    followed by a blank line. Within a div, collection stops at the first
    text node containing the character ``▶``; the next div (if any) is still
    processed.

    Args:
        url: address of the page to fetch.

    Returns:
        str with the concatenated text (empty when no such div exists).

    Raises:
        requests.exceptions.RequestException: on network failure or when the
            server does not answer within the timeout.
    """
    # Without a timeout requests can block forever on an unresponsive server.
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')

    pieces = []
    for item in soup.find_all('div', {'id': 'content'}):
        for text in item.find_all(text=True):
            if '▶' in text:
                break
            pieces.append(text + "\n\n")
    # Join once instead of rebuilding the string for every text node.
    return ''.join(pieces)
def Select_Keywords_And_Phrases(tagged_data):
    """Select keywords and key phrases from section-labelled, POS-tagged data.

    Args:
        tagged_data: iterable of ``(section, tokens)`` pairs. ``section`` must
            be one of ``"title"``, ``"meta"``, ``"h"`` or ``"contents"``;
            ``tokens`` is a list of ``(word, pos_tag)`` tuples.

    Returns:
        ``(keywords, key_phrases)``: two lists of lemmatized strings. Title
        and meta data are added directly; further keywords come from weighted
        word frequencies and up to seven further phrases from the most
        frequent noun phrases.

    Raises:
        KeyError: if a section label is not one of the four known ones.
    """
    # NOTE(review): nesting and the line breaks inside ``grammar`` were
    # reconstructed from a whitespace-collapsed source -- confirm against the
    # original file.

    # Variables used for keyword/phrase selection.
    tag_weightings = {"title": 5, "meta": 5, "h": 3, "contents": 1}
    stop_words = list(stopwords.words('english'))
    grammar = """
        NBAR:
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}
    """
    # The grammar never changes, so build the chunker once.
    chunker = nltk.RegexpParser(grammar)

    # Variables to store information for selection.
    keywords = []
    key_phrases = []
    word_frequency = {}
    phrase_frequency = {}

    # Iterate through the pairs of data, keeping track of the weighting modifier.
    for pair in tagged_data:
        tag = pair[0]
        data = pair[1]
        weighted_value = tag_weightings[tag]

        # If meta or title data, add it to the keyphrases/keywords lists.
        if tag in ["title", "meta"]:
            if len(data) == 1:
                keywords.append(Lemmatize(data[0][0], data[0][1]))
            else:
                # ``pos`` rather than ``tag``: do not clobber the section label.
                key_phrase = " ".join(
                    Lemmatize(word, pos) for word, pos in data)
                if key_phrase not in key_phrases:
                    key_phrases.append(key_phrase)

        # Use POS patterns to find noun phrases, then add weighted values to
        # phrase frequency.
        result = chunker.parse(data)
        for subtree in result.subtrees():
            if subtree.label() != "NP":
                continue
            np_leaves = subtree.leaves()
            # Only phrases of two to four words are counted.
            if not (1 < len(np_leaves) < 5):
                continue
            noun_phrase = " ".join(
                Lemmatize(word, pos) for word, pos in np_leaves)
            modifier = 1
            if len(np_leaves) == 3:
                modifier = weighted_value * 3
            phrase_frequency[noun_phrase] = (
                phrase_frequency.get(noun_phrase, 0.0) + 1.0 * modifier)

        # Ignore stop-words, check for selected POS tag, then add weighted
        # values to word frequency.
        no_punctuation_data = [(word.lower(), pos) for word, pos in data
                               if re.search(r"\w", word)]
        for word, pos in no_punctuation_data:
            if word not in stop_words and pos[0:2] in ["NN", "VB", "JJ"]:
                lemma = Lemmatize(word, pos)
                word_frequency[lemma] = (
                    word_frequency.get(lemma, 0.0) + 1.0 * weighted_value)

    # Find values from the word frequencies to select keywords.
    frequent = [value for value in word_frequency.values() if value > 1]
    # With no word above 1 there is no average to compare against (this used
    # to raise ZeroDivisionError), so no frequency-based keyword is chosen.
    if frequent:
        count = len(frequent)
        average = sum(frequent) / count
        cap = count / average

        # Select keywords from the frequencies.
        for key, value in word_frequency.items():
            if average < value < cap and len(key) > 3 and key not in keywords:
                keywords.append(key)

    # Select key phrases from the frequencies (at most seven new ones).
    tracker = 0
    for key, value in reversed(
            sorted(phrase_frequency.items(), key=operator.itemgetter(1))):
        if tracker >= 7:
            break
        if key not in key_phrases:
            key_phrases.append(key)
            tracker += 1

    return keywords, key_phrases
This is a temporary script file. """ import nltk from nltk import re, word_tokenize, FreqDist, MLEProbDist, probability #open an document f1 = open('a01_data\sampledata.txt') dataRaw = f1.read() f2 = open('a01_data\sampledata.vocab.txt') vocabRaw = f2.read() # calculate the frequence distribution dataRaw_tokens_nopunct = [ word for word in word_tokenize(dataRaw) if re.search("\w", word) ] for elem in dataRaw_tokens_nopunct: if elem == 's': dataRaw_tokens_nopunct.remove(elem) for elem in dataRaw_tokens_nopunct: if elem == '/s': dataRaw_tokens_nopunct.remove(elem) dataRaw_fdist = FreqDist(dataRaw_tokens_nopunct) ##xx = dataRaw_fdist.most_common() vocabRaw_tokens_nopunct = [ word for word in word_tokenize(vocabRaw) if re.search("\w", word) ] # calculate the possibility distribution
def processURL(url):
    """Fetch ``url``, run the text pipeline and save every stage to a file.

    Stages, each written to ``File <count> <stage>.txt`` in the current
    directory: HTML parsing, tokenization/normalization, stemming, stop-word
    removal, POS tagging and per-item counts. The module-level ``count`` is
    incremented once per call, and the stop-word-filtered token list is
    appended to the module-level ``wordList``.

    Args:
        url: address of the page to process.

    Returns:
        list of stemmed, lower-cased tokens with English stop words removed.
    """
    global count
    global wordList

    def _save(file_label, printed_label, payload):
        # Write one stage's output. The context manager closes the file, so
        # the data is flushed to disk and the handle is not leaked.
        with open('File ' + str(count) + file_label + '.txt', 'w') as out:
            out.write(payload)
        print("File " + str(count) + printed_label + ".txt saved")

    # html page parsing
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")

    # remove javascript and css style from the parsed text.
    for js in soup(["script", "style"]):
        js.decompose()

    count += 1
    text = soup.get_text()

    # process parsed text: trim each line, split it into phrases and drop the
    # empty ones.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # after each part of the processing, produce a file output for the document
    _save(' HTML parsing output ', ' HTML Parsing output', text)

    # Sentence splitting, tokenization and normalization: tokenize, then keep
    # lower-cased tokens that contain a word character.
    tokens = word_tokenize(text)
    tokens_nopunct = [word.lower() for word in tokens
                      if re.search(r"\w", word)]
    _save(' SS,Tokenization,Normalization output',
          ' SS,Tokenization,Normalization output', str(tokens_nopunct))

    # Stemming (reduce each word to its stem)
    tokens_nopunct = [stem(word) for word in tokens_nopunct]
    _save(' Stemming output', ' Stemming output', str(tokens_nopunct))

    # remove stopwords
    # https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens_nopunct if w not in stop_words]
    _save(' Stopwords Removed', ' Stopwords Removed', str(filtered))

    # postagging
    tagged = nltk.pos_tag(filtered)
    wordList.append(filtered)
    _save(' PosTagging output', ' PosTagging output', str(tagged))

    # count occurrences of each item in the document
    # https://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item
    counts = [(i, len(list(c))) for i, c in groupby(sorted(flatten(tagged)))]
    _save(' Count Output', ' Count Output', str(counts))

    return filtered
def text_operation(self, text):
    """
    Normalize a list of tokens, handling each kind of token by its own rule.

    Tokens are processed left to right. Depending on its shape a token is
    cleaned, expanded (hashtags, URLs), merged with the following token
    (dates, percentages, entities) or replaced (covid variants, number words).
    ``text`` is modified in place: cleaned tokens are written back to their
    position.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.

    :param text: list of tokens.
    :return: list of tokens after parsing.
    """
    len_text = len(text)
    tokenAfterParse = []
    counter = -1  # index of the current token within ``text``
    for term in text:
        counter = counter + 1
        # ``per`` means "skip this token": it was already merged into the
        # previous result (see the percentage case below).
        if self.per == True:
            self.per = False
            continue
        # ``per2`` means "skip this token and the next one".
        # NOTE(review): per2 is not set in this method -- presumably by one of
        # the helper methods; verify.
        if self.per2 == True:
            self.per = True
            self.per2 = False
            continue
        if term == " " or term == '':
            continue
        # Strip trailing characters that are punctuation, have a code point
        # below 48 or above 127 -- unless the token ends with '%'.
        if term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(
                term[-1]
        ) > 127:  # to remove anything that is not a word or number
            if term[-1] != '%':
                while term[-1] in string.punctuation or ord(
                        term[-1]) < 48 or ord(term[-1]) > 127:
                    term = term[:-1]
                    if term == "":
                        # nothing left; stop before term[-1] fails
                        break
                if term == "":
                    continue
                # text[counter] = term  FIXME: the write-back happens further down
        # Remove emoji and other unwanted characters inside the term: keep
        # only characters with code points 35..126.
        term = ''.join(
            [l for l in term if ord(l) < 127 and ord(l) > 34]
        )
        if len(term) < 2:
            continue
        text[counter] = term  # write the cleaned token back for look-ahead users
        # hashtag & tags cases:
        if term[0] in string.punctuation:
            if term[0] == '#' and len(term) > 2:
                # FIXME (from the author): why add hashtags of length 2?
                # A disabled branch used to emit such short hashtags as-is.
                words = self.hashtag_tokenize(
                    term[1:]
                )  # per the author: splits the words and adds the original hashtag in lower case
                tokenAfterParse.extend(words)
                continue
            elif term[0] != '@':
                # Drop leading punctuation, but never shrink below one char.
                while term[0] in string.punctuation:
                    term = term[1:]
                    if len(term) < 2:
                        break
                text[counter] = term
        # (An earlier, disabled version of the block above also treated
        # characters with code points above 127 as leading noise.)
        # url case:
        if "http" in term:
            # Keep only the part starting at "http".
            term = term[term.find('http'):].strip()
            urls = self.url_Opretion(term)
            tokenAfterParse.extend(urls)
            continue
        # number cases - dates/percentage: the token starts with a digit and
        # contains no ASCII letter.
        if (term.isdigit() or term[0].isdigit()) and not (re.search('[a-zA-Z]', term)):
            if counter + 1 < len_text and term.isdigit():
                if text[counter + 1] in self.month_dict:  # Date
                    tokenAfterParse.append(
                        self.Date_Toknize(term, text, counter, len_text))
                    continue
                if text[counter + 1] == "percent" or text[
                        counter + 1] == "percentage" or text[
                            counter + 1] == "Percent" or text[
                                counter + 1] == "Percentage":  # %
                    # Merge "<number> percent" into one token and skip the
                    # word on the next iteration.
                    new_word = term + text[counter + 1]
                    tokenAfterParse.append(new_word)
                    self.per = True
                    continue
            new_number = self.numbeOpertion(term, text, counter, len_text)
            tokenAfterParse.append(new_number)
            continue
        # try to minimize the covid terms: every token starting with
        # covid/Covid/COVID becomes "covid19"
        if term.startswith('covid') or term.startswith(
                'Covid') or term.startswith('COVID'):
            tokenAfterParse.append('covid19')
            continue
        # check entity
        if counter + 1 < len_text:
            if term[0].isupper() and text[
                    counter + 1][0].isupper():  # two consecutive capitalized words
                name = self.entity(text, counter, len_text)
                tokenAfterParse.append(name)
                tokenAfterParse.append(term)
                continue
            if term in self.month_dict and text[counter + 1].isdigit():
                tokenAfterParse.append(
                    self.Date_Toknize(term, text, counter, len_text))
                continue
            # replace number words found in dict_numbers with their mapped value:
            elif term in self.dict_numbers.keys():
                term = self.dict_numbers[term]
        tokenAfterParse.append(term)
    return tokenAfterParse
#Members: Ishwar Venugopal [1906084], Shreya Jadhav [1702121])
#Task 4: Listing out the top 10 similar pairs
from nltk.corpus import wordnet
import nltk
from nltk import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from itertools import product
import operator
import pandas as pd

# Read the text; the context manager closes the file handle.
with open("text1.txt", encoding="utf8") as source_file:
    data = source_file.read()

# Tokenize, drop punctuation-only tokens, lower-case and lemmatize.
tokens = nltk.word_tokenize(data)
tokens_nopunct = [word for word in tokens if re.search(r"\w", word)]
tokens_lower = [x.lower() for x in tokens_nopunct]
lemmatizer = WordNetLemmatizer()
tokens_lem = [lemmatizer.lemmatize(x) for x in tokens_lower]
vocab = set(tokens_lem)

# Look up the first WordNet synset of every word once. The pair loop below is
# quadratic in the vocabulary size, so repeating the lookup there is wasteful.
first_synset = {}
for word in vocab:
    synsets = wordnet.synsets(word)
    if synsets:
        first_synset[word] = synsets[0]

# Path similarity between the first synsets of every ordered pair of words
# that both have a synset (a word is also paired with itself).
sims = []
for word1 in vocab:
    for word2 in vocab:
        if word1 in first_synset and word2 in first_synset:
            val = first_synset[word1].path_similarity(first_synset[word2])
            sims.append((word1, word2, val))
def valid_sents(sents, key):
    """Store in the global ``valid_sent`` the items of ``sents`` matching ``key``.

    Args:
        sents: iterable of strings.
        key: regular expression handed to ``re.search``.

    Returns:
        None. The result is published through the module-level ``valid_sent``.
    """
    global valid_sent

    def _matches(sentence):
        # Truthy when ``key`` is found anywhere in the sentence.
        return re.search(key, sentence)

    valid_sent = list(filter(_matches, sents))
import nltk
from nltk import word_tokenize
from nltk import re
from nltk.book import *

# ----------------------------- Moby Dick
moby_tokens = text1.tokens
# Lower-cased tokens that contain at least one word character. The raw string
# avoids the invalid "\w" escape sequence.
moby_tokens_sin_puntos = [
    palabra.lower() for palabra in moby_tokens if re.search(r"\w", palabra)
]

# Number of tokens in Moby Dick (punctuation excluded)
nro_tokens_sin_puntos = len(moby_tokens_sin_puntos)
print("1) Numero de tokens en Moby Dick: ", nro_tokens_sin_puntos)

# Number of unique words, i.e. number of TYPES
nro_types = len(set(moby_tokens_sin_puntos))
print("2) Numero de TYPES en Moby Dick: ", nro_types)

# Type/token ratio of Moby Dick
moby_type_token_ratio = nro_types / nro_tokens_sin_puntos
print("3) Type token ratio de Moby Dick: ", moby_type_token_ratio)

# ---------------------------- The WSJ corpus
wsj_tokens = text7.tokens
wsj_tokens_sin_puntos = [
    palabra.lower() for palabra in wsj_tokens if re.search(r"\w", palabra)
]
nro_tokens_wsj_sin_puntos = len(wsj_tokens_sin_puntos)
nro_types_wsj = len(set(wsj_tokens_sin_puntos))
wsj_type_token_ratio = nro_types_wsj / nro_tokens_wsj_sin_puntos
# Importing packages for tokenisation and lemmatization from nltk
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Tokenize the obtained text.
tokens = word_tokenize(text)
# Drop tokens without any word character, i.e. pure punctuation. The raw
# string avoids the invalid "\w" escape sequence.
tokens_nopunct = [word for word in tokens if re.search(r"\w", word)]
tokens_updated = tokens_nopunct  # current list of tokens

print("********************\n")
print("*** Lowercasing ***\n")
print("The number of tokens before lowercase:", len(tokens_updated))
print("The number of types before lowercase", len(set(tokens_updated)))
print("\n")

# Convert all tokens to lowercase.
tokens_lower = [x.lower() for x in tokens_updated]
tokens_updated = tokens_lower
print("The number of tokens after lowercase:", len(tokens_updated))
print("The number of types after lowercase", len(set(tokens_updated)))
print("\n")
def text_operation(self, text):
    """
    Normalize a list of tokens, handling each kind of token by its own rule.

    Tokens are processed left to right. Tokens containing "http" are dropped.
    The others are cleaned, expanded (hashtags), merged with the following
    token (dates, percentages, entities) or replaced (covid/corona variants,
    number words); ordinary words are stemmed before being emitted.
    ``text`` is modified in place where a cleaned token is written back.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.

    :param text: list of tokens.
    :return: list of tokens after parsing.
    """
    len_text = len(text)
    tokenAfterParse = []
    counter = -1  # index of the current token within ``text``
    for term in text:
        counter = counter + 1
        # ``per`` means "skip this token": it was already merged into the
        # previous result (see the percentage case below).
        if self.per == True:
            self.per = False
            continue
        # ``per2`` means "skip this token and the next one".
        # NOTE(review): per2 is not set in this method -- presumably by one of
        # the helper methods; verify.
        if self.per2 == True:
            self.per = True
            self.per2 = False
            continue
        # Blank tokens and anything containing a URL are ignored.
        if term == " " or term == '' or "http" in term:
            continue
        # Strip trailing characters that are punctuation, have a code point
        # below 48 or above 127 -- unless the token ends with '%'.
        if term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(
                term[-1]) > 127:  # to remove anything that is not a word or number in the end of the word
            if term[-1] != '%':
                while term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(term[-1]) > 127:
                    term = term[:-1]
                    if term == "":
                        # nothing left; stop before term[-1] fails
                        break
                if term == "":
                    continue
                # text[counter] = term  FIXME: the write-back happens further down
        # hashtag & tags cases:
        if term[0] in string.punctuation or ord(term[0]) > 127:
            if term[0] == '#' and len(term) > 2:
                words = self.hashtag_tokenize(
                    term[1:])  # per the author: splits the words and adds the original hashtag in lower case
                tokenAfterParse.extend(words)
                continue
            elif term[0] != '@':
                # Drop leading punctuation; stop once fewer than two
                # characters remain.
                while term[0] in string.punctuation:
                    term = term[1:]
                    if len(term) < 2:
                        break
                if term == "":
                    continue
                text[counter] = term
        # try to minimize the covid terms: collapse every covid*/corona*
        # spelling into one canonical token
        if term.startswith('covid') or term.startswith('Covid') or term.startswith('COVID'):
            tokenAfterParse.append('covid19')
            continue
        if term.startswith('corona') or term.startswith('Corona') or term.startswith('CORONA'):
            tokenAfterParse.append('corona')
            continue
        term = self.clean_word(term)
        # NOTE(review): clean_word apparently returns a list for terms that
        # should be dropped -- confirm against its definition.
        if isinstance(term, list):
            continue
        # number cases - dates/percentage: the token starts with a digit and
        # contains no ASCII letter.
        if (term.isdigit() or term[0].isdigit()) and not (re.search('[a-zA-Z]', term)):
            if counter + 1 < len_text and term.isdigit():
                if text[counter + 1] in self.month_dict:  # Date
                    tokenAfterParse.append(self.Date_Toknize(term, text, counter, len_text))
                    continue
                if text[counter + 1] == "percent" or text[counter + 1] == "percentage" or text[
                        counter + 1] == "Percent" or text[counter + 1] == "Percentage":  # %
                    # Merge "<number> percent" into one token and skip the
                    # word on the next iteration.
                    new_word = term + text[counter + 1]
                    tokenAfterParse.append(new_word)
                    self.per = True
                    continue
            new_number = self.numbeOpertion(term, text, counter, len_text)
            tokenAfterParse.append(new_number)
            continue
        # check entity
        if counter + 1 < len_text:
            if term[0].isupper() and text[counter + 1][0].isupper():  # two consecutive capitalized words
                name = self.entity(text, counter, len_text)
                tokenAfterParse.append(name)
                tokenAfterParse.append(term)
                continue
            if term in self.month_dict and text[counter + 1].isdigit():
                tokenAfterParse.append(self.Date_Toknize(term, text, counter, len_text))
                continue
            # replace number words found in dict_numbers with their mapped value:
            elif term in self.dict_numbers.keys():
                term = self.dict_numbers[term]
        # Everything that reaches this point is stemmed before being emitted.
        term = self.stemmer.stem(term)
        tokenAfterParse.append(term)
    return tokenAfterParse
# Parse the downloaded page with BeautifulSoup and collect the text of its
# display elements -- all h1 headings, then all paragraphs, then all h2
# headings -- into the variable rel.
s = BeautifulSoup(rw, 'html.parser')
# Join the element texts with a space. Plain concatenation glued the last
# word of one element to the first word of the next ("TitleFirst"), which
# corrupted the tokens.
rel = " ".join(
    relevance.text
    for tag_name in ('h1', 'p', 'h2')
    for relevance in s.find_all(tag_name)
)

# rel_tokens_nopunct holds the tokens of rel that contain at least one word
# character (pure punctuation is dropped). Taking set() of it gives the
# unique types.
rel_tokens_nopunct = [
    word for word in word_tokenize(rel) if re.search(r"\w", word)
]
print("The length of tokens from the url before Lemmatization are:",
      len(rel_tokens_nopunct))
print("\n\nThe length of types from the url before Lemmatization are:",
      len(set(rel_tokens_nopunct)))
print("\n\nThe tokens contained in the website:" + url +
      " before Lemmatization are:\n\n")
print(rel_tokens_nopunct)
print("\n\nThe types contained in the website:" + url +
      " before Lemmatization are:\n\n")
print(set(rel_tokens_nopunct))

# Lemmatize every token as a verb (pos='v') and store the results in l.
lem = WordNetLemmatizer()
l = [lem.lemmatize(i, pos='v') for i in rel_tokens_nopunct]
def str2token(string_data):
    """Tokenize ``string_data`` into lower-cased word tokens.

    The string is split with ``word_tokenize``, every token is lower-cased,
    and tokens without any word character (pure punctuation) are dropped.

    Args:
        string_data: text to tokenize.

    Returns:
        list of lower-cased token strings, in their original order.
    """
    lowercased_tokens = (token.lower() for token in word_tokenize(string_data))
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    return [word for word in lowercased_tokens if re.search(r"\w", word)]
with untried nuclear technology in someone else's school. How exciting is that! I can't have been the only one to feel a warm glow at the thought of so much radioactivity at the very heart of the school. Who knows, by this time next year I could be the Two-head-master! (Finkelstein, D., you're on fire – as is the boiler room!) D.C. """ # Print the PrivateEye text. print(text) # Use NLTKs method to tokenize the text, then convert the text to lowercase and remove all the punctuation. text_tokens = word_tokenize(text) text_sent = sent_tokenize(text) text_tokens_nopunct = [ word.lower() for word in text_tokens if re.search("\w", word) ] # Use NLTKs POS tagger method to tag all the tokens. pos_tagged_tokens = nltk.pos_tag(text_tokens_nopunct) print("Training data and generating the token-tag table...\n") # Creates trainer data using the Brown corpus. trainer_data = brown.tagged_sents()[:10000] # Trains an HMM tagger using the trainer data, then uses it to tag all the tokens. hmm_trainer = nltk.hmm.HiddenMarkovModelTrainer() hmm_tagger = hmm_trainer.train_supervised(trainer_data) hmm_tagged_tokens = [] for s in text_sent: