import re
from pattern.en import ngrams
from pattern.vector import stem, LEMMA

def features(message):
    # `stop` is a stop-word list assumed to be defined at module level.
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words
    # Lemmatizing the single words
    singlegramsrefined = []
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined)
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Keep letters only (drops numbers)
    newmessage = re.sub(r'[^\w]', ' ', newmessage)     # Collapse remaining non-word characters
    singlegramsrefined2 = newmessage.split()
    bigrams = ngrams(newmessage, n=2)   # bigrams
    trigrams = ngrams(newmessage, n=3)  # trigrams
    # Tuple holding single words, bigrams and trigrams
    totalgrams = tuple(singlegramsrefined2 + bigrams + trigrams)
    return totalgrams
import re
from pattern.en import ngrams, parsetree
from pattern.vector import stem, LEMMA

def features(message):
    # List of NLTK stop words, extended with domain terms.
    stop = [u'i', 'diabetes', 'diabetic', 'type 2 diabetes', 'type 2', u'me', u'my',
            u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
            u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her',
            u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their',
            u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that',
            u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been',
            u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing',
            u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until',
            u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
            u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to',
            u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under',
            u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where',
            u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most',
            u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same',
            u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don',
            u'should', u'now', 'm']
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words
    # Lemmatizing the single words
    singlegramsrefined = []
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        if r not in stop:
            singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined)
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Keep letters only (drops numbers)
    newmessage = re.sub(r'[^\w]', ' ', newmessage)     # Collapse remaining non-word characters
    singlegrams = [i for i in newmessage.split() if len(i) > 1]
    singlegramsrefined2 = list(singlegrams)
    bigrams = ngrams(newmessage, n=2)   # bigrams
    trigrams = ngrams(newmessage, n=3)  # trigrams (computed but unused below)
    # Keep only lemmatized nouns as candidate unigrams.
    v = parsetree(newmessage, lemmata=True)[0]
    v = [w.lemma for w in v if w.tag.startswith('NN')]
    # Extra domain-specific words to exclude.
    stopping = stop + [u'hour', u'husband', u'anything', u'thing', u'way', u'n', u'number',
                       u'person', u'd', u'x', u'dose', u'drug', u'today', u'help',
                       u'everyone', u'bed', u'mine', u'issue', u'anyone', u'thank', u'test',
                       u'eat', u'something', u'doc', u'time', u'c', u'luck', u'lb', u'dr',
                       u'morning', 't', u'pill', u'upset', u'take', u'couple', u'month',
                       u'use', u'exercise', u'diet', u'lot', u'vision', 'taking', u've',
                       u'level', u'body', u'food', u'release', u'meal', u'glipizide',
                       u'week', 'type', u'yr', u'symptom', u'cause', u'tablet', u'blood',
                       u'feel', u'like', u'made', u'bad', u'work', u'still', u'got',
                       u'twice', u'i', u'mg', u'm', u'day', u'sugar', u'doctor', u'get',
                       u'year', u'side', u'went', u'med', u'one', u'better', u'effect',
                       u'problem', u'also']
    singlewords = [i for i in v if i not in stopping]
    # Bigrams, minus a hand-picked exclusion list.
    excluded = [(u'year', u'now'), (u'also', u'take'), (u'doesn', u't'), (u'take', u'food'),
                (u'taking', u'metformin'), (u'i', u'diagnosed'), (u'metformin', u'mg'),
                (u'empty', u'stomach'), (u'couldn', u't'), (u'blood', u'sugar'),
                (u'diet', u'exercise'), (u'mg', u'x'), (u'type', u'diabetes'),
                (u'side', u'effect'), (u'i', u'm'), (u'i', u've'), (u'twice', u'day'),
                (u'a', u'c'), (u'don', u't'), (u'slow', u'release'), (u't', u'take'),
                (u'good', u'luck'), (u'didn', u't'), (u'mg', u'twice'), (u'take', u'metformin'),
                (u'time', u'day'), (u'went', u'away'), (u'year', u'ago'), (u'much', u'better'),
                (u'extended', u'release'), (u'started', u'taking'), (u'can', u't'),
                (u'anyone', u'else'), (u'month', u'ago'), (u'mg', u'day')]
    bi = [r for r in bigrams if r not in excluded]
    totalgrams = singlewords + bi
    return totalgrams
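# --- A minimal usage sketch for features() above; the message is a made-up
# example, and the output is the surviving noun lemmas plus non-excluded bigrams:
msg = "i started taking metformin last month and my stomach is upset every morning"
print(features(msg))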
import re
from pattern.en import ngrams
from pattern.vector import stem, LEMMA

def featureExtractor(textMessage, countgrams):
    textMessage = textMessage.lower()
    # NLTK stop words to remove.
    stopWords = [u'i', 'm', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves',
                 u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him',
                 u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its',
                 u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what',
                 u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am',
                 u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has',
                 u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the',
                 u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while',
                 u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
                 u'into', u'through', u'during', u'before', u'after', u'above', u'below',
                 u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over',
                 u'under', u'again', u'further', u'then', u'once', u'here', u'there',
                 u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each',
                 u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor',
                 u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's',
                 u't', u'can', u'will', u'just', u'don', u'should', u'now']
    avoidList1 = ['diabetes', 'type 2', 'diabetic']
    avoidList = stopWords + avoidList1
    # Removing these stop words and general cleaning
    singleGrams = [i for i in textMessage.split() if i not in avoidList]
    # Lemmatizing the words for normalization
    singlegramsRefined = []
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined)
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Keep letters only (drops numbers)
    newMessage = re.sub(r'[^\w]', ' ', newMessage)     # Collapse remaining non-word characters
    singleGrams = newMessage.split()                   # Again splitting into unigrams
    singlegramsRefined2 = list(singleGrams)
    biGrams = ngrams(newMessage, n=2)    # Generating bigrams
    triGrams = ngrams(newMessage, n=3)   # Generating trigrams
    totalGramsrefined = []
    if countgrams == 1:
        totalGramsrefined = list(singlegramsRefined2)
    elif countgrams == 2:
        totalGramsrefined = singlegramsRefined2 + biGrams
    elif countgrams == 3:
        totalGramsrefined = singlegramsRefined2 + biGrams + triGrams
    return totalGramsrefined
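# --- Usage sketch for featureExtractor() with a hypothetical input: countgrams
# selects how far up the n-gram ladder to go (1 = unigrams only, 3 = up to trigrams).
print(featureExtractor("I have been feeling dizzy for 3 weeks", 2))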
def getGrams(self, results):
    # Accumulate a weight per unique n-gram (n = 1, 2, 3) across all texts.
    grams = {}
    for text, weight in results:
        uni = set(ngrams(text, n=1))
        bi = set(ngrams(text, n=2))
        tri = set(ngrams(text, n=3))
        for gram in uni:
            grams[gram] = grams.get(gram, 0) + weight
        for gram in bi:
            grams[gram] = grams.get(gram, 0) + weight
        for gram in tri:
            grams[gram] = grams.get(gram, 0) + weight
    return grams
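# --- Usage sketch, assuming an object `obj` exposing the method above
# (hypothetical data). Each unique 1/2/3-gram is credited once with its
# text's weight:
#   grams = obj.getGrams([("the cat sat", 2.0), ("the dog ran", 1.0)])
#   grams[("the",)] == 3.0   # ("the",) occurs in both texts: 2.0 + 1.0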
def test_ngrams(self):
    # Assert n-grams with and without punctuation marks / sentence marks.
    s = "The cat is napping."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=3, punctuation=en.PUNCTUATION.strip("."))
    self.assertEqual(v1, [("The", "cat"), ("cat", "is"), ("is", "napping")])
    self.assertEqual(v2, [("The", "cat", "is"), ("cat", "is", "napping"), ("is", "napping", ".")])
    s = "The cat purrs. The dog barks."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=2, continuous=True)
    self.assertEqual(v1, [("The", "cat"), ("cat", "purrs"), ("The", "dog"), ("dog", "barks")])
    self.assertEqual(v2, [("The", "cat"), ("cat", "purrs"), ("purrs", "The"), ("The", "dog"), ("dog", "barks")])
    print("pattern.en.ngrams()")
def intertextuality(texts=[], n=5, continuous=False, weight=lambda ngram: 1):
    """ Returns a dictionary of (i, j) => float.
        For indices i and j in the given list of texts, the corresponding float
        is the percentage of text i that is also in text j.
        Overlap is measured by matching n-grams (by default, 5 successive words).
        An optional weight function can be used to supply the weight of each n-gram.
    """
    map = {}  # n-gram => text id's
    sum = {}  # text id => sum of weight(n-gram)
    for i, txt in enumerate(texts):
        for j, ngram in enumerate(ngrams(txt, n, continuous=continuous)):
            if ngram not in map:
                map[ngram] = set()
            map[ngram].add(i)
            sum[i] = sum.get(i, 0) + weight(ngram)
    w = defaultdict(Weight)  # (id1, id2) => percentage of id1 that overlaps with id2
    for ngram in map:
        for i in map[ngram]:
            for j in map[ngram]:
                if i != j:
                    if (i, j) not in w:
                        w[i, j] = Weight(0.0)
                    w[i, j] += weight(ngram)
                    w[i, j].assessments.add(ngram)
    for i, j in w:
        w[i, j] /= float(sum[i])
        w[i, j] = min(w[i, j], Weight(1.0))
    return w
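# --- A minimal usage sketch for intertextuality(), with two made-up texts;
# the result maps ordered index pairs to the overlapping n-gram fraction:
texts = ["the quick brown fox jumps over the lazy dog",
         "the quick brown fox jumps over the fence again"]
w = intertextuality(texts, n=3)
print(w[0, 1])  # share of text 0's trigrams that also occur in text 1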
def qTypeDetect(question):
    q_ext = extractHelper(question)
    bi_q = ngrams(question, n=2)
    Wh = ('what', 'where', 'who', 'whose', 'which', 'when', 'how')
    How = ('how many', 'how tall', 'how much', 'how old', 'how far', 'how long', 'how often')
    Binary = ('be', 'have', 'may', 'can')
    # Check "how ..." questions first:
    for a, b in bi_q:
        if (a + ' ' + b).lower() in How:
            return 'HOW'
    # Find the root of the dependency parse first:
    q = nlp(q_ext.decode('ascii'))
    head = findRoot(q[0])
    for child in head.children:
        if child.lemma_ in Wh:
            return child
        for child2 in child.children:
            if child2.lemma_ in Wh:
                return child2
    if q[0] == head:
        return 'Binary'
    for child in head.children:
        tag = child.tag_
        pos = child.pos_
        if child.lemma_ == "be" or child.lemma_ == "do" or tag == "VBZ" or tag == "VBP" or tag == "MD":
            return "Binary"
        if pos == "NOUN" or tag == "VB" or tag == "VBN":
            return 'Complex'
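# --- Hypothetical call, assuming nlp = spacy.load("en_core_web_sm") and the
# helpers extractHelper()/findRoot() from the surrounding module are available:
#   print(qTypeDetect("How many moons does Jupiter have?"))  # -> 'HOW'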
def ngram_count(self):
    # Tally each n-gram of size self.total in self.text.
    grams = ngrams(self.text, n=self.total)
    for gram in grams:
        if gram in self.ngramcount:
            self.ngramcount[gram] += 1
        else:
            self.ngramcount[gram] = 1
def get_queries(self):
    text = self.text
    # Protect quotation marks so they survive n-gram tokenization.
    beg_quotes = re.findall(r'\"\S', text)
    for each in beg_quotes:
        text = text.replace(each, 'BEGQ' + each[-1])
    end_quotes = re.findall(r'\S\"', text)
    for each in end_quotes:
        text = text.replace(each, each[0] + 'ENDQ')
    text = re.sub('(ENDQ)+', 'ENDQ', text)
    text = re.sub('(BEGQ)+', 'BEGQ', text)
    text = text.replace('--', 'DOUBLEDASH')
    all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
    stop_words = stopwords.words('english')
    queries = []
    for ngram in all_ngrams:
        # Score each n-gram by its share of stop words and named entities.
        num_stop = len([w for w in ngram if w in stop_words])
        stop_score = float(num_stop) / len(ngram)
        chunked = ne_chunk(pos_tag(ngram))
        named_entities = [[w for w, t in elt] for elt in chunked if isinstance(elt, nltk.Tree)]
        num_ent = sum([len(ent_list) for ent_list in named_entities])
        ent_score = float(num_ent) / len(ngram)
        if stop_score < self.threshold and ent_score < self.threshold:
            r_string = self.reconstruct_ngram(ngram)
            if r_string in self.text:
                queries.append(r_string)
    # Integer step for evenly spaced sampling; guard against a zero step
    # when there are fewer queries than max_queries.
    reduction = len(queries) // self.max_queries
    if reduction == 0:
        reduction = 1
    return queries[0::reduction]
def test_OhanaBrendan_extract_pos_tagged_element():
    doc = Document('sample', document_sample_text, 0)
    tagged = parse(doc.raw_text, chunks=False)
    doc.unigrams = ngrams(tagged, n=1)
    classifier = OhanaBrendan([doc])
    jj_elements = classifier._extract_pos_tagged_element(doc, 'JJ')
    nns_elements = classifier._extract_pos_tagged_element(doc, 'NNS')
    assert len(jj_elements) == 1
    assert len(nns_elements) == 3
def get_queries(self):
    """Function to extract search queries from the text: breaks text into
    ngrams, filters ngrams that consist mostly of stopwords or named entities,
    selects an evenly spaced sample of the remaining ngrams"""
    text = self.text
    beg_quotes = re.findall(r'\"\S', text)
    for each in beg_quotes:
        text = text.replace(each, 'BEGQ' + each[-1])
    end_quotes = re.findall(r'\S\"', text)
    for each in end_quotes:
        text = text.replace(each, each[0] + 'ENDQ')
    text = re.sub('(ENDQ)+', 'ENDQ', text)
    text = re.sub('(BEGQ)+', 'BEGQ', text)
    text = text.replace('--', 'DOUBLEDASH')
    all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
    if self.language in stopwords.fileids():
        stop_words = stopwords.words(self.language)
    else:
        stop_words = []
    queries = [self.text]
    for ngram in all_ngrams:
        num_stop = len([w for w in ngram if w in stop_words])
        stop_score = float(num_stop) / len(ngram)
        if self.language == 'english':
            chunked = ne_chunk(pos_tag(ngram))
            named_entities = [[w for w, t in elt] for elt in chunked if isinstance(elt, nltk.Tree)]
            num_ent = sum([len(ent_list) for ent_list in named_entities])
            ent_score = float(num_ent) / len(ngram)
        else:
            ent_score = 0
        if stop_score < self.threshold and ent_score < self.threshold:
            r_string = self.reconstruct_ngram(ngram)
            if r_string in self.text:
                queries.append(r_string)
    reduction = len(queries) // self.max_queries  # integer step for slicing
    if reduction == 0:
        reduction = 1
    return queries[0::reduction]
def n_grams(self, s, n=2):
    '''
    Obtain n-grams
    In: (s:string, n:int) text string and size of the n-gram
    Out: (list) list of n-grams
    '''
    result = []
    ngrams_list = ngrams(s, n=n)
    for ngram in ngrams_list:
        ngram_joined = ''
        for word in ngram:
            ngram_joined += word + ' '
        result.append(ngram_joined.rstrip())
    return result
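# --- The inner loop above is equivalent to a single join; a sketch of the
# same result in one line:
#   [' '.join(ngram) for ngram in ngrams(s, n=n)]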
import sys
from collections import Counter

from pattern.en import ngrams
import videogrep

# Find the most common trigram in a video's subtitles,
# then supercut every occurrence of it.
videofile = sys.argv[1]
subtitlefile = videofile.replace('.mp4', '.srt')
lines = open(subtitlefile).read()
grams = ngrams(lines, n=3)
most_common = Counter(grams).most_common(1)
phrase = ' '.join(most_common[0][0])
print(phrase)
outputfile = videofile + '.most_common.mp4'
videogrep.videogrep([videofile], outputfile, phrase, 're')
print(pluralize('leaf'))
print(singularize('thieves'))

# ### Converting Adjectives to Comparative and Superlative Degrees

from pattern.en import comparative, superlative

print(comparative('good'))
print(superlative('good'))

# ### Finding N-Grams

from pattern.en import ngrams

print(ngrams("He goes to hospital", n=2))

# ### Finding Sentiments

from pattern.en import sentiment

print(sentiment("This is an excellent movie to watch. I really love it"))

# Explanation:
#
# - 0.75 is the sentiment score of the sentence, meaning it is highly positive
# - 0.8 is the subjectivity score, i.e. the degree to which the sentence is a
#   personal opinion of the user

# ### Checking if a Statement is a Fact

from pattern.en import parse, Sentence
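# The snippet ends at the fact-checking import. A minimal sketch of how this
# section typically continues, using pattern.en's modality(); the example
# sentence is my own. Values above roughly 0.5 suggest a factual statement:

from pattern.en import parse, Sentence, modality

s = Sentence(parse("Water boils at 100 degrees Celsius.", lemmata=True))
print(modality(s))  # close to 1.0: stated as a fact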
Possessives = [["i", "my", "me", "mine"], ["she", "her", "hers"], ["he", "him", "his"],
               ["you", "your", "yours"], ["it", "it's"], ["we", "us", "our", "ours"],
               ["they", "them", "their", "theirs"]]
Poss = ["i", "my", "me", "mine", "she", "her", "hers", "he", "him", "his",
        "you", "your", "yours", "it", "it's", "we", "us", "our", "ours",
        "they", "them", "their", "theirs"]
Articles = ["a", "an", "the"]
Prepositions = ["in", "on", "at"]

instr = input()
var = instr.lower()
# token = nltk.word_tokenize(var)
trigrams = ngrams(var, 3)

def query(v, i):
    # Query the PhraseFinder API for the top 10 matches of phrase v.
    encoded_query = urllib.parse.quote(v)
    params = {'corpus': 'eng-us', 'query': encoded_query, 'topk': 10, 'format': 'tsv'}
    params = '&'.join('{}={}'.format(name, value) for name, value in params.items())
    response = requests.get('https://api.phrasefinder.io/search?' + params)
    for tup in bigram:
        txt = " ".join(tup)
        m = search(txt, sent)
        if m:
            entity.append(txt)
    return entity

if __name__ == "__main__":
    s = "watch simpsons"
    if len(sys.argv) > 1:
        s = sys.argv[1]
    bigram = ngrams(s, n=2)
    sres = gsearch.search_google(s)
    cmd = ["last channel", "previous channel", "tune to", "turn to", "watch"]
    sres += cmd
    sent = ""
    for item in sres:
        sent += " - " + item
    res = nlp(bigram, sent)
    if len(res) > 0:
        print(res)
    else:
        onegram = ngrams(s, n=1)
        res = nlp(onegram, sent)
        print(res)
def ngrams(text, max_n=1, min_n=1):
    # Yield all n-grams from min_n to max_n as space-joined strings (Python 2).
    for i in xrange(min_n - 1, max_n):
        for n in en.ngrams(text, n=i + 1):
            yield ' '.join(n)
import re
from pattern.en import ngrams
from pattern.vector import stem, LEMMA

def featureExtractor(textMessage, countgrams):
    textMessage = textMessage.lower()
    # NLTK stop words to remove.
    stopWords = [u'i', 'm', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves',
                 u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him',
                 u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its',
                 u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what',
                 u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am',
                 u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has',
                 u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the',
                 u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while',
                 u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
                 u'into', u'through', u'during', u'before', u'after', u'above', u'below',
                 u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over',
                 u'under', u'again', u'further', u'then', u'once', u'here', u'there',
                 u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each',
                 u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor',
                 u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's',
                 u't', u'can', u'will', u'just', u'don', u'should', u'now']
    # Drug name variants to remove, including common misspellings.
    avoidList1 = ["actos", "pioglitazone hydrochloride", "pioglitazone", "glustin",
                  "glizone", "pioz", "zactos"]
    avoidList2 = ["medformin", "metfornin", "metforin", "glucophage", "metformin",
                  "glucophage xr", "metformin hydrochloride", "carbophage sr", "riomet",
                  "fortamet", "glumetza", "obimet", "gluformin", "dianben", "diabex",
                  "diaformin", "siofor", "metfogamma", "diformin", "metformi",
                  "metphormin", "metaforming", "metfirman", "metoformin", "metfomin"]
    avoidList3 = ["byetta", "bydureon", "exenatide"]
    avoidList4 = ["victosa", "victoza", "liraglutide", "saxenda"]
    avoidList5 = ["invokana", "canagliflozin"]
    avoidList6 = ["avandia", "rosiglitazone"]
    avoidList7 = ["insu", "humalog", "levimir", "novolog", "insuline", "insulin glargine",
                  "insulins", "lantus", "toujeo", "abasaglar", "basaglar", "insulin",
                  "levamir", "levemir"]
    avoidList8 = ["sitagliptin", "janumet", "januvia", "juvisync", "junuvia", "sitaglipton"]
    avoidList9 = ["amaryl", "glimepiride", "gleam", "k-glim-1", "glucoryl", "glimpid",
                  "glimy", "ameryl"]
    avoidList10 = ['diabetes', 'type 2', 'diabetic']
    avoidList = (stopWords + avoidList1 + avoidList2 + avoidList3 + avoidList4 +
                 avoidList5 + avoidList6 + avoidList7 + avoidList8 + avoidList9 +
                 avoidList10)
    # Removing these stop words and general cleaning
    singleGrams = [i for i in textMessage.split() if i not in avoidList]
    # Lemmatizing the words for normalization
    singlegramsRefined = []
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined)
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Keep letters only (drops numbers)
    newMessage = re.sub(r'[^\w]', ' ', newMessage)     # Collapse remaining non-word characters
    singleGrams = newMessage.split()                   # Again splitting into unigrams
    singlegramsRefined2 = list(singleGrams)
    biGrams = ngrams(newMessage, n=2)    # Generating bigrams
    triGrams = ngrams(newMessage, n=3)   # Generating trigrams
    listModelfeatures = modelFeatures()
    totalGramsrefined = []
    if countgrams == 1:
        totalGrams = singlegramsRefined2
        # Keep only those features of the text that also occur in the trained model.
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    elif countgrams == 2:
        totalGrams = singlegramsRefined2 + biGrams
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    elif countgrams == 3:
        totalGrams = singlegramsRefined2 + biGrams + triGrams
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    return totalGramsrefined
# In[47]:

# Open the chosen news articles and extract the main text
for selected_links in unique_links:
    results_url = selected_links
    results = requests.get(results_url)
    results_text = BeautifulSoup(results.text, "lxml")
    extract_text = results_text.find(class_='arti-flow')
    try:
        data = json.loads(results_text.find('script', type='application/ld+json').text)
    except:
        continue
    # final_text = extract_text.get_text()
    final_text = data['description']
    # Pre-processing the extracted text using the ngrams function from the pattern package
    final_text1 = ngrams(final_text, n=1,
                         punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_",
                         continuous=False)
    print(final_text1)
# pip install pattern
from pattern.en import parse
from pattern.en import pprint

parse('Hello Everyone and Welcome to Analytics India Magazine')
# The parse function labels each word in the sentence as a noun, verb, subject or
# object. We can also use the pprint function defined in the pattern library to
# display the parsed sentence in a clear manner.
pprint(parse('Hello Everyone and Welcome to Analytics India Magazine',
             relations=True, tokenize=True, lemmata=True))

#%% ngrams
# An n-gram is a combination of "n" consecutive words in a sentence.
from pattern.en import ngrams
print(ngrams("Hello Everyone and Welcome to Analytics India Magazine", n=3))
print(ngrams("He goes to hospital", n=2))

#%% sentiment
# Sentiment refers to an opinion or feeling towards a certain thing. The sentiment
# object is used to find the polarity (positivity or negativity) of a text along
# with its subjectivity.
from pattern.en import sentiment
print(sentiment("He is a good boy but sometimes he behaves miserably"))
print(sentiment("he has done extremely well"))
print(sentiment("This is an excellent movie to watch. I really love it"))
# The sentence "This is an excellent movie to watch. I really love it" has a
# sentiment of 0.75, which shows that it is highly positive. Similarly, the
# subjectivity of 0.8 refers to the fact that the sentence is a personal opinion
# of the user.

#%% Modality
# Checking if a statement is a fact: the modality function returns a value
# between -1 and 1. For facts, the modality function returns a value greater
# than 0.5.
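# The cell breaks off before showing modality in action. A minimal sketch
# contrasting a factual and a hedged sentence (example sentences are my own):
from pattern.en import parse, Sentence, modality

fact = Sentence(parse("The sun rises in the east.", lemmata=True))
guess = Sentence(parse("I think it might rain tomorrow.", lemmata=True))
print(modality(fact))   # high value: stated as a fact
print(modality(guess))  # low value: uncertain, hedged statement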
k = count(s, stemmer=LEMMA, exclude=['.', ','])
f3.write('\n'.join("{!s}={!r}".format(key, val) for (key, val) in k.items()))
exclude = [' ', '/', '.', ',', ';', ':', '!', '?', '(', ')', '[', ']', '{', '}',
           '\'', '`', '"', '@', '#', '$', '*', '+', '-', '|', '=', '~', '_', '...']
'''

dict = {}
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
for article in articles:
    text = TextBlob(article)
    for sentence in text.sentences:
        s = Sentence(parse(''.join(sentence), lemmata=True))
        for i in range(1, len(s.words)):
            # n-grams over the lemmatized sentence, with stop words removed
            k = ngrams(' '.join([word for word in s.lemmata if word not in cachedStopWords]),
                       n=i, punctuation=".,;:!?()[]{}`'\"@#$^&*+-|=~_", continuous=False)
            if i not in dict:
                dict[i] = {}
            for kgram in k:
                if i > 1:
                    key = '#'.join(unkgram for unkgram in kgram)
                else:
                    key = kgram[0]
                if key in dict[i]:
                    dict[i][key] += 1
                else:
                    dict[i][key] = 1
for dgram in dict:
    html.make_graph(dict[dgram], dgram, 1000)
    for key in dict[dgram]:
print (PAST, 1, PL) in tenses('purred')  # rule-based conjugation
print 'google' in verbs.infinitives
print 'googled' in verbs.inflections
print conjugate('googled', tense=PARTICIPLE, parse=False)
print conjugate('googled', tense=PARTICIPLE, parse=True)

# quantification
print number("seventy-five point two")  # "seventy-five point two" => 75.2
print numerals(2.245, round=2)          # 2.245 => "two point twenty-five"
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify({'carrot': 100, 'parrot': 20})
print quantify('carrot', amount=1000)

# spelling
print suggest("parot")

# n-grams
print ngrams("I am eating pizza.", n=2)  # bigrams
print ngrams("I am eating pizza.", n=3, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", continuous=False)

# parser
print parse('I eat pizza with a fork.',
    tokenize=True,     # Split punctuation marks from words?
    tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,       # Parse chunks? (NP, VP, PNP, ...)
    relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,     # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
    tagset=None)       # Penn Treebank II (default) or UNIVERSAL.

# parser tagger and tokenizer
import sys
from pattern.en import ngrams

# Count n-grams of a given size (first command-line argument) read from stdin.
text = sys.stdin.read()
total = int(sys.argv[1])
grams = ngrams(text, n=total)
ngramcount = {}
for gram in grams:
    if gram in ngramcount:
        ngramcount[gram] += 1
    else:
        ngramcount[gram] = 1
for gram in sorted(ngramcount, key=ngramcount.get, reverse=True):
    count = ngramcount[gram]
    if count > 10 and all(len(x) > 0 for x in gram):
        print(str(count) + ': ' + ' '.join(gram))
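# --- Usage sketch, assuming the script above is saved as ngram_count.py
# (the file name is mine):
#   python ngram_count.py 3 < corpus.txt
# prints "count: n-gram" lines for trigrams occurring more than 10 times.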
def ngrams(text, max_n=1, min_n=1, strip_punctuation=True):
    # Yield all n-grams from min_n to max_n as space-joined strings.
    pattern_args = {} if strip_punctuation else {'punctuation': ''}
    for i in range(min_n - 1, max_n):
        for n in en.ngrams(text, n=i + 1, **pattern_args):
            yield ' '.join(n)
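# --- A quick usage sketch for the wrapper above: unigrams first, then bigrams,
# each joined into a plain string:
#   list(ngrams("the cat sat", max_n=2))
#   -> ['the', 'cat', 'sat', 'the cat', 'cat sat']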
# Unigrams
for words in train_tokens[0:]:
    for word in words:
        if word in unigrams:
            unigrams[word] += 1
        else:
            unigrams[word] = 1

# Unigram probabilities
for unigram in unigrams:
    unigramsProb[unigram] = float(unigrams[unigram])

# Bigrams
for word in train_tokens[0:]:
    x = ngrams(" ".join(word), 2)
    for bg in x:
        cfd[bg[0]][bg[1]] += 1
cfd = ConditionalFreqDist((bg[0], bg[1])
                          for bg in list(chain(*[bigrams(i) for i in train_tokens])))
if idx >= 90:
    for bgidx, bg in enumerate(list(chain(*[bigrams(i) for i in train_tokens]))):
        prob = cfd[bg[0]].freq(bg[1])
        prob = 0.0001 if not prob else prob  # floor to avoid zero probabilities
        bigramsProb[bg] = prob
if idx >= 90:
    test_tokens.append(tokens)
if idx >= 190:
    for bgidx, bg in enumerate(list(chain(*[bigrams(i) for i in test_tokens]))):
        prob = cfd[bg[0]].freq(bg[1])
        prob = 0.0001 if not prob else prob
        bigramsProb[bg] = prob
print(conjugate('googled', tense=PARTICIPLE, parse=True))

from pattern.en import quantify
print(quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']))
print(quantify({'carrot': 100, 'parrot': 20}))
print(quantify('carrot', amount=1000))

from pattern.en import ngrams
print(ngrams("I am eating pizza.", n=2))  # bigrams

from pattern.en import parse
print(parse('I ate pizza.').split())

from pattern.en import parsetree
s = parsetree('The cat sat on the mat.', relations=True, lemmata=True)
print(repr(s))
for sentence in s:
    for chunk in sentence.chunks:
        print(chunk.type, [(w.string, w.type) for w in chunk.words])

from pattern.en import sentiment
# print
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred')

print 'Quantification'
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

# parse
s = parse('I eat pizza with a fork.')
pprint(s)

# tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

# The modality() function returns a value between -1.0 and +1.0, expressing the degree of certainty
    return preprocess_fields_v2.isValidPhrase(tokens) and len(tokens) == otc

if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf8')
    parser = optparse.OptionParser()
    parser.add_option("-c", "--count", dest="doNgrams", action="store_true",
                      default=False, help="collect ngrams")
    parser.add_option("-a", "--acronyms", dest="doAcronyms", action="store_true",
                      default=False, help="collect acronym/expansion pairs")
    parser.add_option("-k", "--nMin", dest="nMin", type="int",
                      help="min value of n (ngram or acronym size in tokens)")
    parser.add_option("-n", "--nMax", dest="nMax", type="int",
                      help="max value of n (ngram or acronym size in tokens)")
    parser.add_option("-s", "--src", dest="srcFileName", help="source file")
    (options, args) = parser.parse_args()
    nMin = options.nMin if options.nMin else 1
    nMax = options.nMax if options.nMax else 3
    fileName = options.srcFileName
    for line in preprocess_fields_v2.fileToList(fileName):
        if options.doNgrams:
            for k in range(nMin, nMax + 1):
                for ngram in ngrams(line, k):
                    phrase = ' '.join(ngram)
                    nvp = preprocess_fields_v2.normalizeAndValidatePhrase(
                        phrase, phraseValidator=partial(noStopWordValidator, otc=k))
                    if nvp is not None:
                        print phrase
        elif options.doAcronyms:
            for (acro, tokens) in acronymizePhrase(line, nMin, nMax):
                print '|'.join([acro, ' '.join(tokens)])
from sys import argv
from collections import Counter

from pattern.en import ngrams
from videogrep import videogrep

# Supercut the most common trigram found in a video's subtitle file.
filename = argv[1]
srt_name = filename.replace('.mp4', '.srt')
lines = open(srt_name).read()
grams = ngrams(lines, n=3)
most_common = Counter(grams).most_common(10)
search_phrase = ' '.join(most_common[0][0])
# for phrase in most_common:
#     print ' '.join(phrase[0])
videogrep([filename], filename + '.most_common.mp4', search_phrase, 're')
Module that handles background work such as dividing data into training,
cross-validation, and test sets
'''
import time
import matplotlib.pyplot as plt
from pattern.en import ngrams
from pattern.en import lemma

start = time.clock()
# text = 'Hi my name is be Jason'
textFile = open('text.txt', 'r')
text = ''
for line in textFile:
    text += line
words = ngrams(text, n=1)
words = [str(lemma(word[0])) for word in words]
length = len(words)
words.sort()
wordFreq = dict()
count = 0
'''
for i in range(len(words)):
    if i == len(words) - 1:
        continue
    elif words[i] != words[i + 1]:
        wordFreq[words[i]] = count + 1
        count = 0
    else:
        count += 1
'''