def sentipol(post_seg):
    '''Emotion polarity (情緒極性) determiner.'''
    # pos_lst and neg_lst are module-level lists of positive/negative sentiment words.
    pos_con, neg_con = [], []
    pos_cnt, neg_cnt = 0, 0
    for word in post_seg:
        if word in pos_lst:
            pos_cnt += 1
            pos_con.append(word)
        elif word in neg_lst:
            neg_cnt += 1
            neg_con.append(word)
    diff = pos_cnt - neg_cnt
    # count() returns a {word: frequency} dict; sort by frequency, highest first.
    pos_fin = count(pos_con).items()
    neg_fin = count(neg_con).items()
    pos_fin = sorted(pos_fin, key=lambda x: x[-1], reverse=True)
    neg_fin = sorted(neg_fin, key=lambda x: x[-1], reverse=True)
    # Wrap every sentiment word in the re-joined text with an HTML span for highlighting.
    postmod = ' '.join(post_seg)
    for w, n in pos_fin:
        postmod = postmod.replace(w, '<span class="positive">%s</span>' % w)
    for w, n in neg_fin:
        postmod = postmod.replace(w, '<span class="negative">%s</span>' % w)
    return {'content': postmod, 'diff': diff, 'pos': pos_fin, 'neg': neg_fin}

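# A minimal usage sketch for sentipol(), assuming it lives in a module that imports
# count from pattern.vector and defines pos_lst/neg_lst; the lexicons and the
# pre-segmented post below are hypothetical.
pos_lst = ['love', 'great', 'happy']      # hypothetical positive lexicon
neg_lst = ['hate', 'terrible', 'sad']     # hypothetical negative lexicon

post_seg = ['i', 'love', 'this', 'phone', 'but', 'hate', 'the', 'battery']
result = sentipol(post_seg)
print(result['diff'])     # 1 - 1 = 0: one positive word, one negative word
print(result['content'])  # tokens re-joined with <span> markup around sentiment words
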
def roots_and_lemmas():
    print(stem('cars', PORTER))     # root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # root
    print(stem('studies', LEMMA))
    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))

def instance(review):                            # e.g. "Great book!"
    v = tag(review)                              # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    v = [word for (word, pos) in v
         if pos in ("JJ", "RB") or word == "!"]  # keep adjectives, adverbs and exclamation marks
    v = [predicative(word) for word in v]        # ["great", "!"]
    v = count(v)                                 # {"great": 1, "!": 1}
    return v

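# A minimal sketch of feeding instance() vectors into a pattern.vector classifier;
# the training reviews and labels below are hypothetical, not from the original source.
from pattern.vector import NB

train_data = [("Great book!", "positive"),
              ("Really boring story.", "negative"),
              ("Absolutely wonderful read!", "positive")]

nb = NB()
for review, label in train_data:
    nb.train(instance(review), type=label)

print(nb.classify(instance("Wonderful book!")))
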
def get_keywords(self, comment_history):
    comments = [str(x) for x in comment_history]
    # Count word frequencies over the whole comment history, then sort by frequency.
    keywords = count(words(' '.join(comments)))
    sorted_keywords = sorted(keywords.iteritems(),
                             key=operator.itemgetter(1), reverse=True)
    return sorted_keywords

def run(self, minePackage):
    clouds = minePackage['clouds']
    urlContent = UrlToPlainText()
    for cloud in clouds:
        for n in cloud.graph.nodes():  # iterate over the cloud's list of links
            # print cloud.graph.node[n]['link']
            pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
            cloud.graph.node[n]['methodData'] = MethodData(
                count(words(Sentence(parse(pageContent))), stemmer=PORTER))

def __init__(self , data, url=""): if url != "": urlContent = UrlToPlainText() self.contenidoConEtiquetas = urlContent.plainTextConverter(url,"mantenerEtiquetas") self.contenido = plaintext(self.contenidoConEtiquetas,keep={}) else: self.contenido = "" self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def trigram_text_feature(text):
    """ Returns a frequency distribution {trigram: count} of the word trigrams in <text>. """
    wl = nltk.word_tokenize(text.lower())
    trigrams = list(nltk.util.trigrams(wl))
    features = patvec.count(trigrams)
    return features

def test_count(self):
    # Assert wordcount with stemming, stopwords and pruning.
    w = ["The", "cats", "sat", "on", "the", "mat", "."]
    v1 = vector.count(w)
    v2 = vector.count(w, stemmer=vector.LEMMA)
    v3 = vector.count(w, exclude=["."])
    v4 = vector.count(w, stopwords=True)
    v5 = vector.count(w, stopwords=True, top=3)
    v6 = vector.count(w, stopwords=True, top=3, threshold=1)
    v7 = vector.count(w, dict=vector.readonlydict, cached=False)
    self.assertEqual(v1, {"cats": 1, "sat": 1, "mat": 1, ".": 1})
    self.assertEqual(v2, {"cat": 1, "sat": 1, "mat": 1, ".": 1})
    self.assertEqual(v3, {"cats": 1, "sat": 1, "mat": 1})
    self.assertEqual(v4, {"the": 2, "cats": 1, "sat": 1, "on": 1, "mat": 1, ".": 1})
    self.assertEqual(v5, {"the": 2, "cats": 1, ".": 1})
    self.assertEqual(v6, {"the": 2})
    # Assert custom dict class.
    self.assertTrue(isinstance(v7, vector.readonlydict))
    print "pattern.vector.count()"

def pos_feature_builder(text, target_pos=('JJ', 'NN', 'VB', '!', 'NP', 'RB', 'CD')):
    """ Builds features from the target part-of-speech tags specified by <target_pos>. """
    if not text:
        return patvec.count([])
    try:
        parsed_text = paten.parsetree(text, lemmata=True)[0]
        selected = [word.lemma for word in parsed_text
                    if word.tag.startswith(target_pos)]
    except IndexError as e:
        print text, e
        selected = []
    result = patvec.count(selected)
    return result

def vector(self, name):
    """ Returns a dictionary with character bigrams and suffix.
        For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
    """
    v = chngrams(name, n=2)
    v = count(v)
    v[name[-2:] + "$"] = 1  # mark the final two characters as a suffix feature
    v[len(name)] = 1        # name length as an extra feature
    return v

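# A minimal sketch of how a featurization like vector() above could drive a k-NN
# classifier, e.g. guessing gender from a name; the free function, the training names
# and the labels below are hypothetical.
from pattern.vector import KNN, count, chngrams

def name_vector(name):
    # Same character-bigram + suffix + length features as the method above.
    v = count(chngrams(name, n=2))
    v[name[-2:] + "$"] = 1
    v[len(name)] = 1
    return v

knn = KNN()
for name, label in [("Felix", "m"), ("Oscar", "m"), ("Maria", "f"), ("Sofia", "f")]:
    knn.train(name_vector(name), type=label)

print(knn.classify(name_vector("Elena")))
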
def start(self):
    cloudSize = dameCloudSize(self.id_request)
    cloudSize = cloudSize[0][0]
    searchKey = dameSerchKey(self.id_request)
    searchKey = searchKey[0][0]
    step = 0
    while step <= 5:  # later, make get_stop configurable; this sets the number of levels to expand
        for id_cloud in dameIdCloud(self.id_request):  # gets the IDs of the clouds that belong to the project
            print "Id Cloud: " + str(id_cloud[0])
            cloud = self.generar_cloud(dameNodo(id_cloud[0]))
            true_nodes = self.trueNodesSelection(cloud)
            for n in true_nodes:
                try:
                    cloud.graph.node[n]['select'] = False
                    crawler = SimpleCrawler1(n, delay=0.1)
                    crawler.newStructure(cloud.graph)
                    time = 0
                except:
                    continue
                while len(crawler.visited) < cloudSize:
                    print "Cloudsize = " + str(cloudSize) + " Crawler Visited = " + str(len(crawler.visited)) + " Level = " + str(step)
                    print 'Exploring ...'
                    crawler.crawl(method=None)
                    time += 1
                    if time > cloudSize * 10:
                        break
                actualizarSelect(cloud.graph.node[n]['ID'], cloud.graph.node[n]['select'])
            print
            print '##### Generating documents #####'
            # Build the minePackage.
            clouds = list()
            clouds.append(cloud)
            minePackage = dict()
            minePackage['clouds'] = clouds
            minePackage['searchKey'] = searchKey
            minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
            self.IRController.start(minePackage)  # retrieves information
            # TODO: the scraper controller is still missing.
            # Set to None so they do not take up unnecessary space; everything was already saved to the DB.
            minePackage = None
            cloud = None
            gc.collect()
        step += 1
        print "Exploring level no.: " + str(step)  # controls the levels to expand
    print "Process finished"

def selectWords(review):
    '''
    Takes a review and selects its nouns, adjectives, verbs and exclamation marks.
    '''
    review = parsetree(review, lemmata=True)[0]  # lemmatize the review
    # Select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks.
    review = [w.lemma for w in review if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    review = count(review)  # a dictionary of (word, count)-items
    return review

def run(self, minePackage):
    clouds = minePackage['clouds']
    urlContent = UrlToPlainText()
    for cloud in clouds:
        for n in cloud.graph.nodes():  # iterate over the cloud's list of links
            print cloud.graph.node[n]['link']
            pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
            cloud.graph.node[n]['methodData'] = MethodData(
                count(words(Sentence(parse(pageContent))), stemmer=PORTER))

def train(cls, train_file, model_file):
    sents_dic = (json.loads(jsonl) for jsonl in SoftSkills.load(train_file))
    model = KNN()
    for sent in sents_dic:
        text = sent['text']
        v = count([word for word, pos in tag(text)])  # e.g. {'sweet': 1}
        if v:
            model.train(v, type=sent['soft skill'])
    model.save(model_file)
    return model

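# A sketch of using the model saved by train() above, assuming pattern.vector's
# Classifier.load() classmethod (inherited by KNN) and a hypothetical model path;
# the example sentence is made up.
from pattern.vector import KNN, count
from pattern.en import tag

model = KNN.load('soft_skills.knn')  # hypothetical path written by train()
sentence = "She communicates clearly and leads the team well."
v = count([word for word, pos in tag(sentence)])
print(model.classify(v))
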
def __init__(self, data, url="", contenidoBd=""): if url != "": urlContent = UrlToPlainText() self.contenidoConEtiquetas = urlContent.plainTextConverter( url, "mantenerEtiquetas") self.contenido = plaintext(self.contenidoConEtiquetas, keep={}) else: if (contenidoBd != ""): self.contenidoConEtiquetas = contenidoBd self.contenido = plaintext(self.contenidoConEtiquetas, keep={}) else: self.contenido = "" self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def tokenizer(self, url):
    # text = 'The black cat was spying on the white cat.'
    # stemmer=None, stemmer=LEMMA, stemmer=PORTER
    # print count(words(pageContent), stemmer=PORTER)
    # print count(words(pageContent), stemmer=LEMMA)
    # url_content = UrlToplainTextConverter()
    # page_content = url_content.plainTextConverter(url)
    page_content = url  # the caller already passes plain text, not a URL
    s = Sentence(parse(page_content))
    tokenized_file = count(words(s), stemmer=PORTER)
    print
    print tokenized_file
    print

def count_one_artist(name, bad_words):
    # Crude way to get the number of songs for this artist (count the files in its
    # directory), so callers can average the word counts per song.
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))
    word_counts = {}  # renamed from `dict` to avoid shadowing the builtin
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            word_counts[w] = docs[w]
    word_counts['num_songs'] = num_songs  # cheap way to carry the song count along
    return word_counts

def postag_feature_builder(text, target_pos=('JJ', 'NN', 'VB', 'NP', 'RB', 'CD')):
    """ Faster version of the tag feature builder: uses paten.tag instead of paten.parsetree. """
    if not text:
        return {}
    # Tag each word and keep only the target parts of speech.
    try:
        result = patvec.count(
            word for word, tag in paten.tag(text, tokenize=True, encoding='utf-8')
            if tag in target_pos)
    except IndexError as e:
        print text, e
        result = {}
    return result

def test_count(self):
    # Assert wordcount with stemming, stopwords and pruning.
    w = ["The", "cats", "sat", "on", "the", "mat", "."]
    v1 = vector.count(w)
    v2 = vector.count(w, stemmer=vector.LEMMA)
    v3 = vector.count(w, exclude=["."])
    v4 = vector.count(w, stopwords=True)
    v5 = vector.count(w, stopwords=True, top=3)
    v6 = vector.count(w, stopwords=True, top=3, threshold=1)
    v7 = vector.count(w, dict=vector.readonlydict, cached=False)
    self.assertEqual(v1, {"cats": 1, "sat": 1, "mat": 1, ".": 1})
    self.assertEqual(v2, {"cat": 1, "sat": 1, "mat": 1, ".": 1})
    self.assertEqual(v3, {"cats": 1, "sat": 1, "mat": 1})
    self.assertEqual(v4, {"the": 2, "cats": 1, "sat": 1, "on": 1, "mat": 1, ".": 1})
    self.assertEqual(v5, {"the": 2, "cats": 1, ".": 1})
    self.assertEqual(v6, {"the": 2})
    # Assert custom dict class.
    self.assertTrue(isinstance(v7, vector.readonlydict))
    print("pattern.vector.count()")

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 19:37:34 2019

@author: alternatif
"""

from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = 'WIN' if '#win' in s else 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))

def v(review1):
    v3 = parsetree(review1, lemmata=True)[0]
    v4 = [w.lemma for w in v3 if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    v5 = count(v4)
    return v5

def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))

def makeSameTense(self, w1, w2):
    # Count how often each tense appears among the possible readings of w2,
    # pick the most frequent one, and conjugate w1 into that tense.
    tense = count([i[0] for i in tenses(w2)], stopwords=True)
    tense = sorted(tense.items(), key=operator.itemgetter(1), reverse=True)
    return verbs.conjugate(w1, tense[0][0])

from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from pattern.vector import words, count, stem, PORTER, LEMMA
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    # words() returns a list of words by splitting the string on spaces.
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`' '\"@#$^&*+-|=~_')

# count() takes a list of words and returns a dictionary of (word, count)-items.
freq_dic = count(
    words=words_list,
    top=None,          # filter words not in the top most frequent (int)
    threshold=0,       # filter words whose count <= threshold
    stemmer=None,      # PORTER | LEMMA | function | None
    exclude=[],        # filter words in the exclude list
    stopwords=False,   # include stop words?
    language='en')     # en, es, de, fr, it, nl

for k, v in freq_dic.iteritems():
    print k, v

# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)

s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
print count(words(s), stemmer=LEMMA)

s = 'The black cat was spying on the white cat.'
s = Sentence(parse(s))
print count(s, stemmer=LEMMA)

# character n-grams

def countWords(self):
    wordDict = count(words(plaintext(self.content),
                           filter=lambda w: w.strip("'").isalpha()))
    return Counter(wordDict)

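# Because countWords() returns a collections.Counter, callers can ask for the most
# frequent terms directly; `page` below is a hypothetical instance of the class above.
word_counts = page.countWords()
print(word_counts.most_common(10))  # the ten most frequent words with their counts
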
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 19 19:45:49 2014

@author: scut1
"""

from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = 'WIN' if '#win' in s else 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print knn.classify('sweet potato burger')
print knn.classify('stupid autocorrect')

# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.

# pattern.vector
# The pattern.vector module is a toolkit for machine learning, based on a vector space model of bag-of-words
# documents with weighted features (e.g., tf-idf) and distance metrics (e.g., cosine similarity, infogain).
# Models can be used for clustering (k-means, hierarchical), classification (Naive Bayes, Perceptron, k-NN, SVM)
# and latent semantic analysis (LSA).

from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 10):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = 'WIN' if '#win' in s else 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)
        if v:
            knn.train(v, type=p)

print knn.classify('sweet potato burger')
print knn.classify('stupid autocorrect')

# Displays:
# 'WIN'
# 'FAIL'

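# The comment block above mentions tf-idf weighting and cosine similarity; below is a
# minimal sketch with pattern.vector's Document and Model classes, assuming
# Model.similarity() returns cosine similarity; the toy documents are made up.
from pattern.vector import Document, Model, TFIDF

d1 = Document("The cat sat on the mat.", name="cat1")
d2 = Document("A cat chased a mouse across the mat.", name="cat2")
d3 = Document("Stock markets rallied sharply today.", name="finance")

m = Model([d1, d2, d3], weight=TFIDF)
print(m.similarity(d1, d2))  # expected to be higher: both documents are about cats
print(m.similarity(d1, d3))
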
def processor(self, minePackage):
    print '####SEARCH_KEY:', minePackage['searchKey']
    var = minePackage['searchKey']
    s = Sentence(parse(var))
    return count(words(s), stemmer=PORTER)  # returns a {word: count} dictionary

def unigram_text_feature(text):
    """ Returns a frequency distribution {term: count} of each word in <text>. """
    features = patvec.count(nltk.word_tokenize(text.lower()))
    return features

def processor(self, minePackage):
    # print '####SEARCH_KEY:', minePackage['searchKey']
    s = Sentence(parse(minePackage['searchKey']))
    minePackage['searchKey'] = count(words(s), stemmer=PORTER)
    return minePackage['searchKey']

def v(s):
    """ Returns a bag-of-words vector for the given string. """
    v = {}
    v.update(count(words(s)))
    return v

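# Bag-of-words dictionaries like the one returned by v() can be compared with
# pattern.vector's distance(), which defaults to cosine distance; the sentences below
# are made up, and the counts are wrapped in Vector as in other snippets in this collection.
from pattern.vector import words, count, distance, Vector

v1 = Vector(count(words("the black cat sat on the mat")))
v2 = Vector(count(words("a black cat sleeps on a mat")))
print(1 - distance(v1, v2))  # cosine similarity in [0, 1]
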
def get_results(query, quantity, force=False, news=False, analysis=True):
    query = query.lower()
    start = datetime.now()
    query = query.replace('_', '%20')
    breakdown = 50
    if breakdown > quantity:
        breakdown = quantity
    data_to_be_written = []
    knowledgeKeywords = []
    duplicates = []
    results, created = webSearch.objects.get_or_create(queryText=query.strip())
    if created or force or len(results.results.all()) < quantity:
        all_results = getGoogleResults(query, quantity, news, force)
    else:
        all_results = []
    if len(all_results) == 0 and not created:
        all_results = [r.url for r in results.results.all()]
    all_results = all_results[:quantity]
    print "TOTAL RESULTS ", str(len(all_results))
    # Done with getting search results.
    for index, i in enumerate(all_results):
        try:
            wr, created = WebResource.objects.get_or_create(url=i)
            if created:
                wr = parseURL(i, True)
            data = {'url': i}
            keywords = [w for w in count(wr.text, top=10, stemmer=LEMMA) if w not in stop]
            if 'books.google' in i:
                text = ''
            else:
                text = wr.text
            data.update({
                'keywords': keywords,
                'text': plaintext(text),
                'title': wr.title,
                'urls': wr.urls,
                'type': 'result',
                'index': index + 1,
                'similar': [],
                'duplicates': [],
                'category': 0,
            })
            if wr not in results.results.all():
                results.results.add(wr)
            data['plaintext'] = data['text'].split('\n')
            # while '' in data['plaintext']:
            #     data['plaintext'].remove('')
            # knowledgeKeywords.extend(data['keywords'])
            data_to_be_written.append(data)
        except Exception as e:
            print e
    print "Response Result model Prepared"
    if not analysis:
        return data_to_be_written
    list_of_sim_docs, model, m = find_similarity(data_to_be_written)
    for i in list_of_sim_docs:
        similar = {
            'type': 'similar',
            's': i.get('source'),
            'd': i.get('dest'),
            'source': i.get('source'),
            'dest': i.get('dest'),
            'score': i.get('score'),
        }
        data_to_be_written.append(similar)
        if similar['score'] > 0.9:
            for res in data_to_be_written:
                if res['type'] in ['result', 'duplicate'] and res['url'] == i.get('dest') and len(res['text']) > 0:
                    print "Duplicate [{0}].[{1}]".format(i['source'][:20], i['dest'][:20])
                    res['type'] = 'duplicate'
    items = [Document(i.get('text'), name=i.get('url'), description=i.get('index'), stemmer=LEMMA)
             for i in data_to_be_written]
    m = Model(items, weight=TFIDF)
    # k = 10

    ####### BEGIN Experimental Setup ##########
    # v, d = m.features, m.documents
    # y, x = len(m.documents), len(m.features)

    def build_matrix(w=None, d=None):
        # Binary term-document matrix: rows are documents, columns are features.
        y, x = len(d), len(w)
        model = np.zeros((y, x))
        for i in range(y):
            model[i] = [1 if w[j] in d[i].words else 0 for j in range(x)]
        return model

    # def find_word_matches(model, words=None, d=None):
    #     y, x = model.shape
    #     for i in range(y):
    #         for j in range(i + 1, y):
    #             a = np.copy(model[i])
    #             b = np.copy(model[j])
    #             a_ones = np.count_nonzero(a)
    #             b_ones = np.count_nonzero(b)
    #             comparison = (a == b)
    #             cross_product = a * b
    #             intersection = np.count_nonzero(cross_product)
    #             union = a_ones + b_ones - intersection
    #             if a_ones + b_ones > 0 and intersection > 0:
    #                 score = intersection / union
    #             else:
    #                 score = 0
    #             if model[i].any() and model[j].any() and comparison.any() and score > 0.4:
    #                 print "Match [{0}] {1}:[{2} words] - [{3}] {4}:[{5} words] : {6} words".format(d[i].description, d[i].name[:30], np.count_nonzero(a), d[j].description, d[j].name[:30], np.count_nonzero(b), score, math.fabs(d[i].description - d[j].description))
    #                 similar = {
    #                     'type': 'similar',
    #                     'source': d[i].name,
    #                     'dest': d[j].name,
    #                     'score': score,
    #                 }
    #                 data_to_be_written.append(similar)
    #                 if score >= 0.9:
    #                     for res in data_to_be_written:
    #                         if res['type'] in ['result', 'duplicate'] and res['url'] == d[j].name and len(res['text']) > 0:
    #                             print "Duplicate [{0}].[{1}]".format(i + 1, j + 1)
    #                             res['type'] = 'duplicate'
    #     return model

    def word_frequency(model, words=None, documents=None, threshold1=0, threshold2=1, transpose=False):
        "Returns the words whose document frequency falls within the given thresholds."
        y, x = model.shape
        data = {}
        for i in range(x):
            freq = np.count_nonzero(model[:, i]) / float(y)  # fraction of documents containing word i
            if freq >= threshold1 and freq <= threshold2:
                if words:
                    data[words[i]] = freq
                else:
                    data[i] = freq
        return data

    model = build_matrix(m.features, m.documents)
    # model = find_word_matches(model, m.features, m.documents)
    knowledgeKeywords = [w for w in word_frequency(model, m.features, m.documents, 0.2, 0.8)][:20]
    ####### END Experimental Setup ##########

    # c = m.cluster(method=HIERARCHICAL, k=k)
    # for i in c:
    #     cluster = []
    #     k = []
    #     contains_text = False
    #     for item in i:
    #         for data in data_to_be_written:
    #             if data.get('type') == 'result' and data.get('url') == item.name:
    #                 cluster.append({
    #                     'url': data.get('url'),
    #                     'index': item.description,
    #                 })
    #                 if data.get('text'):
    #                     k.extend([w for w in count(words(data.get('text')), top=50, stemmer=PORTER, exclude=[], stopwords=False, language='en')])
    #                     contains_text = True
    #     cluster = {
    #         'type': 'cluster',
    #         'data': cluster,
    #         'index': min([c.get('index') for c in cluster] + [0]),
    #         'keywords': [w for w in count(k, top=10, stemmer=PORTER, exclude=[], stopwords=False, language='en')]
    #     }
    #     cluster['contains_text'] = contains_text
    #     data_to_be_written.append(cluster)
    # print "{0} results".format(len(data_to_be_written))

    data_to_be_written.append({
        'type': 'meta',
        'keywords': knowledgeKeywords,
    })
    result = {}
    for i in data_to_be_written:
        if i.get('type') in ['result', 'duplicate']:
            url = i.get('url')
            index = int(i.get('index'))
            result[index] = [1 for r in data_to_be_written
                             if r.get('type') == 'similar' and r['source'] == url]
    result2 = [i for i, j in result.iteritems()]
    result3 = [len(j) for i, j in result.iteritems()]
    Process(target=plot_graph, args=(result2, result3)).start()
    return data_to_be_written

#!/usr/bin/env python

# Imports
import cgi, cgitb
import sys, json
from pattern.en import referenced
from pattern.vector import count, LEMMA
from pattern.en import parse, Sentence

# Logic
data = cgi.FieldStorage()
output = data.getvalue("documentText")

s = Sentence(parse(output))
res = count(s, exclude=[".", ",", "-", "!", '"', "'", ":", ";", "?"], stemmer=LEMMA)

print "Content-Type: application/json\n\n"
print json.dumps(res)

for row in all_q:
    row = filter(None, row)  # remove nulls

def fluency(questions):
    return len(questions)

def elaboration(questions):
    return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)

def variance(cluster):
    return avg([distance(centroid(cluster), v) for v in cluster])

vectors = []
for q in all_q:
    v = count(words(q), stemmer='lemma')
    v = Vector(v)
    vectors.append(v)

clusters = hierarchical(vectors, k=250, distance='cosine')
clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters]
clusters = sorted(clusters, key=variance)

categories = {}
for i, cluster in enumerate(clusters):
    for v in cluster:
        categories[row[vectors.index(v)]] = i

def flex(questions):
    ml_categories = []