def test_words(self):
    # Assert word split algorithm (default treats lines as spaces and ignores numbers).
    s = "The cat sat on the\nmat. 1 11."
    v = vector.words(s, filter=lambda w: w.isalpha())
    self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
    # Assert custom word filter.
    v = vector.words(s, filter=lambda w: True)
    self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
    print("pattern.vector.words()")
def get_keywords(self, comment_history):
    # Count word frequencies across the comment history and return them sorted by frequency.
    comments = [str(x) for x in comment_history]
    keywords = count(words(str(comments)))
    sorted_keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sorted_keywords
def build_model(results=[]):
    documents = [
        Document(i.get('text'), name=i.get('url'), description=i.get('index'), stemmer=LEMMA)
        for i in results
    ]
    m = Model(documents, weight=TFIDF)
    y, x = 1, len(m.features)
    model = np.zeros((y, x))
    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())
        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s
    model = np.delete(model, (0), 0)
    return model, m, model_sentences, sentence_dict
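# A minimal sketch (illustrative, not part of the original project) of the pattern.vector
# Document/Model API that build_model() above relies on: Model gathers the combined
# vocabulary in m.features, which build_model() uses as the column order of its binary
# sentence/term matrix. The sample texts and the names d1, d2, m are hypothetical.
from pattern.vector import Document, Model, TFIDF, LEMMA

d1 = Document("Cats chase mice.", name="doc1", stemmer=LEMMA)
d2 = Document("Dogs chase cats.", name="doc2", stemmer=LEMMA)
m = Model([d1, d2], weight=TFIDF)

print(m.features)  # Combined vocabulary across documents.
print(d1.vector)   # Sparse {feature: tf-idf weight} mapping for doc1.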
def run(self, minePackage):
    clouds = minePackage['clouds']
    urlContent = UrlToPlainText()
    for cloud in clouds:
        for n in cloud.graph.nodes():  # Iterate over the cloud's list of links.
            # print cloud.graph.node[n]['link']
            pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
            cloud.graph.node[n]['methodData'] = MethodData(
                count(words(Sentence(parse(pageContent))), stemmer=PORTER))
def __init__(self, data, url=""):
    if url != "":
        urlContent = UrlToPlainText()
        self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
        self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
    else:
        self.contenido = ""
    self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def roots_and_lemmas():
    print(stem('cars', PORTER))     # Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))
    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
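# For reference, a hedged sketch of the difference the snippet above prints: Porter stemming
# truncates words to a root that need not be a dictionary word, while lemmatization maps
# them to their base form (the values in the comments are the expected outputs).
from pattern.vector import stem, PORTER, LEMMA

print(stem('studies', PORTER))  # expected: 'studi'
print(stem('studies', LEMMA))   # expected: 'study'
print(stem('cars', PORTER))     # expected: 'car'
print(stem('cars', LEMMA))      # expected: 'car'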
def run(self, minePackage):
    clouds = minePackage['clouds']
    urlContent = UrlToPlainText()
    for cloud in clouds:
        for n in cloud.graph.nodes():  # Iterate over the cloud's list of links.
            print cloud.graph.node[n]['link']
            pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
            cloud.graph.node[n]['methodData'] = MethodData(
                count(words(Sentence(parse(pageContent))), stemmer=PORTER))
def start(self):
    cloudSize = dameCloudSize(self.id_request)
    cloudSize = cloudSize[0][0]
    searchKey = dameSerchKey(self.id_request)
    searchKey = searchKey[0][0]
    step = 0
    while step <= 5:  # TODO: make get_stop configurable later; this sets how many levels are expanded.
        for id_cloud in dameIdCloud(self.id_request):  # Get the IDs of the clouds that belong to the project.
            print "Id Cloud: " + str(id_cloud[0])
            cloud = self.generar_cloud(dameNodo(id_cloud[0]))
            true_nodes = self.trueNodesSelection(cloud)
            for n in true_nodes:
                try:
                    cloud.graph.node[n]['select'] = False
                    crawler = SimpleCrawler1(n, delay=0.1)
                    crawler.newStructure(cloud.graph)
                    time = 0
                except:
                    continue
                while len(crawler.visited) < cloudSize:
                    print "Cloudsize = " + str(cloudSize) + " Crawler Visited = " + str(len(crawler.visited)) + " Nivel = " + str(step)
                    print 'Explorando ...'
                    crawler.crawl(method=None)
                    time += 1
                    if time > cloudSize * 10:
                        break
                actualizarSelect(cloud.graph.node[n]['ID'], cloud.graph.node[n]['select'])
            print
            print '#####Generando documentos#####'
            # Build the minePackage.
            clouds = list()
            clouds.append(cloud)
            minePackage = dict()
            minePackage['clouds'] = clouds
            minePackage['searchKey'] = searchKey
            minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
            self.IRController.start(minePackage)  # Retrieve information.
            # TODO: the scraper controller is still missing.
            # Set these to None so they do not hold unnecessary memory; everything has already been saved to the DB.
            minePackage = None
            cloud = None
            gc.collect()
        step += 1
        print "Explorando nivel nro: " + str(step)  # Controls how many levels are expanded.
    print "Proceso Finalizado"
def __init__(self, data, url="", contenidoBd=""):
    if url != "":
        urlContent = UrlToPlainText()
        self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
        self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
    elif contenidoBd != "":
        self.contenidoConEtiquetas = contenidoBd
        self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
    else:
        self.contenido = ""
    self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def count_one_artist(name, bad_words):
    # Count the artist's songs by listing the artist's directory,
    # so word counts can later be averaged per song.
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))
    counts = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            counts[w] = docs[w]
    counts['num_songs'] = num_songs  # Store the song count alongside the word counts.
    return counts
def tokenizer(self, url):
    # text = 'The black cat was spying on the white cat.'
    # stemmer=None, stemmer=LEMMA, stemmer=PORTER
    # print count(words(pageContent), stemmer=PORTER)
    # print count(words(pageContent), stemmer=LEMMA)
    # url_content = UrlToplainTextConverter()
    # page_content = url_content.plainTextConverter(url)
    page_content = url
    s = Sentence(parse(page_content))
    tokenized_file = count(words(s), stemmer=PORTER)
    print
    print tokenized_file
    print
def processor(self, minePackage):
    # print '####SEARCH_KEY:', minePackage['searchKey']
    s = Sentence(parse(minePackage['searchKey']))
    minePackage['searchKey'] = count(words(s), stemmer=PORTER)
    return minePackage['searchKey']
while len(links) > 0:
    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True
        # Parse links from article.
        for link in article.links:
            if link not in seen:
                links.add(link)
        # Parse words from article. Count words.
        for word in words(article.string):
            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
        print sum(frequency.values()), article.title
    except:
        pass
    # Collect a reliable amount of words (e.g., 1M).
    if sum(frequency.values()) > 1000000:
        break

#top = sorted((count, word) for word, count in frequency.items())
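# A short sketch of how the commented-out line above could be completed to list the most
# frequent words first (assumes frequency is the {word: count} dict built by the loop):
top = sorted(((n, w) for w, n in frequency.items()), reverse=True)
for n, w in top[:20]:  # The 20 most frequent words.
    print("%s %s" % (n, w))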
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from pattern.vector import (words, count, stem, PORTER, LEMMA, chngrams, Document, Vector, distance,
                            Model, TFIDF, HIERARCHICAL, Cluster, NB, kfoldcv, KNN, EUCLIDEAN, TF,
                            SVM, RADIAL, gridsearch, GA)
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from random import choice

# Word count.
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    # words() returns a list of words by splitting the string on spaces.
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`"@#$^&*+-|=~_')
    # count() takes a list of words and returns a dictionary of (word, count)-items.
    freq_dic = count(
        words=words_list,
        top=None,         # Filter words not in the top most frequent (int).
        threshold=0,      # Filter words whose count <= threshold.
        stemmer=None,     # PORTER | LEMMA | function | None
        exclude=[],       # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')    # en, es, de, fr, it, nl

for k, v in freq_dic.iteritems():
    print k, v

# Stop words and stemming.
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
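# A small follow-up sketch reusing words_list from the script above: the top= and threshold=
# parameters documented in the count() call can trim the result (the values 10 and 1 here
# are illustrative).
top_words = count(words_list, top=10, threshold=1)  # The 10 most frequent words seen more than once.
print(top_words)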
def v(s):
    """ Returns a bag-of-words vector for the given string. """
    v = {}
    v.update(count(words(s)))
    return v
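# Example usage of the helper above; the output is indicative only, since count() drops
# common stop words by default and the exact keys depend on the pattern version:
print(v("The black cat sat on the mat"))  # e.g. {'black': 1, 'cat': 1, 'sat': 1, 'mat': 1}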
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
for row in all_q:
    row = filter(None, row)  # Remove nulls.

def fluency(questions):
    return len(questions)

def elaboration(questions):
    return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)

def variance(cluster):
    return avg([distance(centroid(cluster), v) for v in cluster])

vectors = []
for q in all_q:
    v = count(words(q), stemmer='lemma')
    v = Vector(v)
    vectors.append(v)

clusters = hierarchical(vectors, k=250, distance='cosine')
clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters]
clusters = sorted(clusters, key=variance)

categories = {}
for i, cluster in enumerate(clusters):
    for v in cluster:
        categories[row[vectors.index(v)]] = i

def flex(questions):
    ml_categories = []
def countWords(self):
    wordDict = count(words(plaintext(self.content),
                           filter=lambda w: w.strip("'").isalpha()))
    return Counter(wordDict)
def getWords(text):
    # Seeing the same results with stemmer.stem, LEMMA and PORTER.
    return words(text, stemmer=LEMMA, exclude=[], stopwords=False, language='en')
def processor(self, minePackage):
    print '####SEARCH_KEY:', minePackage['searchKey']
    var = minePackage['searchKey']
    s = Sentence(parse(var))
    return count(words(s), stemmer=PORTER)  # Returns a dictionary of {word: count}.