Example #1
 def test_words(self):
     # Assert word split algorithm (the default treats line breaks as spaces and ignores numbers).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Example #3
 def get_keywords(self, comment_history):
     # Join the comment history into one string and count word frequencies.
     comments = [str(x) for x in comment_history]
     keywords = count(words(" ".join(comments)))
     # Sort (word, count) pairs by count, most frequent first.
     sorted_keywords = sorted(keywords.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)
     return sorted_keywords
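A shorter variant as a sketch (the helper name get_top_keywords is hypothetical): count() accepts a top parameter, documented in Example #16 below, that keeps only the n most frequent words.

 def get_top_keywords(self, comment_history, n=10):
     # Sketch: join the comments into one string and keep only the n most frequent words.
     text = " ".join(str(x) for x in comment_history)
     return count(words(text), top=n)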
Example #4
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
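A possible usage sketch (not from the original source; it assumes the module-level imports and the stopwords_hash dict that build_model() depends on are in place, and infers the 'text'/'url'/'index' keys from what the function reads):

results = [
    {'url': 'http://example.com/a', 'index': 0,
     'text': 'Cats spend most of the day sleeping in warm sunny places around the house.'},
    {'url': 'http://example.com/b', 'index': 1,
     'text': 'Dogs greet strangers loudly and chase thrown balls across the muddy garden.'},
]
model, m, model_sentences, sentence_dict = build_model(results)
print(model.shape)    # (number of kept sentences, number of model features)
print(sentence_dict)  # row index in `model` -> name (URL) of the source document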
Example #5
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterate over the cloud's list of links.
             # print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #6
 def __init__(self, data, url=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
Example #7
def roots_and_lemmas():

    print(stem('cars', PORTER))     # Porter root: crude suffix stripping.
    print(stem('cars', LEMMA))      # Lemma: dictionary base form.
    print(stem('studies', PORTER))  # Porter root.
    print(stem('studies', LEMMA))   # Lemma.

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
Example #8
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterate over the cloud's list of links.
             print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(
                 cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(
                 count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #9
 def start(self):
     cloudSize = dameCloudSize(self.id_request)
     cloudSize = cloudSize[0][0]
     searchKey = dameSerchKey(self.id_request)
     searchKey = searchKey[0][0]
     step = 0
     while step <= 5:  # TODO: set get_stop later; this indicates the number of levels.
         for id_cloud in dameIdCloud(self.id_request):  # Get the IDs of the clouds that belong to the project.
             print "Id Cloud: " + str(id_cloud[0])
             cloud = self.generar_cloud(dameNodo(id_cloud[0]))
             true_nodes = self.trueNodesSelection(cloud)
             for n in true_nodes:
                 try:
                     cloud.graph.node[n]['select'] = False
                     crawler = SimpleCrawler1(n, delay=0.1)
                     crawler.newStructure(cloud.graph)
                     time = 0
                 except:
                     continue
                 while len(crawler.visited) < cloudSize:
                     print "Cloudsize = " + str(cloudSize) + " Crawler Visited = " + str(len(crawler.visited)) + " Level = " + str(step)
                     print 'Exploring ...'
                     crawler.crawl(method=None)
                     time += 1
                     if time > cloudSize * 10:
                         break
                 actualizarSelect(cloud.graph.node[n]['ID'],
                                  cloud.graph.node[n]['select'])
                 print
                 print '##### Generating documents #####'
                 # Build the minePackage.
                 clouds = list()
                 clouds.append(cloud)
                 minePackage = dict()
                 minePackage['clouds'] = clouds
                 minePackage['searchKey'] = searchKey
                 minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
                 self.IRController.start(minePackage)  # Retrieve information.
                 # TODO: the scraper controller is still missing.
             # Set to None so they don't hold memory unnecessarily; everything is already saved in the DB.
             minePackage = None
             cloud = None
             gc.collect()
         step += 1
         print "Exploring level no: " + str(step)
         # Controls the number of levels to expand.
     print "Process finished"
Example #10
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     elif contenidoBd != "":
         self.contenidoConEtiquetas = contenidoBd
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
Example #11
def count_one_artist(name, bad_words):

    # Count the artist's song files so word counts can later be averaged per song.
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # Count every word in the artist's lyrics, then keep only the "bad" words.
    counts = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            counts[w] = docs[w]
    counts['num_songs'] = num_songs  # Piggyback the song count on the result.
    return counts
Example #12
 def tokenizer(self, url):
     # text = 'The black cat was spying on the white cat.'
     # stemmer=None, stemmer=LEMMA, stemmer=PORTER
     # print count(words(pageContent), stemmer=PORTER)
     # print count(words(pageContent), stemmer=LEMMA)

     # url_content = UrlToplainTextConverter()
     # page_content = url_content.plainTextConverter(url)
     page_content = url
     s = Sentence(parse(page_content))
     tokenized_file = count(words(s), stemmer=PORTER)
     print
     print tokenized_file
     print
Example #14
 def processor(self, minePackage):
     # print '####SEARCH_KEY:', minePackage['searchKey']
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(words(s), stemmer=PORTER)
     return minePackage['searchKey']
Example #15
# Assumes links (a set of seed article titles), seen, and frequency (dicts)
# are initialized before this loop.
while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
        print sum(frequency.values()), article.title

    except:
        pass

    # Collect a reliable amount of words (e.g., 1M).

    if sum(frequency.values()) > 1000000:
        break

#top = sorted((count, word) for word, count in frequency.items())
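A short sketch (assumed, not part of the original snippet) that completes the commented-out idea above and prints the most frequent words:

top = sorted(((freq, word) for word, freq in frequency.items()), reverse=True)
for freq, word in top[:20]:
    print("%s %d" % (word, freq))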
Example #16
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from pattern.vector import words, count, stem, PORTER, LEMMA, chngrams, Document, Vector, distance, Model, TFIDF,\
    HIERARCHICAL, Cluster, NB, kfoldcv, KNN, EUCLIDEAN, TF, SVM, RADIAL, gridsearch, GA
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
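A brief continuation sketch (not in the original script; it assumes pattern's Document.vector property in addition to the classes already imported above) showing TF-IDF document vectors with the same imports:

# Sketch: TF-IDF weighted document vectors.
d1 = Document('The black cat was spying on the white cat.', name='cats', stemmer=LEMMA)
d2 = Document('The white dog was barking at the black cat.', name='dogs', stemmer=LEMMA)
m = Model([d1, d2], weight=TFIDF)
print m.features                      # All features (words) in the model.
print distance(d1.vector, d2.vector)  # Cosine distance between the two documents.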
Example #17
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
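A follow-up sketch (assumed, not part of the original): wrapping two such bags of words in pattern.vector's Vector lets them be compared with distance(), as in the other examples here.

from pattern.vector import Vector, distance

v1 = Vector(v("the black cat sat on the mat"))
v2 = Vector(v("a black dog slept on the mat"))
print(distance(v1, v2))  # Cosine distance by default: 0.0 = identical, 1.0 = no overlap.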
Example #19
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
Example #20
for row in all_q:
    row = filter(None, row)  # Remove nulls.

    def fluency(questions):
        return len(questions)

    def elaboration(questions):
        return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)

    def variance(cluster):
        return avg([distance(centroid(cluster), v) for v in cluster])

    vectors = []

    for q in all_q:
        v = count(words(q), stemmer='lemma')
        v = Vector(v)
        vectors.append(v)

    clusters = hierarchical(vectors, k=250, distance='cosine')
    clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters]
    clusters = sorted(clusters, key=variance)

    categories = {}

    for i, cluster in enumerate(clusters):
        for v in cluster:
            categories[row[vectors.index(v)]] = i

    def flex(questions):
        ml_categories = []
Example #21
 def countWords(self):
     wordDict = count(
         words(plaintext(self.content),
               filter=lambda w: w.strip("'").isalpha()))
     return Counter(wordDict)
Example #22
def getWords(text):
    return words(
        text, stemmer=LEMMA, exclude=[], stopwords=False,
        language='en')  # seeing same results with stemmer.stem, LEMMA, PORTER
Example #24
 def processor(self, minePackage):
     print '####SEARCH_KEY:', minePackage['searchKey']
     var = minePackage['searchKey']
     s = Sentence(parse(var))
     return count(words(s), stemmer=PORTER)  # Returns a dictionary {word: count}.