Example #1
def sentipol(post_seg):
    '''emotion polarity (情緒極性) determiner'''
    pos_con, neg_con = [], []
    pos_cnt, neg_cnt = 0, 0
    for word in post_seg:
        if word in pos_lst:
            pos_cnt += 1
            pos_con.append(word)
        elif word in neg_lst:
            neg_cnt += 1
            neg_con.append(word)
    diff = pos_cnt - neg_cnt

    pos_fin = count(pos_con).items()
    neg_fin = count(neg_con).items()

    pos_fin = sorted(pos_fin, key=lambda x: x[-1], reverse=True)
    neg_fin = sorted(neg_fin, key=lambda x: x[-1], reverse=True)

    postmod = ' '.join(post_seg)

    for w, n in pos_fin:
        postmod = postmod.replace(w, '<span class="positive">%s</span>' % w)
    for w, n in neg_fin:
        postmod = postmod.replace(w, '<span class="negative">%s</span>' % w)

    return {'content': postmod, 'diff': diff, 'pos': pos_fin, 'neg': neg_fin}
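
A hedged usage sketch for sentipol() above: count comes from pattern.vector, and pos_lst / neg_lst are module-level sentiment lexicons the function closes over; the tiny lexicons and sample tokens below are assumptions for illustration only.

from pattern.vector import count

pos_lst = ['great', 'fun']   # assumed positive lexicon
neg_lst = ['boring']         # assumed negative lexicon

r = sentipol(['a', 'great', 'but', 'boring', 'film'])
print(r['diff'])     # 1 positive - 1 negative = 0
print(r['content'])  # tokens wrapped in <span class="positive"> / <span class="negative">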
Example #2
def roots_and_lemmas():

    print(stem('cars', PORTER))     # root: 'car'
    print(stem('cars', LEMMA))      # lemma: 'car'
    print(stem('studies', PORTER))  # root: 'studi'
    print(stem('studies', LEMMA))   # lemma: 'study'

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
Example #3
def instance(review):                     # "Great book!"
    # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    v = tag(review)
    v = [word for (word, pos) in v if pos in ("JJ", "RB") or word in ("!",)]
    v = [predicative(word) for word in v]  # ["great", "!"]
    v = count(v)                          # {"great": 1, "!": 1}
    return v
Example #4
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(str(comments)))
     sorted_keywords = sorted(keywords.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)
     return sorted_keywords
Example #5
def instance(review):  # "Great book!"
    # [("Great", "JJ"), ("book", "NN"), ("!", "!")]
    v = tag(review)
    v = [word for (word, pos) in v if pos in ("JJ", "RB") or word in ("!",)]
    v = [predicative(word) for word in v]  # ["great", "!"]
    v = count(v)  # {"great": 1, "!": 1}
    return v
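
A hedged sketch of how the instance() vector above could feed a classifier: KNN is pattern.vector's k-nearest-neighbor classifier, and tag/predicative/count must already be imported for instance() to work; the training sentences and labels are assumptions.

from pattern.vector import KNN

knn = KNN()
knn.train(instance('Great book!'), type='positive')            # assumed label
knn.train(instance('Horribly boring book.'), type='negative')  # assumed label
print(knn.classify(instance('A great read!')))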
Example #6
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  #Iterates over the cloud's list of links
             # print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #7
 def __init__(self, data, url=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
Example #8
def trigram_text_feature(text):
    """
        return a {trigram: count} feature dict for the word trigrams in <text>
    """
    wl = nltk.word_tokenize(text.lower())
    trigrams = nltk.util.trigrams(wl)
    features = patvec.count(trigrams)
    return features
Example #9
 def test_count(self):
     # Assert wordcount with stemming, stopwords and pruning.
     w = ["The", "cats", "sat", "on", "the", "mat", "."]
     v1 = vector.count(w)
     v2 = vector.count(w, stemmer=vector.LEMMA)
     v3 = vector.count(w, exclude=["."])
     v4 = vector.count(w, stopwords=True)
     v5 = vector.count(w, stopwords=True, top=3)
     v6 = vector.count(w, stopwords=True, top=3, threshold=1)
     v7 = vector.count(w, dict=vector.readonlydict, cached=False)
     self.assertEqual(v1, {"cats": 1, "sat": 1, "mat": 1, ".": 1})
     self.assertEqual(v2, {"cat": 1, "sat": 1, "mat": 1, ".": 1})
     self.assertEqual(v3, {"cats": 1, "sat": 1, "mat": 1})
     self.assertEqual(v4, {
         "the": 2,
         "cats": 1,
         "sat": 1,
         "on": 1,
         "mat": 1,
         ".": 1
     })
     self.assertEqual(v5, {"the": 2, "cats": 1, ".": 1})
     self.assertEqual(v6, {"the": 2})
     # Assert custom dict class.
     self.assertTrue(isinstance(v7, vector.readonlydict))
     print "pattern.vector.count()"
Example #10
def pos_feature_builder(
        text, target_pos=('JJ', 'NN', 'VB', '!', 'NP', 'RB', 'CD')):
    """
        builds features from target part of speech tags specified by <target_pos>
    """
    if not text:
        return patvec.count([])
    try:
        parsed_text = paten.parsetree(text, lemmata=True)[0]
        selected = [word.lemma for word in parsed_text
                    if word.tag.startswith(target_pos)]
    except IndexError as e:
        print text, e
        selected = []
    result = patvec.count(selected)
    return result
Example #11
 def vector(self, name):
     """ Returns a dictionary with character bigrams and suffix.
         For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
     """
     v = chngrams(name, n=2)
     v = count(v)
     v[name[-2:] + "$"] = 1
     v[len(name)] = 1
     return v
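
A hedged standalone illustration of the vector() helper above; chngrams and count are pattern.vector functions, and calling them outside the class is an assumption for demonstration only.

from pattern.vector import chngrams, count

name = 'Felix'
v = count(chngrams(name, n=2))  # character bigrams: Fe, el, li, ix
v[name[-2:] + '$'] = 1          # suffix feature: 'ix$'
v[len(name)] = 1                # length feature: 5
print(v)                        # matches the docstring example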
Example #12
 def vector(self, name): 
     """ Returns a dictionary with character bigrams and suffix.
         For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
     """
     v = chngrams(name, n=2)
     v = count(v)
     v[name[-2:]+"$"] = 1
     v[len(name)] = 1
     return v
Example #13
 def start(self):
     cloudSize = dameCloudSize(self.id_request)
     cloudSize = cloudSize[0][0]
     searchKey = dameSerchKey(self.id_request)
     searchKey = searchKey[0][0]
     step = 0
      while step <= 5:  #TODO: later make get_stop configurable; this sets the number of levels
          for id_cloud in dameIdCloud(self.id_request):  #Gets the IDs of the clouds that belong to the project
             print "Id Cloud: " + str(id_cloud[0])
             cloud = self.generar_cloud(dameNodo(id_cloud[0]))
             true_nodes = self.trueNodesSelection(cloud)
             for n in true_nodes:
                 try:
                     cloud.graph.node[n]['select'] = False
                     crawler = SimpleCrawler1(n, delay=0.1)
                     crawler.newStructure(cloud.graph)
                     time = 0
                 except:
                     continue
                 while len(crawler.visited) < cloudSize:
                     print "Cloudsize = " + str(
                         cloudSize) + " Crawler Visited = " + str(
                             len(crawler.visited)) + " Nivel =  " + str(
                                 step)
                     print 'Explorando ...'
                     crawler.crawl(method=None)
                     time += 1
                     if time > cloudSize * 10:
                         break
                 actualizarSelect(cloud.graph.node[n]['ID'],
                                  cloud.graph.node[n]['select'])
                 print
                  print '#####Generating documents#####'
                  #Build the minePackage
                 clouds = list()
                 clouds.append(cloud)
                 minePackage = dict()
                 minePackage['clouds'] = clouds
                 minePackage['searchKey'] = searchKey
                  minePackage['searchKeyStemmer'] = count(
                      words(Sentence(parse(searchKey))), stemmer=PORTER)
                  self.IRController.start(minePackage)  #Retrieves information
                  #TODO: scraper controller still missing
              #Set to None so they don't take up unnecessary space; everything was already saved to the DB
             minePackage = None
             cloud = None
              gc.collect()
         step += 1
         print "Explorando nivel nro: " + str(step)
         #Controla los niveles a expandir, en este caso 10
     print "Proceso Finalizado"
Example #14
def selectWords(review):
    '''
    a function that gets a review and selects the nouns, adjectives, verbs and exclamation mark
    '''
    review = parsetree(review, lemmata=True)[0]  #lemmatize the review
    #select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks
    review = [
        w.lemma for w in review if w.tag.startswith(('JJ', 'NN', 'VB', '!'))
    ]
    review = count(review)  #a dictionary of (word, count)
    return review
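
A hedged usage sketch for selectWords() above (parsetree comes from pattern.en and count from pattern.vector; the sample review and the expected output are illustrative assumptions):

print(selectWords('The new books were surprisingly great!'))
# roughly {'new': 1, 'book': 1, 'great': 1, '!': 1}; 'be' and other stop words are filtered by count()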
Example #15
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
          for n in cloud.graph.nodes():  #Iterates over the cloud's list of links
             print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(
                 cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(
                 count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #16
    def train(cls, train_file, model_file):
        sents_dic = (json.loads(jsonl)
                     for jsonl in SoftSkills.load(train_file))
        model = KNN()

        for sent in sents_dic:
            text = sent['text']
            v = count([word for word, pos in tag(text)])  # {'sweet': 1}
            if v:
                model.train(v, type=sent['soft skill'])
        model.save(model_file)
        return model
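
A hedged counterpart sketch for train() above: reload the saved model and classify a new sentence. Classifier.load is pattern.vector's generic loader; the file name mirrors the model_file argument above, and the sample text is an assumption.

from pattern.vector import Classifier, count
from pattern.en import tag

model = Classifier.load('model_file')  # assumed path, as passed to train() above
v = count([word for word, pos in tag('communicates well with the team')])
print(model.classify(v))               # predicted 'soft skill' label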
Example #17
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
          if contenidoBd != "":
             self.contenidoConEtiquetas = contenidoBd
             self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
         else:
             self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
Example #18
	def tokenizer(self, url):
		#text = 'The black cat was spying on the white cat.'
		#stemmer=None, stemmer=LEMMA, stemmer=PORTER
		#print count(words(pageContent), stemmer=PORTER)
		#print count(words(pageContent), stemmer=LEMMA)

		#url_content = UrlToplainTextConverter()
		#page_content = url_content.plainTextConverter(url)
		page_content = url
		s = Sentence(parse(page_content))
		tokenized_file = count(words(s), stemmer=PORTER)
		print
		print tokenized_file
		print
Example #19
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs # this is cheap
    return dict
Example #20
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs  # this is cheap
    return dict
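
A hedged follow-up sketch: averaging the counts returned by count_one_artist() above on a per-song basis (the artist name and word list are assumptions):

counts = count_one_artist('some_artist', ['hate', 'money'])
num_songs = counts.pop('num_songs')
for word, c in counts.items():
    print('%s: %.2f' % (word, float(c) / num_songs))  # average occurrences per song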
Example #21
def postag_feature_builder(
        text, target_pos=('JJ', 'NN', 'VB', 'NP', 'RB', 'CD')):
    """
        faster version of the tag feature builder
        uses paten.tag instead of paten.parsetree
    """
    if not text:
        return {}
    # tag each word
    try:
        result = patvec.count(
            word for word, tag in paten.tag(text, tokenize=True, encoding='utf-8')
            if tag in target_pos)
    except IndexError as e:
        print text, e
        result = {}
    return result
Example #22
 def test_count(self):
     # Assert wordcount with stemming, stopwords and pruning.
     w = ["The", "cats", "sat", "on", "the", "mat", "."]
     v1 = vector.count(w)
     v2 = vector.count(w, stemmer=vector.LEMMA)
     v3 = vector.count(w, exclude=["."])
     v4 = vector.count(w, stopwords=True)
     v5 = vector.count(w, stopwords=True, top=3)
     v6 = vector.count(w, stopwords=True, top=3, threshold=1)
     v7 = vector.count(w, dict=vector.readonlydict, cached=False)
     self.assertEqual(v1, {"cats":1, "sat":1, "mat":1, ".":1})
     self.assertEqual(v2, {"cat":1, "sat":1, "mat":1, ".":1})
     self.assertEqual(v3, {"cats":1, "sat":1, "mat":1})
     self.assertEqual(v4, {"the":2, "cats":1, "sat":1, "on":1, "mat":1, ".":1})
     self.assertEqual(v5, {"the":2, "cats":1, ".":1})
     self.assertEqual(v6, {"the":2})
     # Assert custom dict class.
     self.assertTrue(isinstance(v7, vector.readonlydict))
     print("pattern.vector.count()")
Example #23
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 19:37:34 2019

@author: alternatif
"""

from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
Example #24
def v(review1):
    v3 = parsetree(review1, lemmata=True)[0]
    v4 = [w.lemma for w in v3 if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    v5 = count(v4)
    return v5
Example #25
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
Example #26
 def makeSameTense(self, w1, w2):
     tense = count([i[0] for i in tenses(w2)], stopwords=True)
     tense = sorted(tense, key=operator.itemgetter(2))
     return verbs.conjugate(w1, tense[0])
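
A hedged illustration of the idea behind makeSameTense() above, using pattern.en directly (the verbs module in the snippet is assumed to wrap pattern.en; the exact output depends on pattern's tense inventory):

from pattern.en import tenses, conjugate

print(conjugate('walk', tenses('ran')[0][0]))  # most likely 'walked' (the past tense of 'walk')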
Example #27
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.vector import words, count, stem, PORTER, LEMMA
from pattern.db import csv
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`'
                       '\"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
print count(words(s), stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
s = Sentence(parse(s))
print count(s, stemmer=LEMMA)
# character n-grams
Example #28
 def countWords(self):
     wordDict = count(
         words(plaintext(self.content),
               filter=lambda w: w.strip("'").isalpha()))
     return Counter(wordDict)
Example #29
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 19 19:45:49 2014

@author: scut1
"""

from pattern.web    import Twitter
from pattern.en     import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ'] # JJ = adjective
        v = count(v) # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print knn.classify('sweet potato burger')
print knn.classify('stupid autocorrect')
Example #30
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.
# pattern.vector
# The pattern.vector module is a toolkit for machine learning, based on a vector space model of bag-of-words
# documents with weighted features (e.g., tf-idf) and distance metrics (e.g., cosine similarity, infogain).
# Models can be used for clustering (k-means, hierarchical), classification (Naive Bayes, Perceptron, k-NN, SVM)
# and latent semantic analysis (LSA).

from pattern.web    import Twitter
from pattern.en     import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 10):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ'] # JJ = adjective
        v = count(v)
        if v:
            knn.train(v, type=p)

print knn.classify('sweet potato burger')
print knn.classify('stupid autocorrect')
# Displays
# 'WIN'
# 'FAIL'
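
A hedged sketch of the vector space model described in the comment block above: Document and Model with tf-idf weighting and cosine similarity are documented pattern.vector APIs, but the sample texts here are assumptions.

from pattern.vector import Document, Model, TFIDF

d1 = Document('The cat sat on the mat.', name='d1')
d2 = Document('A cat chased the mouse.', name='d2')
d3 = Document('Stock prices rose sharply today.', name='d3')

m = Model([d1, d2, d3], weight=TFIDF)  # bag-of-words documents, tf-idf weighted
print(m.similarity(d1, d2))            # cosine similarity: higher for the two cat documents
print(m.similarity(d1, d3))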
Example #31
 def makeSameTense(self, w1, w2):
     tense = count([i[0] for i in tenses(w2)], stopwords=True)
     tense = sorted(tense, key=operator.itemgetter(2))
     return verbs.conjugate(w1, tense[0])
Example #32
 def processor(self, minePackage):
     print '####SEARCH_KEY:', minePackage['searchKey']
     var = minePackage['searchKey']
     s = Sentence(parse(var))
      return count(words(s),
                   stemmer=PORTER)  #Returns a dictionary {word: count}
Example #33
def unigram_text_feature(text):
    """
        return a {term: count} feature dict for each word in <text>
    """
    features = patvec.count(nltk.word_tokenize(text.lower()))
    return features
Example #34
 def processor(self, minePackage):
     # print '####SEARCH_KEY:', minePackage['searchKey']
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(words(s), stemmer=PORTER)
     return minePackage['searchKey']
Example #35
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(str(comments)))
     sorted_keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1), reverse=True)
     return sorted_keywords
Example #36
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Example #37
def get_results(query, quantity, force=False, news=False, analysis=True):
    query = query.lower()
    start = datetime.now()

    query = query.replace('_', '%20')
    breakdown = 50

    if breakdown > quantity:
        breakdown = quantity

    data_to_be_written = []
    knowledgeKeywords = []
    duplicates = []

    results, created = webSearch.objects.get_or_create(queryText=query.strip())
    if created or force or len(results.results.all()) < quantity:
        all_results = getGoogleResults(query, quantity, news, force)
    else:
        all_results = []

    if len(all_results) == 0 and not created:
        all_results = [r.url for r in results.results.all()]

    all_results = all_results[:quantity]
    print "TOTAL RESULTS ", str(len(all_results))
    # Done with getting search results

    for index, i in enumerate(all_results):
        try:
            wr, created = WebResource.objects.get_or_create(url=i)
            if created:
                wr = parseURL(i, True)
            data = {'url': i}
            keywords = [
                w for w in count(words(wr.text), top=10, stemmer=LEMMA)
                if w not in stop
            ]

            if 'books.google' in i:
                text = ''
            else:
                text = wr.text

            data.update({
                'keywords': keywords,
                'text': plaintext(text),
                'title': wr.title,
                'urls': wr.urls,
                'type': 'result',
                'index': index + 1,
                'similar': [],
                'duplicates': [],
                'category': 0,
            })

            if wr not in results.results.all():
                results.results.add(wr)

            data['plaintext'] = data['text'].split('\n')

            # while '' in data['plaintext']:
            # 	data['plaintext'].remove('')

            # knowledgeKeywords.extend(data['keywords'])

            data_to_be_written.append(data)
        except Exception as e:
            print e

    print "Response Result model Prepared"

    if not analysis:
        return data_to_be_written

    list_of_sim_docs, model, m = find_similarity(data_to_be_written)
    for i in list_of_sim_docs:
        similar = {
            'type': 'similar',
            's': i.get('source'),
            'd': i.get('dest'),
            'source': i.get('source'),
            'dest': i.get('dest'),
            'score': i.get('score'),
        }
        data_to_be_written.append(similar)

        if similar['score'] > 0.9:
            for res in data_to_be_written:
                if res['type'] in [
                        'result', 'duplicate'
                ] and res['url'] == i.get('dest') and len(res['text']) > 0:
                    print "Duplicate [{0}].[{1}]".format(
                        i['source'][:20], i['dest'][:20])
                    res['type'] = 'duplicate'

    items = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in data_to_be_written
    ]
    m = Model(items, weight=TFIDF)

    # k = 10
    ####### BEGIN Experimental Setup ##########

    # v,d = m.features, m.documents
    # y,x = len(m.documents),len(m.features)

    def build_matrix(w=None, d=None):
        y, x = len(d), len(w)
        model = np.zeros((y, x))

        for i in range(y):
            model[i] = [1 if w[j] in d[i].words else 0 for j in range(x)]

        return model

    # def find_word_matches(model, words = None, d = None):
    # 	y,x = model.shape
    # 	for i in range(y):
    # 		for j in range(i+1,y):
    # 			a = np.copy(model[i])
    # 			b = np.copy(model[j])

    # 			a_ones = np.count_nonzero(a)
    # 			b_ones = np.count_nonzero(b)

    # 			comparison = (a==b)

    # 			cross_product = a*b
    # 			intersection = np.count_nonzero(cross_product)
    # 			union = a_ones+b_ones-intersection

    # 			if a_ones+b_ones>0 and intersection > 0:
    # 				score = intersection/union
    # 			else:
    # 				score = 0

    # 			if model[i].any() and model[j].any() and comparison.any() and score > 0.4:
    # 				print "Match [{0}] {1}:[{2} words] - [{3}] {4}:[{5} words] : {6} words".format(d[i].description,d[i].name[:30], np.count_nonzero(a), d[j].description,d[j].name[:30], np.count_nonzero(b), score, math.fabs(d[i].description - d[j].description))
    # 				similar = {
    # 					'type' : 'similar',
    # 					'source' : d[i].name,
    # 					'dest' : d[j].name,
    # 					'score' : score,
    # 				}
    # 				data_to_be_written.append(similar)

    # 			if score >= 0.9:
    # 				for res in data_to_be_written:
    # 					if res['type'] in ['result','duplicate'] and res['url'] == d[j].name and len(res['text'])>0:
    # 						print "Duplicate [{0}].[{1}]".format(i+1,j+1)
    # 						res['type'] = 'duplicate'
    # 	return model

    def word_frequency(model,
                       words=None,
                       documents=None,
                       threshold1=0,
                       threshold2=1,
                       transpose=False):
        "Returns frequent word amoung documents in range of threshold"
        y, x = model.shape
        data = {}

        for i in range(x):
            freq = np.count_nonzero(model[:, i]) / float(y)  # document frequency
            if threshold1 <= freq <= threshold2:
                if words:
                    data[words[i]] = freq
                else:
                    data[i] = freq
        return data

    model = build_matrix(m.features, m.documents)
    # model = find_word_matches(model, m.features, m.documents)
    knowledgeKeywords = [
        w for w in word_frequency(model, m.features, m.documents, 0.2, 0.8)
    ][:20]

    ####### END Experimental Setup ##########

    # c = m.cluster(method=HIERARCHICAL, k=k)
    # for i in c:
    # 	cluster = []
    # 	k = []
    # 	contains_text = False

    # 	for item in i:
    # 		for data in data_to_be_written:
    # 			if data.get('type') == 'result' and data.get('url')==item.name:
    # 				cluster.append({
    # 					'url' : data.get('url'),
    # 					'index' : item.description,
    # 					})
    # 				if data.get('text'):
    # 					k.extend([w for w in count(words(data.get('text')), top=50, stemmer = PORTER, exclude=[], stopwords=False, language='en')])
    # 					contains_text=True
    # 	cluster = {
    # 		'type' : 'cluster',
    # 		'data' : cluster,
    # 		'index' : min([c.get('index') for c in cluster] + [0]),
    # 		'keywords' : [w for w in count(k, top=10, stemmer = PORTER, exclude=[], stopwords=False, language='en')]
    # 	}

    # 	cluster['contains_text'] = contains_text

    # 	data_to_be_written.append(cluster)

    # print "{0} results".format(len(data_to_be_written))
    data_to_be_written.append({
        'type': 'meta',
        'keywords': knowledgeKeywords,
    })

    result = {}
    for i in data_to_be_written:
        if i.get('type') in ['result', 'duplicate']:
            url = i.get('url')
            index = int(i.get('index'))

            result[index] = [
                1 for r in data_to_be_written
                if r.get('type') == 'similar' and r['source'] == url
            ]

    result2 = [i for i, j in result.iteritems()]
    result3 = [len(j) for i, j in result.iteritems()]

    Process(target=plot_graph, args=(result2, result3)).start()

    return data_to_be_written
Example #38
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Example #39
#!/usr/bin/env python
from pattern.en import referenced

# Imports
import cgi, cgitb
import sys, json
from pattern.vector import count, LEMMA
from pattern.en import parse, Sentence

# Logic
data = cgi.FieldStorage()
output = data.getvalue("documentText")
s = Sentence(parse(output))
res = count(s, exclude=[".", ",", "-", "!", '"', "'", ":", ";", "?"], stemmer=LEMMA)

print "Content-Type: application/json\n\n"
print json.dumps(res)
Example #40
        for row in all_q:
            row = filter(None, row)  #remove nulls
    
            def fluency(questions):
                return len(questions)
            
            def elaboration(questions):
                return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)
                
            def variance(cluster):
                return avg([distance(centroid(cluster), v) for v in cluster])
    
            vectors = []
                
            for q in all_q:
                v = count(words(q), stemmer='lemma') 
                v = Vector(v)
                vectors.append(v)
                
            clusters = hierarchical(vectors, k=250, distance='cosine')
            clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters] 
            clusters = sorted(clusters, key=variance)
            
            categories = {}
            
            for i, cluster in enumerate(clusters):
                for v in cluster: 
                    categories[row[vectors.index(v)]] = i

            def flex(questions):
                ml_categories = []