def setContent(self, title, detail):
    self.title = title
    self.detail = detail
    # Index content words
    words = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(title) + text.tokenize(detail)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def acquire_data_verbose(data, db=None):
    if db is None:
        db = dbctrl.handledb('adj')
    conn = MySQLdb.connect(user='******', passwd='aditya', db='mem')
    c = conn.cursor()
    print "connection opened to mem"
    c.execute("delete from normal_proc1")
    c.execute("delete from normal_proc2")
    print 'data wiped'
    for e in data:
        whoosh = text.tokenize(e)
        print 'text tokenized'
        for word in whoosh:
            # Insert each token with an initial frequency of 1
            c.execute("insert into normal_proc1 values(%s, 1)", (word,))
    conn.commit()
    print 'table 1 done'
    c.execute("insert into normal_proc2 select word, sum(freq) from normal_proc1 group by word")
    conn.commit()
    print 'table 2 done'
    conn.close()
def basic(review, rules, trules):
    elements = []
    emoticons = list(re.finditer(rules[1], review))
    print str(len(emoticons)) + " emoticons found"
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    print elements
    if re.search(rules[0], review):
        print "negatives detected"
        reconstruct = ''
        last = 0
        for e in re.finditer(rules[2], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    return elements
def retrieveTopAnswers(surveyId, questionId, answerStart=None, hideReasons=False):
    questionIdStr = str(questionId)
    logging.debug(('retrieveTopAnswers()', 'answerStart=', answerStart))
    answerRecords = []
    # Require user input to suggest answers, to force some user thought
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(answerStart)))
    logging.debug(('retrieveTopAnswers()', 'inputWords=', inputWords))
    if inputWords:
        # Retrieve answer records matching the last input word
        answerRecords = Answer.query(
            Answer.surveyId == surveyId,
            Answer.questionId == questionIdStr,
            Answer.words == inputWords[-1]).order(-Answer.score).fetch(1)
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            wordPair = ' '.join(inputWords[-2:])
            answerRecords += Answer.query(
                Answer.surveyId == surveyId,
                Answer.questionId == questionIdStr,
                Answer.words == wordPair).order(-Answer.score).fetch(1)
    logging.debug(
        ('retrieveTopAnswers()', 'answerRecords=', answerRecords))
    # Filter out empty answer/reason.
    # Answers missing a required reason should not be saveable, nor should empty answers.
    if hideReasons:
        answerRecords = filter(lambda a: a.hasAnswer(), answerRecords)
    else:
        answerRecords = filter(lambda a: a.hasAnswerAndReason(), answerRecords)
    return answerRecords
def basic(review, rules=None, trules=None, verbose=False):
    if rules is None:
        rules = learning()
    if trules is None:
        trules = text._retext()
    if verbose:
        debug.basic(review, rules, trules)
    elements = []
    #emoticons = list(re.finditer(rules[1], review))
    #if emoticons: elements.extend([e.group() for e in emoticons])
    if re.search(rules[0], review):
        reconstruct = ''
        last = 0
        for e in re.finditer(rules[2], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    return elements
def classify():
    decisive = map(norm, open('decisive.txt', 'rb').readlines())
    loriot = list(tokenize(open('loriot.txt', 'rb').read().decode('utf-8')))
    #print decisive
    #return
    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            scores[party][section.key] = {'tokens': len(section)}
            text = normalize(section.text)
            n_decisive = 0.0
            for phrase in decisive:
                if phrase in text:
                    n_decisive += 1
            scores[party][section.key]['decisive'] = n_decisive / len(section)
            n_loriot = 0.0
            for token in loriot:
                if token in text:
                    n_loriot += 1
            scores[party][section.key]['loriot'] = n_loriot / len(section)
            #terms = section_terms(model, section)
            #terms = [(t, s) for t, s in terms]
            #print [party, section.title, [t for t, s in terms[:10]]]
    #pprint(scores)
    with open('data/language.json', 'wb') as fh:
        json.dump(dict(scores), fh, indent=2)
def retrieveTopSlicesByScoreForStart(budgetId, sliceStart, hideReasons=False):
    logging.debug(
        ('retrieveTopSlicesByScoreForStart()', 'sliceStart=', sliceStart))
    # We always have sliceStart: slice suggestions are not pre-populated,
    # so there must always be slice input
    sliceRecords = []
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(sliceStart)))
    logging.debug('retrieveTopSlices() inputWords=' + str(inputWords))
    if inputWords:
        # Retrieve top-voted slice records matching the last input word.
        # Results are collected and match-scored in the client.
        # Only one inequality filter per query is supported, so we cannot
        # require that both title and reason are non-null.
        sliceRecords = Slice.query(
            Slice.budgetId == budgetId,
            Slice.words == inputWords[-1]).order(-Slice.score).fetch(1)
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            wordPair = ' '.join(inputWords[-2:])
            sliceRecords += Slice.query(
                Slice.budgetId == budgetId,
                Slice.words == wordPair).order(-Slice.score).fetch(1)
    logging.debug('retrieveTopSlices() sliceRecords=' + str(sliceRecords))
    # Filter out empty title/reason.
    # There should be no records missing both title and reason, since these
    # should not be saveable and should not word-match.
    if hideReasons:
        sliceRecords = filter(lambda s: s.hasTitle(), sliceRecords)
    else:
        sliceRecords = filter(lambda s: s.hasTitleAndReason(), sliceRecords)
    return sliceRecords
def setContent(self, content):
    self.content = content
    # Index content words
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def iter_speech_counts(speech, synsets, stopwords=None):
    speech_without_text = {k: v for k, v in speech.items() if k != 'text'}
    counts = Counter(text.tokenize(speech['text'], stopwords))
    total_count = sum(counts.values())
    for synset_name, synset_tokens in synsets:
        synset_count = sum(counts.get(synset_token, 0)
                           for synset_token in synset_tokens)
        yield dict(synset=synset_name,
                   synset_count=synset_count,
                   synset_proportion=float(synset_count) / float(total_count),
                   total_count=total_count,
                   **speech_without_text)
def test_tokenize(self):
    sentences = self.input_sentences
    expect = [
        unicode(
            'La Piedad del Vaticano es un grupo escultórico en mármol '
            'realizado por Miguel Ángel entre 1498 y 1499 .',
            encoding='utf-8'),
        unicode('Sus dimensiones son 174 por 195 cm .', encoding='utf-8'),
        unicode('Se encuentra en la Basílica de San Pedro del Vaticano .',
                encoding='utf-8'),
    ]
    actual = text.tokenize('es', sentences)
    self.assertEqual(expect, actual)
def analyze(review, db=None, regexp=None, debug=False):
    if regexp is None:
        regexp = _re_analysis()
    if db is None:
        db = memory.recollect()
    if debug:
        analyzed(review, db, regexp)
    trules = text._retext()
    elements = []
    emoticons = list(re.finditer(regexp["emoticons"], review))
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    if re.search(regexp["negation"], review):
        reconstruct = ''
        last = 0
        for e in re.finditer(regexp["plckng"], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    elements = filter(db.__contains__, elements)
    if elements:
        comp = [db[e] for e in elements]
        return sum(comp) / len(comp)
    return 0
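# Usage sketch (illustrative only, not part of the original source): analyze()
# expects a sentiment db mapping tokens (and '!'-prefixed negated tokens) to
# numeric scores, plus a regexp dict with "emoticons", "negation" and "plckng"
# patterns. With the defaults it might be called like
#
#     score = analyze("The plot was weak, but the acting was not bad :)")
#
# and returns the mean db score of the recognized tokens, or 0 if none match.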
def tabulate(review, rating, z):
    tokens = set(text.tokenize(review))
    c = z.cursor()
    #c.execute("SELECT word FROM words")
    #_data = c.fetchall()
    #Poorly constructed connector (not really)
    #data = []
    #for e in _data:
    #    data.append(e[0])
    #_int = tokens.intersection(set(data))
    '''for token in _int:
        c.execute("update words set freq = freq + 1 and score = score + " + str(rating))
        tokens.remove(token)'''
    for leftovers in tokens:
        # Insert each remaining token with an initial frequency of 1 and the review's rating
        c.execute("insert into words values(%s, 1, %s, 0)", (leftovers, rating))
def analyzed(review, db, regexp):
    trules = text._retext()
    elements = []
    emoticons = list(re.finditer(regexp["emoticons"], review))
    print "analyze: " + str(len(emoticons)) + " emoticons found"
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    if re.search(regexp["negation"], review):
        print "analyze: negation detected"
        reconstruct = ''
        last = 0
        for e in re.finditer(regexp["plckng"], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        print "analyze: " + str(len(elements) - len(emoticons)) + " negatives found"
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    elements = filter(db.__contains__, elements)
    print elements
    if elements:
        comp = [db[e] for e in elements]
        return sum(comp) / len(comp)
    return 0
def create(budgetId, title, reason, creator=None, fromEditPage=False):
    slice = Slice(id=Slice.toKeyId(budgetId, title, reason),
                  budgetId=budgetId,
                  title=title,
                  reason=reason,
                  creator=creator,
                  fromEditPage=fromEditPage)
    # Index content words
    content = ' '.join([w for w in [title, reason] if w])
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    slice.words = text.tuples(words, maxSize=2)
    return slice
def dont_do(data, expec):
    '''Tally per-word rating sums and counts, then snapshot and pickle the result.'''
    learnt = {}
    i = 0
    for e in data:
        words = text.tokenize(e)
        for w in words:
            if w not in learnt:
                learnt[w] = [expec[i], 1]
            else:
                learnt[w][0] += expec[i]
                learnt[w][1] += 1
        i += 1
    # finally, remember this
    dbctrl.snapshot('adj')
    dbctrl.pickle_adj(learnt)
def acquire_data(data, db=None, verbose=False):
    '''Use the data standard. This will require more work.'''
    if verbose:
        return acquire_data_verbose(data, db)
    if db is None:
        db = dbctrl.handledb('adj')
    conn = MySQLdb.connect(user='******', passwd='aditya', db='mem')
    c = conn.cursor()
    c.execute("delete from normal_proc1")
    c.execute("delete from normal_proc2")
    for e in data:
        whoosh = text.tokenize(e)
        for word in whoosh:
            # Insert each token with an initial frequency of 1
            c.execute("insert into normal_proc1 values(%s, 1)", (word,))
    conn.commit()
    c.execute("insert into normal_proc2 select word, sum(freq) from normal_proc1 group by word")
    conn.commit()
    conn.close()
def retrieveTopReasonsForStart(proposalId, reasonStart):
    proposalIdStr = str(proposalId)
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(reasonStart)))
    if conf.isDev:
        logging.debug('retrieveTopReasonsForStart() inputWords=' + str(inputWords))
    reasonRecordFutures = []
    if inputWords:
        # Retrieve top-voted reason records matching the last input word.
        # Results are collected and match-scored in the client.
        lastWord = inputWords[-1]
        reasonRecordFutures.append(
            Reason.query(
                Reason.proposalId == proposalIdStr,
                Reason.words == lastWord).order(-Reason.score).fetch_async(1))
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            lastTuple = ' '.join(inputWords[-2:])
            reasonRecordFutures.append(
                Reason.query(
                    Reason.proposalId == proposalIdStr,
                    Reason.words == lastTuple).order(-Reason.score).fetch_async(1))
    # De-duplicate records, since the top word and word-pair suggestions may be the same
    recordsUnique = {}
    for f in reasonRecordFutures:
        if f:
            for r in f.get_result():
                if r:
                    recordsUnique[r.key.id()] = r
    if conf.isDev:
        logging.debug('retrieveTopReasonsForStart() recordsUnique=' + str(recordsUnique))
    return recordsUnique.values()
def textprocess(results, termweight='tfidf'):
    # word count list
    wordcounts = []
    # df: document frequency
    df = {}
    # iterate Y! BOSS results
    for result in results:
        # word list
        words = []
        # title
        words.extend(text.tokenize(remove_html_tags(result['title'])))
        # abstract
        words.extend(text.tokenize(remove_html_tags(result['abstract'])))
        # key terms of Yahoo! BOSS results
        #if 'keyterms' in result and len(result['keyterms']) > 0:
        #    words.extend(text.cleanwords(_keyterms(result['keyterms']['terms'])))
        # del.icio.us tags of Yahoo! BOSS results
        #if 'delicious_toptags' in result and len(result['delicious_toptags']) > 0:
        #    words.extend(text.cleanwords(_delicioustags(result['delicious_toptags']['tags'])))
        # word count from word list
        wc = text.wordcount(words)
        wordcounts.append(wc)
        # accumulate document frequency
        for word, c in wc.items():
            df.setdefault(word, 0.0)
            df[word] += 1.0
    # word list: keep words that appear in more than one document
    wordlist = []
    for word, freq in df.items():
        #if freq > 1.0 and float(freq)/len(results) <= 0.6:
        if freq > 1.0:
            wordlist.append(word)
    #print "|D|:%d" % (len(results))
    doc_count = float(len(results))
    # generate word vectors
    wordvectors = []
    for wc in wordcounts:
        doc_size = float(sum([i for i in wc.values()]))
        #print "docsize:%d" % (doc_size)
        wordvector = []
        for word in wordlist:
            if word in wc:
                # boolean
                if termweight == 'boolean':
                    wordvector.append(1.0)
                # tf
                elif termweight == 'tf':
                    wordvector.append(wc[word])
                # normalized tf
                elif termweight == 'normtf':
                    tf = wc[word] / doc_size
                    wordvector.append(tf)
                # tfidf
                elif termweight == 'tfidf':
                    tf = wc[word] / doc_size
                    tf = math.sqrt(tf)
                    idf = math.sqrt(doc_count / df[word])
                    tfidf = tf * idf
                    wordvector.append(tfidf)
            else:
                wordvector.append(0.0)
        wordvectors.append(wordvector)
    #print wordlist
    #print wordvectors
    return wordlist, wordvectors
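# Usage sketch (illustrative only, not part of the original source): results is
# expected to be a list of Yahoo! BOSS-style dicts with 'title' and 'abstract'
# fields, e.g.
#
#     wordlist, wordvectors = textprocess(results, termweight='tfidf')
#
# where each row of wordvectors is one result's term-weight vector over wordlist.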
def tokens(self):
    if not hasattr(self, '_tokens'):
        self._tokens = list(tokenize(self.text))
    return self._tokens
def setContent(self, content):
    self.content = content
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:20]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def do(data, expec):
    '''Naive work indeed'''
    _dict = {}
    _return = {}
    # Count how many reviews carry each rating 1..10
    numbers = [0.0] * 11
    for e in expec:
        numbers[e] += 1
    # prior: P(review rating == k)
    p_rev_eq_k = [0] * 11
    num = len(expec)
    for i in range(1, 11):
        p_rev_eq_k[i] = numbers[i] / num
    print "data initialized"
    # while learning it doesn't really matter how many times this appears. OR DOES IT?
    i = 0
    for e in data:
        buildup = []
        for s in text.split_sentences(e):
            words = text.tokenize(s)
            builder = []
            if len(set(words).intersection(set(negatives))) > 0:
                # Negated sentence: mark every word with a '!' prefix
                for d in words:
                    builder.append("!" + d)
            else:
                builder.extend(words)
            buildup.extend(builder)
        for w in buildup:
            try:
                _dict[w][expec[i]] += 1
            except KeyError:
                _dict[w] = [0] * 11
                _dict[w][expec[i]] += 1
        i += 1
    print "data loaded. " + str(len(_dict)) + " words present"
    print 'commence pruning'
    q = 0
    av = 0
    for w in _dict.keys():
        av += sum(_dict[w])
    av = av / len(_dict)
    av *= 0.9
    print str(av) + " is the threshold"
    for w in _dict.keys():
        if sum(_dict[w]) < av:
            _dict.pop(w)
            q += 1
    print str(q) + ' records removed'
    for w in _dict.keys():
        score = 0
        p_word_in_rev = 0
        for j in range(1, 11):
            try:
                p_word_in_rev += (_dict[w][j] / numbers[j]) * p_rev_eq_k[j]
            except:
                p_word_in_rev += 0
        weights = [0] * 11
        for j in range(1, 11):
            try:
                weights[j] = (_dict[w][j] / numbers[j]) * p_rev_eq_k[j] / p_word_in_rev
            except:
                weights[j] = 0
        for j in range(1, 11):
            score += weights[j] * j
        _return[w] = [score, sum(_dict[w])]
    print 'data processed'
    print 'task complete'
    return _return
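# Usage sketch (illustrative only, not part of the original source): data is a
# list of review strings and expec the matching integer ratings on a 1-10 scale,
# e.g.
#
#     model = do(["great movie, loved it", "dull and far too long"], [9, 3])
#
# The returned dict maps each surviving word to [estimated rating, total count].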