def setContent(self, title, detail):
    self.title = title
    self.detail = detail
    # Index content words
    words = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(title) + text.tokenize(detail)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def acquire_data_verbose(data, db=None):
    if db is None:
        db = dbctrl.handledb('adj')
    conn = MySQLdb.connect(user='******', passwd='aditya', db='mem')
    c = conn.cursor()
    print "connection opened to mem"
    c.execute("delete from normal_proc1")
    c.execute("delete from normal_proc2")
    print 'data wiped'
    for e in data:
        whoosh = text.tokenize(e)
        print 'text tokenized'
        for word in whoosh:
            # Insert each token with an initial frequency of 1
            c.execute("insert into normal_proc1 values(%s, 1)", (word,))
    conn.commit()
    print 'table 1 done'
    c.execute("insert into normal_proc2 select word, sum(freq) from normal_proc1 group by word")
    conn.commit()
    print 'table 2 done'
    conn.close()
def basic(review, rules, trules):
    elements = []
    emoticons = list(re.finditer(rules[1], review))
    print str(len(emoticons)) + " emoticons found"
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    print elements
    if re.search(rules[0], review):
        print "negatives detected"
        reconstruct = ''
        last = 0
        for e in re.finditer(rules[2], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    return elements
def retrieveTopAnswers(surveyId, questionId, answerStart=None, hideReasons=False):
    questionIdStr = str(questionId)
    logging.debug(('retrieveTopAnswers()', 'answerStart=', answerStart))
    answerRecords = []
    # Require user input to suggest answers, to force some user thought
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(answerStart)))
    logging.debug(('retrieveTopAnswers()', 'inputWords=', inputWords))
    if inputWords:
        # Retrieve answer records matching the last input word
        answerRecords = Answer.query(
            Answer.surveyId == surveyId,
            Answer.questionId == questionIdStr,
            Answer.words == inputWords[-1]).order(-Answer.score).fetch(1)
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            wordPair = ' '.join(inputWords[-2:])
            answerRecords += Answer.query(
                Answer.surveyId == surveyId,
                Answer.questionId == questionIdStr,
                Answer.words == wordPair).order(-Answer.score).fetch(1)
    logging.debug(
        ('retrieveTopAnswers()', 'answerRecords=', answerRecords))
    # Filter out empty answer/reason.
    # Answers missing a required reason should not be saveable, nor should empty answers.
    if hideReasons:
        answerRecords = filter(lambda a: a.hasAnswer(), answerRecords)
    else:
        answerRecords = filter(lambda a: a.hasAnswerAndReason(), answerRecords)
    return answerRecords
def basic(review, rules=None, trules=None, verbose=False):
    if rules is None:
        rules = learning()
    if trules is None:
        trules = text._retext()
    if verbose:
        debug.basic(review, rules, trules)
    elements = []
    #emoticons = list(re.finditer(rules[1], review))
    #if emoticons: elements.extend([e.group() for e in emoticons])
    if re.search(rules[0], review):
        reconstruct = ''
        last = 0
        for e in re.finditer(rules[2], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    return elements
def classify():
    decisive = map(norm, open('decisive.txt', 'rb').readlines())
    loriot = list(tokenize(open('loriot.txt', 'rb').read().decode('utf-8')))
    #print decisive
    #return
    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            scores[party][section.key] = {'tokens': len(section)}
            text = normalize(section.text)
            n_decisive = 0.0
            for phrase in decisive:
                if phrase in text:
                    n_decisive += 1
            scores[party][section.key]['decisive'] = n_decisive / len(section)
            n_loriot = 0.0
            for token in loriot:
                if token in text:
                    n_loriot += 1
            scores[party][section.key]['loriot'] = n_loriot / len(section)
            #terms = section_terms(model, section)
            #terms = [(t, s) for t, s in terms]
            #print [party, section.title, [t for t, s in terms[:10]]]
    #pprint(scores)
    with open('data/language.json', 'wb') as fh:
        json.dump(dict(scores), fh, indent=2)
def retrieveTopSlicesByScoreForStart(budgetId, sliceStart, hideReasons=False):
    logging.debug(
        ('retrieveTopSlicesByScoreForStart()', 'sliceStart=', sliceStart))
    # We always have sliceStart: slice suggestions are not pre-populated,
    # so there must always be slice input
    sliceRecords = []
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(sliceStart)))
    logging.debug('retrieveTopSlices() inputWords=' + str(inputWords))
    if inputWords:
        # Retrieve top-voted slice records matching the last input word.
        # Results are collected and match-scored in the client.
        # Only one inequality filter per query is supported, so we cannot
        # require that both title and reason are non-null.
        sliceRecords = Slice.query(
            Slice.budgetId == budgetId,
            Slice.words == inputWords[-1]).order(-Slice.score).fetch(1)
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            wordPair = ' '.join(inputWords[-2:])
            sliceRecords += Slice.query(
                Slice.budgetId == budgetId,
                Slice.words == wordPair).order(-Slice.score).fetch(1)
    logging.debug('retrieveTopSlices() sliceRecords=' + str(sliceRecords))
    # Filter out empty title/reason.
    # There should be no records missing both title and reason, since these
    # should not be saveable and should not word-match.
    if hideReasons:
        sliceRecords = filter(lambda s: s.hasTitle(), sliceRecords)
    else:
        sliceRecords = filter(lambda s: s.hasTitleAndReason(), sliceRecords)
    return sliceRecords
def setContent(self, content):
    self.content = content
    # Index content words
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def iter_speech_counts(speech, synsets, stopwords=None):
    speech_without_text = {k: v for k, v in speech.items() if k != 'text'}
    counts = Counter(text.tokenize(speech['text'], stopwords))
    total_count = sum(counts.values())
    for synset_name, synset_tokens in synsets:
        synset_count = sum(counts.get(synset_token, 0)
                           for synset_token in synset_tokens)
        yield dict(synset=synset_name,
                   synset_count=synset_count,
                   synset_proportion=float(synset_count) / float(total_count),
                   total_count=total_count,
                   **speech_without_text)
def test_tokenize(self):
    sentences = self.input_sentences
    expect = [
        unicode(
            'La Piedad del Vaticano es un grupo escultórico en mármol '
            'realizado por Miguel Ángel entre 1498 y 1499 .',
            encoding='utf-8'),
        unicode('Sus dimensiones son 174 por 195 cm .', encoding='utf-8'),
        unicode('Se encuentra en la Basílica de San Pedro del Vaticano .',
                encoding='utf-8'),
    ]
    actual = text.tokenize('es', sentences)
    self.assertEqual(expect, actual)
def analyze(review, db=None, regexp=None, debug=False):
    if regexp is None:
        regexp = _re_analysis()
    if db is None:
        db = memory.recollect()
    if debug:
        analyzed(review, db, regexp)
    trules = text._retext()
    elements = []
    emoticons = list(re.finditer(regexp["emoticons"], review))
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    if re.search(regexp["negation"], review):
        reconstruct = ''
        last = 0
        for e in re.finditer(regexp["plckng"], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    elements = filter(db.__contains__, elements)
    if elements:
        comp = [db[e] for e in elements]
        return sum(comp) / len(comp)
    return 0
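# Usage sketch (illustrative only, not part of the original source): analyze()
# expects a sentiment db mapping tokens (and '!'-prefixed negated tokens) to
# numeric scores, plus a regexp dict with "emoticons", "negation" and "plckng"
# patterns. With the defaults it might be called like
#
#     score = analyze("The plot was weak, but the acting was not bad :)")
#
# and returns the mean db score of the recognized tokens, or 0 if none match.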
def tabulate(review, rating, z):
    tokens = set(text.tokenize(review))
    c = z.cursor()
    #c.execute("SELECT word FROM words")
    #_data = c.fetchall()
    #Poorly constructed connector (not really)
    #data = []
    #for e in _data:
    #    data.append(e[0])
    #_int = tokens.intersection(set(data))
    '''for token in _int:
        c.execute("update words set freq = freq + 1 and score = score + " + str(rating))
        tokens.remove(token)'''
    for leftovers in tokens:
        # Insert each remaining token with an initial frequency of 1 and the review's rating
        c.execute("insert into words values(%s, 1, %s, 0)", (leftovers, rating))
def analyzed(review, db, regexp):
    trules = text._retext()
    elements = []
    emoticons = list(re.finditer(regexp["emoticons"], review))
    print "analyze: " + str(len(emoticons)) + " emoticons found"
    if emoticons:
        elements.extend([e.group() for e in emoticons])
    if re.search(regexp["negation"], review):
        print "analyze: negation detected"
        reconstruct = ''
        last = 0
        for e in re.finditer(regexp["plckng"], review):
            elements.extend(['!' + w for w in text.tokenize(e.group(), trules)])
            reconstruct += review[last:e.start()]
            last = e.end()
        print "analyze: " + str(len(elements) - len(emoticons)) + " negatives found"
        review = reconstruct + review[last:]
    elements.extend(text.tokenize(review, trules))
    elements = filter(db.__contains__, elements)
    print elements
    if elements:
        comp = [db[e] for e in elements]
        return sum(comp) / len(comp)
    return 0
def create(budgetId, title, reason, creator=None, fromEditPage=False):
    slice = Slice(id=Slice.toKeyId(budgetId, title, reason),
                  budgetId=budgetId,
                  title=title,
                  reason=reason,
                  creator=creator,
                  fromEditPage=fromEditPage)
    # Index content words
    content = ' '.join([w for w in [title, reason] if w])
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:conf.MAX_WORDS_INDEXED]  # Limit number of words indexed
    slice.words = text.tuples(words, maxSize=2)
    return slice
def dont_do(data, expec):
    '''Tally per-word rating sums and counts, then snapshot and pickle the result.'''
    learnt = {}
    i = 0
    for e in data:
        words = text.tokenize(e)
        for w in words:
            if w not in learnt:
                learnt[w] = [expec[i], 1]
            else:
                learnt[w][0] += expec[i]
                learnt[w][1] += 1
        i += 1
    # finally, remember this
    dbctrl.snapshot('adj')
    dbctrl.pickle_adj(learnt)
def acquire_data(data, db=None, verbose=False):
    '''Use the data standard. This will require more work.'''
    if verbose:
        return acquire_data_verbose(data, db)
    if db is None:
        db = dbctrl.handledb('adj')
    conn = MySQLdb.connect(user='******', passwd='aditya', db='mem')
    c = conn.cursor()
    c.execute("delete from normal_proc1")
    c.execute("delete from normal_proc2")
    for e in data:
        whoosh = text.tokenize(e)
        for word in whoosh:
            # Insert each token with an initial frequency of 1
            c.execute("insert into normal_proc1 values(%s, 1)", (word,))
    conn.commit()
    c.execute("insert into normal_proc2 select word, sum(freq) from normal_proc1 group by word")
    conn.commit()
    conn.close()
def retrieveTopReasonsForStart(proposalId, reasonStart):
    proposalIdStr = str(proposalId)
    inputWords = text.uniqueInOrder(
        text.removeStopWords(text.tokenize(reasonStart)))
    if conf.isDev:
        logging.debug('retrieveTopReasonsForStart() inputWords=' + str(inputWords))
    reasonRecordFutures = []
    if inputWords:
        # Retrieve top-voted reason records matching the last input word.
        # Results are collected and match-scored in the client.
        lastWord = inputWords[-1]
        reasonRecordFutures.append(
            Reason.query(
                Reason.proposalId == proposalIdStr,
                Reason.words == lastWord).order(-Reason.score).fetch_async(1))
        # Retrieve for the last input-word pair
        if 2 <= len(inputWords):
            lastTuple = ' '.join(inputWords[-2:])
            reasonRecordFutures.append(
                Reason.query(
                    Reason.proposalId == proposalIdStr,
                    Reason.words == lastTuple).order(-Reason.score).fetch_async(1))
    # De-duplicate records, since the top word and word-pair suggestions may be the same
    recordsUnique = {}
    for f in reasonRecordFutures:
        if f:
            for r in f.get_result():
                if r:
                    recordsUnique[r.key.id()] = r
    if conf.isDev:
        logging.debug('retrieveTopReasonsForStart() recordsUnique=' + str(recordsUnique))
    return recordsUnique.values()
def textprocess(results, termweight='tfidf'):
    # word count list
    wordcounts = []
    # df: document frequency
    df = {}
    # iterate Y! BOSS results
    for result in results:
        # word list
        words = []
        # title
        words.extend(text.tokenize(remove_html_tags(result['title'])))
        # abstract
        words.extend(text.tokenize(remove_html_tags(result['abstract'])))
        # key terms of Yahoo! BOSS results
        #if 'keyterms' in result and len(result['keyterms']) > 0:
        #    words.extend(text.cleanwords(_keyterms(result['keyterms']['terms'])))
        # del.icio.us tags of Yahoo! BOSS results
        #if 'delicious_toptags' in result and len(result['delicious_toptags']) > 0:
        #    words.extend(text.cleanwords(_delicioustags(result['delicious_toptags']['tags'])))
        # word count from word list
        wc = text.wordcount(words)
        wordcounts.append(wc)
        # accumulate document frequency
        for word, c in wc.items():
            df.setdefault(word, 0.0)
            df[word] += 1.0
    # word list: keep words that appear in more than one document
    wordlist = []
    for word, freq in df.items():
        #if freq > 1.0 and float(freq)/len(results) <= 0.6:
        if freq > 1.0:
            wordlist.append(word)
    #print "|D|:%d" % (len(results))
    doc_count = float(len(results))
    # generate word vectors
    wordvectors = []
    for wc in wordcounts:
        doc_size = float(sum([i for i in wc.values()]))
        #print "docsize:%d" % (doc_size)
        wordvector = []
        for word in wordlist:
            if word in wc:
                # boolean
                if termweight == 'boolean':
                    wordvector.append(1.0)
                # tf
                elif termweight == 'tf':
                    wordvector.append(wc[word])
                # normalized tf
                elif termweight == 'normtf':
                    tf = wc[word] / doc_size
                    wordvector.append(tf)
                # tfidf
                elif termweight == 'tfidf':
                    tf = wc[word] / doc_size
                    tf = math.sqrt(tf)
                    idf = math.sqrt(doc_count / df[word])
                    tfidf = tf * idf
                    wordvector.append(tfidf)
            else:
                wordvector.append(0.0)
        wordvectors.append(wordvector)
    #print wordlist
    #print wordvectors
    return wordlist, wordvectors
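# Usage sketch (illustrative only, not part of the original source): results is
# expected to be a list of Yahoo! BOSS-style dicts with 'title' and 'abstract'
# fields, e.g.
#
#     wordlist, wordvectors = textprocess(results, termweight='tfidf')
#
# where each row of wordvectors is one result's term-weight vector over wordlist.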
def tokens(self):
    if not hasattr(self, '_tokens'):
        self._tokens = list(tokenize(self.text))
    return self._tokens
def setContent(self, content):
    self.content = content
    words = text.uniqueInOrder(text.removeStopWords(
        text.tokenize(content)))
    words = words[0:20]  # Limit number of words indexed
    self.words = text.tuples(words, maxSize=2)
def do(data, expec):
    '''Naive work indeed'''
    _dict = {}
    _return = {}
    # Count how many reviews carry each rating 1..10
    numbers = [0.0] * 11
    for e in expec:
        numbers[e] += 1
    # prior: P(review rating == k)
    p_rev_eq_k = [0] * 11
    num = len(expec)
    for i in range(1, 11):
        p_rev_eq_k[i] = numbers[i] / num
    print "data initialized"
    # while learning it doesn't really matter how many times this appears. OR DOES IT?
    i = 0
    for e in data:
        buildup = []
        for s in text.split_sentences(e):
            words = text.tokenize(s)
            builder = []
            if len(set(words).intersection(set(negatives))) > 0:
                # Negated sentence: mark every word with a '!' prefix
                for d in words:
                    builder.append("!" + d)
            else:
                builder.extend(words)
            buildup.extend(builder)
        for w in buildup:
            try:
                _dict[w][expec[i]] += 1
            except KeyError:
                _dict[w] = [0] * 11
                _dict[w][expec[i]] += 1
        i += 1
    print "data loaded. " + str(len(_dict)) + " words present"
    print 'commence pruning'
    q = 0
    av = 0
    for w in _dict.keys():
        av += sum(_dict[w])
    av = av / len(_dict)
    av *= 0.9
    print str(av) + " is the threshold"
    for w in _dict.keys():
        if sum(_dict[w]) < av:
            _dict.pop(w)
            q += 1
    print str(q) + ' records removed'
    for w in _dict.keys():
        score = 0
        p_word_in_rev = 0
        for j in range(1, 11):
            try:
                p_word_in_rev += (_dict[w][j] / numbers[j]) * p_rev_eq_k[j]
            except:
                p_word_in_rev += 0
        weights = [0] * 11
        for j in range(1, 11):
            try:
                weights[j] = (_dict[w][j] / numbers[j]) * p_rev_eq_k[j] / p_word_in_rev
            except:
                weights[j] = 0
        for j in range(1, 11):
            score += weights[j] * j
        _return[w] = [score, sum(_dict[w])]
    print 'data processed'
    print 'task complete'
    return _return
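# Usage sketch (illustrative only, not part of the original source): data is a
# list of review strings and expec the matching integer ratings on a 1-10 scale,
# e.g.
#
#     model = do(["great movie, loved it", "dull and far too long"], [9, 3])
#
# The returned dict maps each surviving word to [estimated rating, total count].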