Example #1
import operator
import textmining
import stemmer   # local Porter stemmer module used by this snippet

def indexText(text):
    # prepare text: split into sentences and drop empty lines
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    words = textmining.simple_tokenize(newtext)
    p = stemmer.PorterStemmer()
    # filter stop words
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    # use stemming; track each stem's frequency and a sample original word
    stemmed = []
    freq = {}
    occur = {}
    for w in words:
        stem = p.stem(w, 0, len(w) - 1)
        stemmed.append(stem)
        if stem not in stopwords:
            freq[stem] = stemmed.count(stem)
            occur[stem] = w
    sorted_freq = sorted(freq.iteritems(), key=operator.itemgetter(1),
                         reverse=True)
    # Concordance: print the most frequent non-stopword term
    most_freq_words = sorted_freq[:1]
    print "------Index-----"
    print occur[most_freq_words.pop()[0]]
    print "----------------"
    return occur
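A minimal driver for indexText, assuming a stopwords.txt file in the working directory (the file name comes from the snippet; the sample text is made up):

if __name__ == '__main__':
    sample = ("Whales are large. Whales swim in the ocean. "
              "The ocean is deep. Whales sing.")
    index = indexText(sample)          # prints the most frequent stemmed term
    for stem, word in index.items():
        print stem, '->', word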
Example #2
import operator
import textmining

def summarize(text):
    # prepare text: split into sentences and drop empty lines
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    tdm = textmining.TermDocumentMatrix()
    tdm.add_doc(newtext)
    # first row is the vocabulary, second row the counts for our one document
    for index, row in enumerate(tdm.rows(cutoff=1)):
        if index == 0: words = row
        if index == 1: count = row
    # filter stop words
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    freq = [(w, count[index]) for index, w in enumerate(words)
            if w not in stopwords]
    # sort by frequency, not alphabetically
    freq.sort(key=operator.itemgetter(1), reverse=True)
    # Concordance
    most_freq_words = freq[:10]
    # histogram() and threshold() are project-specific helpers: they score
    # the lines against the top terms and decide how many lines to keep
    h = histogram(lines, most_freq_words)
    rowcount = threshold(h)
    summary = [(index, line) for index, line in enumerate(lines)
               if index < rowcount]
    summary.sort()
    ret = [line[1] for line in summary]
    print '.'.join(ret)
    return ret
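histogram() and threshold() are not defined in the snippet (nor in textmining). A minimal sketch of what they might do, assumed here purely so summarize() can run: score each line by how many of the top terms it contains, and keep as many lines as score above the average:

def histogram(lines, most_freq_words):
    # assumed: count occurrences of the top terms in each line
    terms = [w for w, _ in most_freq_words]
    return [sum(line.count(t) for t in terms) for line in lines]

def threshold(h):
    # assumed: number of lines scoring above the mean
    if not h:
        return 0
    avg = float(sum(h)) / len(h)
    return len([s for s in h if s > avg])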
Example #3
import os
import textmining

def bigram_collocations_example():
    # Find the 10 most statistically significant two word phrases in the
    # full text of 'The Adventures of Sherlock Holmes'
    example_dir = os.path.dirname(__file__)
    sample_text_file = os.path.join(example_dir, 'holmes.txt')
    text = open(sample_text_file).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    print '\nbigram_collocations_example 1\n'
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
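bigram_collocations() returns bigrams ranked most significant first, so slicing the head of the list gives the strongest collocations. It also works on any in-memory string; a toy run (the sentence is made up, and tiny inputs give rough statistics):

words = textmining.simple_tokenize(
    'new york is a big city and new york never sleeps')
for bigram in textmining.bigram_collocations(words)[:3]:
    print ' '.join(bigram)   # 'new york' should rank near the top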
Example #4
import re
import textmining

def mine_files(file_list, word):
    data_send = []
    tdm = textmining.TermDocumentMatrix()
    # use only the first token of a (possibly multi-word) query
    wordTemp = word.split(' ')[0]
    for path in file_list:
        # the original split pattern was empty; a whitespace split is assumed
        data_raw = re.split(r'\s+', open(path).read())
        spot = 0
        for y in range(len(data_raw)):
            if data_raw[y] == wordTemp:
                spot = y
            elif abs(y - spot) < 300:
                # keep tokens within a 300-token window around the query word
                data_send.append(data_raw[y])
        tdm.add_doc(' '.join(data_send))

    # first row of the matrix is the vocabulary; sum the remaining count
    # rows across documents
    pos = []
    freq = []
    count = 0
    for row in tdm.rows(cutoff=1):
        if count == 0:
            pos = row
            count += 1
        else:
            for r in range(len(row)):
                if len(freq) == r:
                    freq.append(row[r])
                else:
                    freq[r] += row[r]

    # collect the most significant bigrams over the full, unwindowed text
    text = ''
    for path in file_list:
        text = text + open(path).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    results = []
    for bigram in bigrams[:15]:
        results.append(' '.join(bigram))

    # zero out stopwords and words already covered by a bigram, then key the
    # remaining words by negated frequency so an ascending sort puts the most
    # frequent first
    single = {}
    for ind in range(len(freq)):
        if pos[ind] in textmining.stopwords:
            freq[ind] = 0
        for res in results:
            if pos[ind] in res:
                freq[ind] = 0
        single[-1 * freq[ind]] = pos[ind]

    single_end = sorted(single.items())
    for q in range(min(40, len(single_end))):
        results.append(single_end[q][1])

    # google_search_count() is a project helper; sorted() returns a new list,
    # so the result must be reassigned (the original discarded it)
    results = sorted(results, key=lambda x: google_search_count(x + ' ' + word),
                     reverse=True)
    return results
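A hedged driver for mine_files: google_search_count() is not defined in the snippet, so the stub below is a placeholder only, and holmes.txt reuses the sample file from Example #3:

def google_search_count(phrase):
    # placeholder for the project's real search hit-count lookup
    return len(phrase)

print mine_files(['holmes.txt'], 'holmes')[:10]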
Example #5
import textmining

def isClassifiable(word):
    # a word is classifiable if it is not a stop word
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    return word not in stopwords
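For example, filtering a token stream down to classifiable words (the output depends on what stopwords.txt contains):

words = textmining.simple_tokenize('the cat sat on the mat')
print [w for w in words if isClassifiable(w)]   # drops 'the', 'on', ...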
Example #6
import textmining

def bayesClassify(text, category, trainer):
    # calcWordsProbability, calculateOverallProbability and
    # normalizeSignificance are naive-Bayes helpers defined elsewhere
    # in this project
    words = textmining.simple_tokenize(text)
    wps = calcWordsProbability(words, category, trainer)
    return normalizeSignificance(calculateOverallProbability(wps))
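None of the three helpers appear in these snippets, so the stubs below are illustrative only (a textbook naive-Bayes shape with add-one smoothing, not this project's actual code):

def calcWordsProbability(words, category, trainer):
    # assumed: P(word | category) from the trainer's word counts
    counts = trainer[category]
    total = float(sum(counts.values()))
    vocab = len(counts)
    return [(counts.get(w, 0) + 1) / (total + vocab) for w in words]

def calculateOverallProbability(wps):
    # assumed: product of the per-word probabilities
    prob = 1.0
    for p in wps:
        prob *= p
    return prob

def normalizeSignificance(p):
    return p   # assumed identity; the real project may rescale

trainer = {'spam': {'free': 30, 'money': 25, 'hello': 1}}
print bayesClassify('free money', 'spam', trainer)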