Example #1
0
def bigram_collocations_example():
    # 使用统计的方法,找出《夏洛克福尔摩斯的冒险中》的十大最重要的二词短语
    example_dir = os.path.dirname(__file__)
    sample_text_file = os.path.join(example_dir, 'holmes.txt')
    text = open(sample_text_file).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    print '\nbigram_collocations_example 1\n'
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
Example #2
0
def bigram_collocations_example():
    # Find the 10 most statistically significant two word phrases in the
    # full text of 'The Adventures of Sherlock Holmes'
    example_dir = os.path.dirname(__file__)
    sample_text_file = os.path.join(example_dir, 'holmes.txt')
    text = open(sample_text_file).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    print '\nbigram_collocations_example 1\n'
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
def mine_files(file_list, word):
	# Mine the given text files for phrases related to `word`.
	#
	# Builds a term-document matrix from the tokens found near occurrences
	# of (the first token of) `word` in each file, then returns candidate
	# phrases: the 15 strongest bigram collocations over the full text plus
	# up to 40 of the most frequent remaining single words, ranked by
	# Google search hit count for "<candidate> <word>".
	#
	# file_list: paths of text files to mine.
	# word: query word or phrase; only its first token is matched.
	data_send = []
	tdm = textmining.TermDocumentMatrix()
	# Only the first token of a multi-word query is matched in the files.
	if len(word) > 1:
		wordTemp = word.split(' ')[0]
	else:
		wordTemp = word
	# One document per file: the tokens within 300 positions of the most
	# recent sighting of wordTemp.
	for path in file_list:
		with open(path) as fh:
			data_raw = fh.read()
		# NOTE(review): an empty pattern does not actually split in
		# Python 2 -- the delimiter regex was probably lost; confirm
		# what it should be (e.g. r'\W+').
		splitter = re.compile('')
		data_raw = splitter.split(data_raw)
		spot = 0
		for y in range(len(data_raw)):
			if data_raw[y] == wordTemp:
				spot = y
			elif abs(y - spot) < 300:
				# += with a string extends the list character by
				# character; preserved from the original.
				data_send += data_raw[y]
		# NOTE(review): data_send is never reset per file, so each
		# document also contains the previous files' context -- confirm
		# whether that accumulation is intentional.
		tdm.add_doc(''.join(data_send))

	# Sum per-word frequencies across documents. The first row of the
	# matrix is the header row holding the words themselves.
	rows = tdm.rows(cutoff=1)
	pos = []
	freq = []
	for row_index, row in enumerate(rows):
		if row_index == 0:
			pos = row
		else:
			for col in range(len(row)):
				if len(freq) == col:
					freq.append(row[col])
				else:
					freq[col] += row[col]

	# The 15 strongest two-word collocations over the concatenated text.
	text = ''
	for path in file_list:
		with open(path) as fh:
			text = text + fh.read()
	words = textmining.simple_tokenize(text)
	bigrams = textmining.bigram_collocations(words)
	results = []
	for bigram in bigrams[:15]:
		results.append(' '.join(bigram))

	# Zero out stopwords and words already covered by a bigram, then key
	# the survivors by negated frequency so an ascending sort yields the
	# most frequent first. (Keys collide when frequencies tie, so only one
	# word per frequency survives -- preserved from the original.)
	single = {}
	for ind in range(len(freq)):
		if pos[ind] in textmining.stopwords:
			freq[ind] = 0
		for res in results:
			if pos[ind] in res:
				freq[ind] = 0
		single[-1 * freq[ind]] = pos[ind]

	# BUG FIX: dict.items() is unordered, so the "top 40" used to be
	# arbitrary -- sort ascending on the negated-frequency keys so the
	# most frequent words come first, and guard against fewer than 40
	# entries (the old range(0, 40) could raise IndexError).
	single_end = sorted(single.items())
	for q in range(min(40, len(single_end))):
		results.append(single_end[q][1])

	# BUG FIX: sorted() returns a new list that was previously discarded,
	# so the ranking never took effect; sort in place instead.
	results.sort(key=lambda x: google_search_count(x + ' ' + word),
	             reverse=True)

	return results
Example #4
0
def bigrams(text):
    # bigrams
    bigrams = textmining.bigram_collocations(words)
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
Example #5
0
def bigrams(text):
    # bigrams
    bigrams = textmining.bigram_collocations(words)
    for bigram in bigrams[:10]:
        print ' '.join(bigram)