def bigram_collocations_example():
    """Print the ten most statistically significant two-word phrases in
    the full text of 'The Adventures of Sherlock Holmes'.

    NOTE(review): an identical function with the same name is defined
    again later in this file, so this definition is shadowed and never
    the one actually called.
    """
    # Use a statistical method to find the ten most important two-word
    # phrases in 'The Adventures of Sherlock Holmes'.
    example_dir = os.path.dirname(__file__)
    sample_text_file = os.path.join(example_dir, 'holmes.txt')
    text = open(sample_text_file).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    print '\nbigram_collocations_example 1\n'
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
def bigram_collocations_example(): # Find the 10 most statistically significant two word phrases in the # full text of 'The Adventures of Sherlock Holmes' example_dir = os.path.dirname(__file__) sample_text_file = os.path.join(example_dir, 'holmes.txt') text = open(sample_text_file).read() words = textmining.simple_tokenize(text) bigrams = textmining.bigram_collocations(words) print '\nbigram_collocations_example 1\n' for bigram in bigrams[:10]: print ' '.join(bigram)
def mine_files(file_list, word):
    """Mine the given text files for terms related to `word`.

    Builds a term-document matrix from text found near occurrences of
    (the first token of) `word`, extracts the top 15 bigram collocations
    over the whole corpus, appends up to 40 of the most frequent single
    terms (excluding stopwords and terms already covered by a bigram),
    and finally ranks everything by web search hit count.

    :param file_list: list of paths to plain-text files to mine
    :param word: query term; for multi-word queries only the first token
        is used when locating context windows
    :return: list of candidate phrases, most relevant first
    """
    data_send = []
    tdm = textmining.TermDocumentMatrix()
    # Use only the first token of a multi-word query when locating context.
    if len(word) > 1:
        wordTemp = word.split(' ')[0]
    else:
        wordTemp = word
    for path in file_list:
        raw = open(path).read()
        # NOTE(review): a pattern that matches the empty string does not
        # actually split the text under Python 2's re module (the whole
        # string comes back as one piece) -- this looks like it was meant
        # to be a whitespace split; confirm before changing.
        tokens = re.compile('').split(raw)
        spot = 0
        for y in range(len(tokens)):
            if tokens[y] == wordTemp:
                spot = y
            elif abs(y - spot) < 300:
                # Collect text near the keyword.  `+=` with a string
                # extends the list character by character; the join()
                # below reassembles it, so the net effect is plain
                # concatenation of the nearby tokens.
                data_send += tokens[y]
        # data_send deliberately accumulates across files, so each
        # document added also contains the context from earlier files.
        tdm.add_doc(''.join(data_send))

    # The first row of the matrix holds the terms themselves; the
    # remaining rows are per-document counts, summed column-wise.
    rows = tdm.rows(cutoff=1)
    pos = []
    freq = []
    first_row = True
    for row in rows:
        if first_row:
            pos = row
            first_row = False
        else:
            for r in range(len(row)):
                if len(freq) == r:
                    freq.append(row[r])
                else:
                    freq[r] += row[r]

    # Top bigram collocations over the concatenated corpus.
    text = ''
    for path in file_list:
        text = text + open(path).read()
        # os.remove(path)
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    results = []
    for bigram in bigrams[:15]:
        results.append(' '.join(bigram))

    # Zero out stopwords and terms already covered by a bigram, then key
    # the survivors by negated frequency so an ascending sort yields the
    # most frequent first.  NOTE(review): terms with equal frequency
    # collide on the same key and overwrite each other -- confirm whether
    # that loss is acceptable.
    single = {}
    for ind in range(len(freq)):
        if pos[ind] in textmining.stopwords:
            freq[ind] = 0
        for res in results:
            if pos[ind] in res:
                freq[ind] = 0
        single[-1 * freq[ind]] = pos[ind]
    # BUG FIX: dict.items() has no guaranteed order; sort ascending so the
    # most frequent terms (most negative keys) come first.
    single_end = sorted(single.items())
    # BUG FIX: bound the loop so fewer than 40 candidate terms no longer
    # raises IndexError.
    for q in range(min(40, len(single_end))):
        results.append(single_end[q][1])
    # BUG FIX: sorted() returns a new list which was previously discarded,
    # so the ranking never applied; sort in place instead.
    results.sort(key=lambda x: google_search_count(x + ' ' + word), reverse=True)
    return results
def bigrams(text): # bigrams bigrams = textmining.bigram_collocations(words) for bigram in bigrams[:10]: print ' '.join(bigram)