import operator
import os

import textmining
import stemmer  # Porter stemmer implementation used by indexText


def indexText(text):
    # prepare text: split into sentences and drop empty ones
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    words = textmining.simple_tokenize(newtext)
    p = stemmer.PorterStemmer()
    # filter stop words (read into its own variable so the text parameter
    # is not shadowed)
    stopword_text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(stopword_text)
    # use stemming: count how often each stem occurs and remember the last
    # surface form seen for it
    stemmed = []
    freq = {}
    occur = {}
    for w in words:
        stem = p.stem(w, 0, len(w) - 1)
        stemmed.append(stem)
        if stem not in stopwords:
            freq[stem] = stemmed.count(stem)
            occur[stem] = w
    sorted_freq = sorted(freq.iteritems(), key=operator.itemgetter(1), reverse=True)
    # Concordance: report the single most frequent stem
    most_freq_words = sorted_freq[:1]
    print "------Index-----"
    print occur[most_freq_words.pop()[0]]
    print "----------------"
    return occur
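
# Added usage sketch (illustrative, not part of the original module): shows
# how indexText is meant to be called. Assumes 'stopwords.txt' is in the
# working directory; the sample string is hypothetical.
def indexText_example():
    doc = "The fox runs fast. The fox sleeps. A dog barks at the fox."
    occurrences = indexText(doc)  # prints the most frequent non-stop stem
    # occurrences maps each stem to the last word it was seen as
    return occurrences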
def summarize(text):
    # prepare text: one cleaned sentence per line
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    tdm = textmining.TermDocumentMatrix()
    tdm.add_doc(newtext)
    # the first matrix row holds the words, the second the counts
    for index, row in enumerate(tdm.rows(cutoff=1)):
        if index == 0:
            words = row
        if index == 1:
            count = row
    # filter stop words
    stopword_text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(stopword_text)
    freq = [(w, count[index]) for index, w in enumerate(words) if w not in stopwords]
    # sort by count, not alphabetically
    freq.sort(key=operator.itemgetter(1), reverse=True)
    # Concordance: keep the ten most frequent words
    most_freq_words = freq[:10]
    # histogram() and threshold() are helpers defined elsewhere in this module
    h = histogram(lines, most_freq_words)
    rowcount = threshold(h)
    summary = [(index, line) for index, line in enumerate(lines) if index < rowcount]
    summary.sort()
    ret = [line[1] for line in summary]
    print '.'.join(ret)
    return ret
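
# Added usage sketch (illustrative, not part of the original module): the
# sample text is hypothetical, and it assumes 'stopwords.txt' exists and
# the histogram()/threshold() helpers are defined elsewhere in this module.
def summarize_example():
    doc = ("Whales are large marine mammals. Whales breathe air. "
           "Some whales sing. Ships sometimes strike whales.")
    return summarize(doc)  # prints and returns the selected sentences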
def bigram_collocations_example():
    # Find the 10 most statistically significant two word phrases in the
    # full text of 'The Adventures of Sherlock Holmes'
    example_dir = os.path.dirname(__file__)
    sample_text_file = os.path.join(example_dir, 'holmes.txt')
    text = open(sample_text_file).read()
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    print '\nbigram_collocations_example 1\n'
    for bigram in bigrams[:10]:
        print ' '.join(bigram)
def mine_files(file_list, word):
    data_send = []
    tdm = textmining.TermDocumentMatrix()
    # use only the first token of a multi-word query
    wordTemp = word.split(' ')[0]
    # collect the words that appear within 300 tokens of the query word in
    # each file and add them to the term-document matrix
    # (note: the original split on re.compile(''), a no-op pattern; a plain
    # whitespace split is what the word-by-word comparison below requires)
    for path in file_list:
        spot = 0
        data_raw = open(path).read().split()
        for y in range(len(data_raw)):
            if data_raw[y] == wordTemp:
                spot = y
            elif abs(y - spot) < 300:
                data_send.append(data_raw[y])
        tdm.add_doc(' '.join(data_send))
    # first row holds the words; sum the remaining rows into total counts
    pos = []
    freq = []
    count = 0
    for row in tdm.rows(cutoff=1):
        if count == 0:
            pos = row
            count += 1
        else:
            for r in range(len(row)):
                if len(freq) == r:
                    freq.append(row[r])
                else:
                    freq[r] += row[r]
    # earlier part-of-speech filtering experiment, kept for reference:
    # freq_word = [(counts[0][0], word) for (word, counts) in textmining.dictionary.items()]
    # freq_word.sort(reverse=True)
    # print '\ndictionary_example 1\n'
    # count = 0
    # for freq, word in freq_word[:]:
    #     for tup in textmining.dictionary[word]:
    #         if (tup[1] in ['aj0', 'ajc', 'ajs', 'at0', 'av0', 'avp', 'avq', 'cjc', 'cjs']
    #                 and float(tup[0]) / freq > .6 or word in textmining.stopwords):
    #             break
    #         # alternative: tup[1] in ['np0','nn1','nn0','nn2','vbz'] and float(tup[0])/freq > .9
    #         if freq > 100000:
    #             print word, freq, tup[0], float(tup[0]) / freq
    #             break
    # start the result list with the fifteen strongest bigram collocations
    text = ''
    for path in file_list:
        text = text + open(path).read()
        # os.remove(path)
    words = textmining.simple_tokenize(text)
    bigrams = textmining.bigram_collocations(words)
    results = []
    for bigram in bigrams[:15]:
        results.append(' '.join(bigram))
    # zero out stop words and words already covered by a bigram, then rank
    # the remaining single words by frequency (keys are negated counts, so
    # an ascending sort puts the most frequent first)
    single = {}
    for ind in range(len(freq)):
        if pos[ind] in textmining.stopwords:
            freq[ind] = 0
        for res in results:
            if pos[ind] in res:
                freq[ind] = 0
        single[-1 * freq[ind]] = pos[ind]
    single_end = sorted(single.items())
    for q in range(min(40, len(single_end))):
        results.append(single_end[q][1])
    # rank the candidates by co-occurrence with the query word;
    # google_search_count() is an external helper, and sorted() returns a
    # new list (the original discarded this result)
    results = sorted(results, key=lambda x: google_search_count(x + ' ' + word), reverse=True)
    return results
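
# Added usage sketch (illustrative, not part of the original module): the
# file names and query are hypothetical, and google_search_count() must be
# defined elsewhere for the final ranking step to work.
def mine_files_example():
    files = ['corpus1.txt', 'corpus2.txt']
    related_terms = mine_files(files, 'whale shark')
    for term in related_terms[:10]:
        print term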
def isClassifiable(word):
    # a word contributes to classification only if it is not a stop word
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    return word not in stopwords
def bayesClassify(text, category, trainer):
    # calcWordsProbability, calculateOverallProbability and
    # normalizeSignificance are helpers defined elsewhere in this module
    words = textmining.simple_tokenize(text)
    wps = calcWordsProbability(words, category, trainer)
    return normalizeSignificance(calculateOverallProbability(wps))
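
# Added illustration only -- calculateOverallProbability lives elsewhere in
# this project and may differ. As a hedged sketch, a classic way to combine
# per-word probabilities in a naive Bayes text filter is Graham's formula:
# P = prod(p) / (prod(p) + prod(1 - p)).
def combined_probability_sketch(word_probs):
    # word_probs: per-word P(category | word) estimates from the trainer
    prod_p = 1.0
    prod_not_p = 1.0
    for p in word_probs:
        prod_p *= p
        prod_not_p *= (1.0 - p)
    return prod_p / (prod_p + prod_not_p)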