def get_corpus(name):
    """Load a corpus file and parse each non-blank line as one document.

    Args:
        name: Path to the corpus file, one document per line.

    Returns:
        list: ``parse_document(line)`` for every non-empty line, in file order.
    """
    # The 'with' statement closes the file on exit; the original also called
    # corpus_file.close() explicitly, which was redundant dead code.
    with open(name, 'r') as corpus_file:
        return [parse_document(doc) for doc in corpus_file if doc.strip()]
for status, chunk in itertools.groupby( flattened_chunks, lambda (word, pos, chunk): chunk != 'O') ] valid_chunks = [ ' '.join(word.lower() for word, tag, chunk in wtc_group if word.lower() not in stopword_list) for status, wtc_group in valid_chunks_tagged if status ] all_chunks.append(valid_chunks) return all_chunks sentences = parse_document(toy_text) valid_chunks = get_chunks(sentences) print valid_chunks def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10): valid_chunks = get_chunks(sentences, grammar=grammar) dictionary = corpora.Dictionary(valid_chunks) corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks] tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus]
to control their body temperature. Their pillar-like legs can carry their great weight. African elephants have larger ears and concave backs while Asian elephants have smaller ears and convex or level backs. """ from gensim.summarization import summarize, keywords def text_summarization_gensim(text, summary_ratio=0.5): summary = summarize(text, split=True, ratio=summary_ratio) for sentence in summary: print(sentence) docs = parse_document(toy_text) text = ' '.join(docs) text_summarization_gensim(text, summary_ratio=0.4) sentences = parse_document(toy_text) norm_sentences = normalize_corpus(sentences,lemmatize=False) total_sentences = len(norm_sentences) print('Total Sentences in Document:', total_sentences) num_sentences = 3 num_topics = 2 vec, dt_matrix = build_feature_matrix(sentences,
lambda (word,pos,chunk): chunk != 'O')] valid_chunks = [' '.join(word.lower() for word, tag, chunk in wtc_group if word.lower() not in stopword_list) for status, wtc_group in valid_chunks_tagged if status] all_chunks.append(valid_chunks) return all_chunks sentences = parse_document(toy_text) valid_chunks = get_chunks(sentences) print valid_chunks def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10): valid_chunks = get_chunks(sentences, grammar=grammar) dictionary = corpora.Dictionary(valid_chunks) corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks] tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus]
for moving objects and digging. Elephants' large ear flaps help to control their body temperature. Their pillar-like legs can carry their great weight. African elephants have larger ears and concave backs while Asian elephants have smaller ears and convex or level backs. """ def text_summarization_gensim(text, summary_ratio=0.5): summary = summarize(text, split=True, ratio=summary_ratio) for sentence in summary: print(sentence) # Using Gensim Summarization Method docs = parse_document(document1) text = ' '.join(docs) text_summarization_gensim(text, summary_ratio=0.3) sentences = parse_document(document1) norm_sentences = normalize_corpus(sentences, lemmatize=False) total_sentences = len(norm_sentences) print('Total Sentences in Document:', total_sentences) num_sentences = 3 num_topics = 1 vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency') td_matrix = dt_matrix.transpose()
top_sentence_indices.sort() s = '' for index in top_sentence_indices: s = s + ' ' + sentences[index] print(sentences[index]) return s path = r'../../data/raw/OpinosisDataset1.0_0/topics/' allFiles = glob.glob(path + "/*.data") reviews = list() for file_ in allFiles: with open(file_, "r") as f: review = f.read() DOCUMENT = review sentences = parse_document(DOCUMENT) norm_sentences = normalize_corpus(sentences, lemmatize=True) print("Total Sentences:", len(norm_sentences)) filename_search = re.search(r'[^\\/:*?"<>|\r\n]+$', file_) filename = filename_search.group() myfile = open(r'../../data/processed/lsa/' + filename, 'w') myfile.writelines( lsa_text_summarizer(norm_sentences, num_sentences=2, num_topics=5, feature_type='frequency', sv_threshold=0.5)) myfile = open(r'../../data/processed/textrank_cosine/' + filename, 'w') myfile.writelines( textrank_text_summarizer(norm_sentences, num_sentences=2,
to control their body temperature. Their pillar-like legs can carry their great weight. African elephants have larger ears and concave backs while Asian elephants have smaller ears and convex or level backs. """ from gensim.summarization import summarize, keywords def text_summarization_gensim(text, summary_ratio=0.5): summary = summarize(text, split=True, ratio=summary_ratio) for sentence in summary: print sentence docs = parse_document(toy_text) text = ' '.join(docs) text_summarization_gensim(text, summary_ratio=0.4) sentences = parse_document(toy_text) norm_sentences = normalize_corpus(sentences,lemmatize=False) total_sentences = len(norm_sentences) print 'Total Sentences in Document:', total_sentences num_sentences = 3 num_topics = 2