#!/usr/bin/env python
# Summarize each topic generated from nmf_topics.py
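#
# Example invocation (config, mongo_setup and process_text are modules local
# to this project; a populated MongoDB instance is assumed):
#
#   python summarizer.py --subreddit all --summary_ratio 0.1 > summaries.txt
#
# Note: gensim.summarization was removed in gensim 4.0, so this script
# assumes gensim 3.x (and Python 2, given the print statements below).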
import argparse
from gensim.summarization import keywords, summarize
import config
from mongo_setup import mongoclient
from process_text import PostManager
# Re-wrap stdout so unicode output can be piped to a file without a
# UnicodeEncodeError (Python 2 defaults to ASCII for non-tty streams).
import sys
import codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Generate keywords or summary sentences for queried documents in a subreddit')
    arg_parser.add_argument('--subreddit', type=str, required=True,
                            help='subreddit name (or "all" to use all posts)')
    # arg_parser.add_argument('--topic_id', type=int, help='topic id to summarize', required=True)
    # arg_parser.add_argument('--topic_thresh', type=float, help='threshold for specified topic probability of documents', required=True)
    arg_parser.add_argument('--summary_ratio', type=float, default=0.2,
                            help='ratio of summary length to document length; smaller means a shorter summary')
    arg_parser.add_argument('--single_doc_len', type=int, default=2500,
                            help='each individual document is truncated to this many characters')
    args = arg_parser.parse_args()

    postman = PostManager(mongoclient, args.subreddit)
    search_words = config.SEARCH_WORDS  # only used by the commented-out token query below
    doc_char_limit = 60000
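    # rough cap on the concatenated text handed to gensim per topic;
    # TextRank-style summarization gets slow on very long inputs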
    print 'looking at topic-modeled posts in subreddit "%s"' % args.subreddit
    # print 'using topic prob threshold %f' % args.topic_thresh
    print 'per-topic character limit is roughly %i' % doc_char_limit
    print 'per-post character limit is %i' % args.single_doc_len
    for topic_id in sorted(postman.get_topics()):
        print '\nTopic #%s:\n=============' % topic_id
        # query_mixin = {'postwise.tokens': {'$in': search_words}}  # TODO: make query more general
        # query_mixin = {'postwise.topic_distro': {'$elemMatch': {'topic_id': topic_id, 'prob': {'$gt': args.topic_thresh}}}}
        query_mixin = {'postwise.topic_assignment.topic': topic_id}
        doc_id_text_generator = postman.fetch_doc_text_body(document_level='postwise', find_query_mixin=query_mixin)
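        # the live query selects posts hard-assigned to this topic; the
        # commented-out variants above filtered on search tokens or on
        # per-topic probability instead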
        concat_txt = ''
        breakout = 0  # guards against a runaway loop; also counts docs used
        while len(concat_txt) < doc_char_limit:
            if breakout > 9999:
                raise RuntimeError('concatenated over 9999 docs; aborting runaway loop')
            try:
                doc_id, text_body = next(doc_id_text_generator)
            except StopIteration:
                print 'not enough docs found, breaking'
                break
            # truncate each post so a single long post cannot dominate the topic
            concat_txt = ' '.join([concat_txt, text_body[:args.single_doc_len]])
            breakout += 1
        print 'used %i concatenated docs for this topic' % breakout
        print 'actual character length of concatenated docs: %i' % len(concat_txt)
        # make sure you have something
        if len(concat_txt) == 0:
            print 'got nothing for this topic'
            continue
        # TODO: make these command-line args
        generate_keywords = True
        generate_sentences = True
        if generate_keywords:
            print '\ngenerating keywords\n------------------------------\n'
            summary = keywords(concat_txt, ratio=args.summary_ratio, split=True, lemmatize=True)
            print ', '.join(summary)
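        # both keywords() and summarize() are gensim's graph-based extractive
        # methods (TextRank family), so output quality tracks how much text
        # made it into concat_txt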
        if generate_sentences:
            print '\ngenerating sentences\n------------------------------\n'
            summary = summarize(concat_txt, split=True, ratio=args.summary_ratio)
            for sentence in summary:
                print ' * ' + sentence
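        # caveat: gensim's summarize() raises ValueError when the input has
        # fewer than two sentences, so a topic backed by one very short post
        # can still fail here despite the emptiness check above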