示例#1
0
 def summarize(self, msgs, range_spec=None):
     """Return a summary string for a segment of messages.

     The result is a header followed by one ``self.tagged_sum`` line per
     selected message. With fewer than 11 messages (or fewer than 11
     distinct canonical texts) the longest messages are used. Otherwise
     the sentences returned by ``gs_sumrz`` are mapped back to the
     messages they came from, falling back to the longest messages when
     that yields at most one sentence.

     TODO: 1. Looks like spacy is not getting the main sentence from the message.
     2. Load times for the spacy summarizer won't cut it. Commenting out now
        until this can be fixed

     Args:
         msgs: messages to summarize; each is passed to ``get_msg_text``
             and ``self.tagged_sum``.
         range_spec: optional mapping. ``'txt'`` is the header text
             (default ``u'Summary is'``) and ``'size'`` the number of
             summary lines requested (default 3).

     Returns:
         The summary text, or a fixed "unable to form summary" string
         when ``msgs`` is empty or None.
     """
     if not msgs:
         self.logger.warning("No messages to form summary")
         return u"\n Unable to form summary here.\n"
     spec = range_spec or {}
     # .get() so a range spec without 'txt' uses the default header
     # instead of raising KeyError.
     header = spec.get('txt', u'Summary is')
     size = spec.get('size', 3)
     summ = header + u' '

     def word_count(text):
         return len(text.split())

     # Key messages by canonical text (identical canonical texts collapse
     # to one entry) and keep only the 300 longest by word count.
     can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
     top_keys = sorted(can_dict, key=word_count, reverse=True)[:300]
     can_dict = {key: can_dict[key] for key in top_keys}
     self.logger.info("Length of can_dict is %s", len(can_dict))
     # Fallback summary: the `size` longest messages (previously always 3,
     # which ignored the requested size).
     simple_sum = u'\n'.join([self.tagged_sum(can_dict[key]) for key in top_keys[:size]])
     gs_summ = None
     # If the number of messages or vocabulary is too low, just look for a
     # promising set of messages
     if len(msgs) < 11 or len(can_dict) < 11:
         self.logger.warning("Too few messages for NLP.")
     else:
         # Map every sentence (capped at 100 words) back to its message.
         max_sents = {}
         for (can_txt, msg) in can_dict.items():
             if word_count(can_txt) > 3:
                 # Use the same splitting that gensim does
                 for snt in split_sentences(can_txt):
                     words = snt.split()
                     if len(words) > 100:
                         snt = u' '.join(words[:100])
                     max_sents[snt] = msg
         if not max_sents:
             # Every canonical text had 3 words or fewer; computing the
             # ratio below would divide by zero.
             self.logger.warning("No sentences long enough for NLP.")
         else:
             # Ask for roughly twice the wanted number of sentences, but
             # never more than the whole text (ratio capped at 1).
             ratio = min(1.0, (size * 2) / float(len(max_sents)))
             sent1 = u' '.join(can_dict.keys())
             sent2 = u' '.join(max_sents.keys())
             # NOTE(review): gs_sumrz is presumably gensim's summarize();
             # confirm against the file's imports.
             gn_sum = gs_sumrz(sent1, ratio=ratio, split=True)[:size]
             mx_sum = gs_sumrz(sent2, ratio=ratio, split=True)[:size]
             self.logger.info("Gensim sum %s", gn_sum)
             nlp_text = u'\n'.join([self.tagged_sum(can_dict[ss] if ss in can_dict else max_sents[ss]) for ss in gn_sum if len(ss) > 1 and (ss in max_sents or ss in can_dict)])
             # Summary sentences that are not exact keys are matched to
             # messages by substring containment in either direction.
             for ss in mx_sum:
                 if ss not in max_sents and ss not in can_dict and len(ss.split()) > 5:
                     self.logger.info("Searching for: %s", ss)
                     for (ky, msg) in max_sents.items():
                         if ss in ky or (len(ky.split()) > 10 and ky in ss):
                             nlp_text += u'\n' + self.tagged_sum(msg)
             if len(gn_sum) > 1:
                 gs_summ = nlp_text
             else:
                 self.logger.warning("NLP Summarizer produced null output %s", nlp_text)
     summ += simple_sum if gs_summ is None else gs_summ
     self.logger.info("Summary for segment %s is %s", msgs, summ)
     return summ
示例#2
0
 def tagged_sum(self, msg):
     """Render one message as a single summary line.

     The line has the form ``@<time> <@user>: <text>``. The user is
     ``msg['user']`` if present, otherwise ``msg['bot_id']`` if present,
     otherwise a placeholder. The text is the message text cut to its
     first 30 whitespace-separated words, with ``...`` appended when it
     was cut. When ``self.channel`` is truthy the text is wrapped as
     ``<link|text>``, where the link is ``TsSummarizer.archive_link``
     formatted with the channel and the message ``ts`` with dots removed.
     """
     if 'user' in msg:
         author = msg['user']
     elif 'bot_id' in msg:
         author = msg['bot_id']
     elif 'username' in msg and msg['username'] == u'bot':
         author = '******'
     else:
         author = "******"
     words = get_msg_text(msg).split()
     if len(words) > 30:
         snippet = u' '.join(words[:30]) + u'...'
     else:
         snippet = u' '.join(words)
     if self.channel:
         ts_digits = re.sub(r'\.', u'', msg['ts'])
         link = TsSummarizer.archive_link.format(self.channel, ts_digits)
         snippet = u'<' + link + '|' + snippet + '>'
     # NOTE(review): '%-m' is the unpadded month number and is a
     # platform-specific directive; a day-of-month may have been intended
     # here -- confirm before changing the output format.
     stamp = ts_to_time(msg['ts']).strftime("%a-%b-%-m-%Y %H:%M:%S")
     return u'@{} <@{}>: {}'.format(stamp, author, snippet)
示例#3
0
 def summarize(self, msgs, range_spec=None):
     """Return a summary string for a segment of messages.

     The result is a header followed by one ``self.tagged_sum`` line per
     selected message, ordered by the messages' ``'ts'`` values. With
     fewer than 10 messages the longest messages are used. Otherwise the
     longest sentence of each message is fed to ``self.sumr`` and the
     sentences it returns are mapped back to their messages, falling back
     to the longest messages when fewer than two can be matched.

     TODO: 1. Looks like spacy is not getting the main sentence from the message.
     2. Load times for the spacy summarizer won't cut it. Commenting out now
        until this can be fixed

     Args:
         msgs: messages to summarize; each must have a ``'ts'`` entry and
             is passed to ``get_msg_text`` and ``self.tagged_sum``.
         range_spec: optional mapping. ``'txt'`` is the header text
             (default ``u'Summary is'``), ``'size'`` the number of
             summary lines wanted (default 3), and ``'start'`` an
             optional start date in ``"%B %d %Y"`` form. When given, the
             whole mapping is also passed to ``tspec_to_delta`` to get
             the length of the time window the messages are limited to.

     Returns:
         The summary text, or a fixed "unable to form summary" string
         when ``msgs`` is empty or None.
     """
     from datetime import datetime

     spec = range_spec or {}
     size = spec.get('size', 3)
     if not msgs:
         self.logger.warning("No messages to form summary")
         return u"\n Unable to form summary here.\n"
     # .get() so a range spec without 'txt' uses the default header
     # instead of raising KeyError.
     header = spec.get('txt', u'Summary is')
     if range_spec:
         self.logger.info("First 10 messages  %s of %s", msgs[:10],
                          len(msgs))
         self.logger.info("Using time range spec %s", range_spec)
         if 'start' in range_spec:
             # datetime.strptime rather than time.strptime: the result is
             # added to a delta and compared with ts_to_time() values,
             # which a struct_time does not support.
             start_time = datetime.strptime(range_spec['start'],
                                            "%B %d %Y")
         else:
             start_time = ts_to_time(
                 min(msgs, key=lambda m: m['ts'])['ts'])
         self.logger.info("Start time is  %s", start_time)
         end_time = start_time + tspec_to_delta(**range_spec)
         self.logger.info("End time is  %s", end_time)
         msgs = [
             msg for msg in msgs
             if start_time <= ts_to_time(msg['ts']) <= end_time
         ]
         self.logger.info("First 10 messages  %s of %s", msgs[:10],
                          len(msgs))

     def word_count(text):
         return len(text.split())

     def by_ts(msg):
         return msg['ts']

     # Key messages by canonical text (identical canonical texts collapse
     # to one entry), ordered longest first by word count.
     can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
     top_keys = sorted(can_dict, key=word_count, reverse=True)
     can_dict = {key: can_dict[key] for key in top_keys}
     self.logger.info("Length of can_dict is %s", len(can_dict))
     # Fallback selection: the `size` longest messages.
     simple_sum_list = [can_dict[key] for key in top_keys[:size]]
     if len(msgs) < 10:
         chosen = simple_sum_list
     else:
         max_sents = {}
         user_sents = {}
         for (can_txt, msg) in can_dict.items():
             if word_count(can_txt) > 3:
                 sents = list(self.sumr.nlp(can_txt).sents)
                 if not sents:
                     # max() would raise on an empty sentence list.
                     continue
                 longest = max(sents, key=len).text
                 max_sents[longest] = msg
                 user_sents[longest] = msg['user'] if 'user' in msg else u''
         txt_sum = list(
             self.sumr(u' '.join(max_sents.keys()), size, user_sents))
         self.logger.info("Canonical keys are \n%s",
                          u' '.join(can_dict.keys()))
         self.logger.info("Spacy summ %s", txt_sum)
         nlp_list = [
             max_sents[ss] for ss in txt_sum
             if len(ss) > 1 and ss in max_sents
         ]
         # Summary sentences that are not exact keys are matched to
         # messages by substring containment in either direction, never
         # growing the selection beyond `size` entries.
         for ss in txt_sum:
             if ss not in max_sents and len(ss.split()) > 5:
                 self.logger.info("Searching for: %s", ss)
                 for (ky, msg) in max_sents.items():
                     if len(nlp_list) >= size:
                         break
                     if ss in ky or (len(ky.split()) > 10 and ky in ss):
                         nlp_list.append(msg)
         if len(nlp_list) < 2:
             self.logger.info("Failed to find nlp summary using heuristic")
             chosen = simple_sum_list
         else:
             self.logger.info("First msg is %s, %s", nlp_list[0],
                              nlp_list[0]['ts'])
             self.logger.info("Sorted is %s", sorted(nlp_list, key=by_ts))
             chosen = nlp_list
     summ = header + u' ' + u'\n'.join(
         [self.tagged_sum(msg) for msg in sorted(chosen, key=by_ts)])
     self.logger.info("Summary for segment %s is %s", msgs, summ)
     return summ
示例#4
0
 def parify_text(self, msg_segment):
     """Join the texts of a message segment into one string.

     Each message's text (from ``get_msg_text``) has every match of
     ``TextRankTsSummarizer.flrg`` removed, and the cleaned texts are
     joined with ``u'. '``. The result is logged at debug level and
     returned.
     """
     cleaned = []
     for msg in msg_segment:
         raw_text = get_msg_text(msg)
         cleaned.append(TextRankTsSummarizer.flrg.sub(u'', raw_text))
     ptext = u'. '.join(cleaned)
     self.logger.debug("Parified text is %s", ptext)
     return ptext
示例#5
0
 def summarize(self, msgs, range_spec=None):
     """Return a summary string for a segment of messages.

     The result is a header followed by one ``self.tagged_sum`` line per
     selected message, ordered by the messages' ``'ts'`` values. With
     fewer than 10 messages the longest messages are used. Otherwise the
     longest sentence of each message is fed to ``self.sumr`` and the
     sentences it returns are mapped back to their messages, falling back
     to the longest messages when fewer than two can be matched.

     TODO: 1. Looks like spacy is not getting the main sentence from the message.
     2. Load times for the spacy summarizer won't cut it. Commenting out now
        until this can be fixed

     Args:
         msgs: messages to summarize; each must have a ``'ts'`` entry and
             is passed to ``get_msg_text`` and ``self.tagged_sum``.
         range_spec: optional mapping. ``'txt'`` is the header text
             (default ``u'Summary is'``), ``'size'`` the number of
             summary lines wanted (default 3), and ``'start'`` an
             optional start date in ``"%B %d %Y"`` form. When given, the
             whole mapping is also passed to ``tspec_to_delta`` to get
             the length of the time window the messages are limited to.

     Returns:
         The summary text, or a fixed "unable to form summary" string
         when ``msgs`` is empty or None.
     """
     from datetime import datetime

     spec = range_spec or {}
     size = spec.get('size', 3)
     if not msgs:
         self.logger.warning("No messages to form summary")
         return u"\n Unable to form summary here.\n"
     # .get() so a range spec without 'txt' uses the default header
     # instead of raising KeyError.
     header = spec.get('txt', u'Summary is')
     if range_spec:
         self.logger.info("First 10 messages  %s of %s", msgs[:10], len(msgs))
         self.logger.info("Using time range spec %s", range_spec)
         if 'start' in range_spec:
             # datetime.strptime rather than time.strptime: the result is
             # added to a delta and compared with ts_to_time() values,
             # which a struct_time does not support.
             start_time = datetime.strptime(range_spec['start'], "%B %d %Y")
         else:
             start_time = ts_to_time(min(msgs, key=lambda m: m['ts'])['ts'])
         self.logger.info("Start time is  %s", start_time)
         end_time = start_time + tspec_to_delta(**range_spec)
         self.logger.info("End time is  %s", end_time)
         msgs = [msg for msg in msgs if start_time <= ts_to_time(msg['ts']) <= end_time]
         self.logger.info("First 10 messages  %s of %s", msgs[:10], len(msgs))

     def word_count(text):
         return len(text.split())

     def by_ts(msg):
         return msg['ts']

     # Key messages by canonical text (identical canonical texts collapse
     # to one entry), ordered longest first by word count.
     can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
     top_keys = sorted(can_dict, key=word_count, reverse=True)
     can_dict = {key: can_dict[key] for key in top_keys}
     self.logger.info("Length of can_dict is %s", len(can_dict))
     # Fallback selection: the `size` longest messages.
     simple_sum_list = [can_dict[key] for key in top_keys[:size]]
     if len(msgs) < 10:
         chosen = simple_sum_list
     else:
         max_sents = {}
         user_sents = {}
         for (can_txt, msg) in can_dict.items():
             if word_count(can_txt) > 3:
                 sents = list(self.sumr.nlp(can_txt).sents)
                 if not sents:
                     # max() would raise on an empty sentence list.
                     continue
                 longest = max(sents, key=len).text
                 max_sents[longest] = msg
                 user_sents[longest] = msg['user'] if 'user' in msg else u''
         txt_sum = list(self.sumr(u' '.join(max_sents.keys()), size, user_sents))
         self.logger.info("Canonical keys are \n%s", u' '.join(can_dict.keys()))
         self.logger.info("Spacy summ %s", txt_sum)
         nlp_list = [max_sents[ss] for ss in txt_sum if len(ss) > 1 and ss in max_sents]
         # Summary sentences that are not exact keys are matched to
         # messages by substring containment in either direction, never
         # growing the selection beyond `size` entries.
         for ss in txt_sum:
             if ss not in max_sents and len(ss.split()) > 5:
                 self.logger.info("Searching for: %s", ss)
                 for (ky, msg) in max_sents.items():
                     if len(nlp_list) >= size:
                         break
                     if ss in ky or (len(ky.split()) > 10 and ky in ss):
                         nlp_list.append(msg)
         if len(nlp_list) < 2:
             self.logger.info("Failed to find nlp summary using heuristic")
             chosen = simple_sum_list
         else:
             self.logger.info("First msg is %s, %s", nlp_list[0], nlp_list[0]['ts'])
             self.logger.info("Sorted is %s", sorted(nlp_list, key=by_ts))
             chosen = nlp_list
     summ = header + u' ' + u'\n'.join([self.tagged_sum(msg) for msg in sorted(chosen, key=by_ts)])
     self.logger.info("Summary for segment %s is %s", msgs, summ)
     return summ