def get_kw_correl(w1, w2):
    # NOTE(review): this definition is immediately shadowed by a byte-identical
    # redefinition of get_kw_correl just below; only the later one takes
    # effect. One of the two should be deleted.
    """Return the correlation between keywords w1 and w2.

    Sums the pair statistic (get_and_update mode 4) for both orderings of
    the pair, then normalizes by sqrt of the two per-word statistics
    (mode 1 over g.ww) -- presumably a cosine-style co-occurrence score;
    the exact semantics of the modes are defined in get_and_update.
    `small` guards against division by zero.
    """
    cm = get_and_update(4, None, (w1, w2), g.ts) + get_and_update(4, None, (w2, w1), g.ts)
    correl = cm / (small + sqrt(
        get_and_update(1, g.ww, w1, g.ts) *
        get_and_update(1, g.ww, w2, g.ts)))
    return correl
def get_kw_correl(w1, w2):
    """Return the correlation between keywords w1 and w2.

    The numerator is the pair statistic (get_and_update mode 4) taken in
    both orderings; the denominator normalizes by the geometric mean of
    the per-word statistics (mode 1 over g.ww), with `small` added to
    avoid division by zero.
    """
    pair_count = get_and_update(4, None, (w1, w2), g.ts)
    pair_count += get_and_update(4, None, (w2, w1), g.ts)
    word_norm = sqrt(
        get_and_update(1, g.ww, w1, g.ts)
        * get_and_update(1, g.ww, w2, g.ts)
    )
    return pair_count / (small + word_norm)
def get_correl(words, word2, case):
    """Return the summed log-correlation between word2 and each word in words.

    For each word1 in words, forms the pair (word1, word2) when case == 1,
    otherwise (word2, word1), and accumulates
    log(small + pair_stat / (small + sqrt(stat(w_a) * stat(w_b)))),
    where the statistics come from get_and_update (modes 4 and 1).

    Returns 1 when words is empty.
    """
    #TODO: do average here
    #(it doesn't matter in my usecase)
    if not words:
        return 1
    correl = 0
    # Iterate the words directly instead of indexing via range(len(...)).
    for word1 in words:
        word_pair = (word1, word2) if case == 1 else (word2, word1)
        try:
            correl += log(
                small + (
                    get_and_update(4, None, word_pair, g.ts) /
                    (small + sqrt(
                        get_and_update(1, g.ww, word_pair[0], g.ts) *
                        get_and_update(1, g.ww, word_pair[1], g.ts)))
                )
            )
        except Exception as e:  # `as` form: valid Python 2.6+ and 3.x
            print (word_pair)
            raise  # bare raise preserves the original traceback
    # BUG FIX: the original fell off the end here and implicitly returned
    # None; return the accumulated score instead.
    return correl
def build_summary(starting_summary, parent_keywords=None):
    """Best-first search for a complete summary sentence.

    Repeatedly pops the highest-scoring partial summary from a heap and
    extends it one word at a time in both directions, using the forward
    bigram graph (g.ng) and the reverse graph (g.ing). At each step only
    the 5 best-scoring extensions survive.

    Returns the first summary that is bounded by '_S' ... '_E' and longer
    than 8 tokens (also incrementing g.penalty for each of its words), or
    None when the search space is exhausted.

    NOTE: assumes len(starting_summary) >= 2 (summary[-2] / summary[1]
    are read unconditionally) -- TODO confirm for all callers.
    """
    # Avoid a mutable default argument; None stands in for "no keywords".
    if parent_keywords is None:
        parent_keywords = []
    partial_summary = (
        list(starting_summary),  # summary until this point
        0,                       # score until this point
        list(starting_summary)   # starting summary
    )
    partial_summaries = []
    heappush(partial_summaries, (-partial_summary[1], partial_summary))
    while partial_summaries:
        # generate potential summaries: expand the current best partial
        # summary and keep the best ones
        _, (summary, summary_score, keywords) = heappop(partial_summaries)

        # candidate continuation words in both directions
        add_forward = g.ng.get((summary[-2], summary[-1]), {}).keys()
        add_reverse = g.ing.get((summary[0], summary[1]), {}).keys()

        # compute score for possible next moves (also updates ng)
        previous_bigram = tuple(summary[-2:])
        next_options = [
            (
                summary + list(bigr)[1:],
                get_score(
                    summary, bigr,
                    get_and_update(3, None, previous_bigram + (bigr[1],), g.ts),
                    previous_bigram, 1,
                    list(keywords) + list(parent_keywords)
                ),
                bigr[1]
            )
            for bigr in add_forward
        ]

        # repeat for reverse links (not elegant, refactor?)
        previous_bigram = tuple(summary[:2])
        next_options += [
            (
                list(bigr)[:1] + summary,
                get_score(
                    summary, bigr,
                    get_and_update(3, None, (bigr[0],) + previous_bigram, g.ts),
                    previous_bigram, 2,
                    list(keywords) + list(parent_keywords)
                ),
                bigr[0]
            )
            for bigr in add_reverse
        ]

        next_options = nlargest(5, next_options, key=lambda x: x[1])
        # renamed loop variable from `next` (shadowed the builtin)
        for option in next_options:
            new_summary, score = option[0], summary_score + option[1]
            if new_summary[-1] == '_E' and new_summary[0] == '_S' and len(new_summary) > 8:
                # this summary looks good, we keep it; update penalties
                for w in new_summary:
                    g.penalty[w] += 1
                print (score)
                return new_summary
            elif new_summary[-1] == '_E' and new_summary[0] == '_S':
                # this summary is too short, discard it (still penalize its
                # words so they are disfavored in later summaries)
                for w in new_summary:
                    g.penalty[w] += 1
            else:
                # BUG FIX: the original used partial_summaries.append(...),
                # which violates the heap invariant that heappop() above
                # relies on; push through heapq instead.
                heappush(partial_summaries, (-score, (new_summary, score, keywords)))
    # no summary could be built
    return None
# NOTE(review): this chunk starts mid-scope -- `s`, `max_count`, `max_ind`,
# `key` and the loop enclosing the lines below are defined above this view.
    if s > max_count:
        max_count = s
        max_ind = i
    # record that event `key` should be summarized at batch index max_ind
    if max_ind not in event_cut_points:
        event_cut_points[max_ind] = []
    event_cut_points[max_ind].append(key)

# Driver: stream tweets through the graph in fixed-size batches and run
# keyword summarization at every pre-computed event cut point.
# NOTE(review): the first batch appears to be initialized twice -- once here
# and again via the `i == 1` branch inside the loop; confirm this is intended.
initialize(tweets[:batch_size])
# Python 2 integer division: len(tweets) / batch_size truncates, so a
# trailing partial batch is never processed.
for i in range(1, len(tweets) / batch_size + 1):
    chunk_tweets = tweets[(i-1) * batch_size: i * batch_size]
    if i == 1:
        initialize(chunk_tweets)
    else:
        add_tweets_to_graph(chunk_tweets)
    if i in event_cut_points:
        # prune stale graph state as of the current timestamp before scoring
        g.prune(g.ts)
        # refresh every entry of g.nw (presumably node weights) via
        # get_and_update mode 2 -- semantics defined elsewhere in the file
        for item in g.nw.items():
            get_and_update(2, g.nw, item[0], g.ts)
        # summarize each event whose cut point falls on this batch,
        # timing each run for the log output
        for kw_id in event_cut_points[i]:
            keyword_set = event_keywords[kw_id]
            print keyword_set
            runtime = time.time()
            summarize_keywords(keyword_set, n=1)
            runtime = time.time() - runtime
            print '(%s)' % runtime
            print '----------------------------'