示例#1
0
def get_kw_correl(w1, w2):
    """Return a symmetric correlation value for the keywords ``w1`` and ``w2``.

    The pair value is fetched through ``get_and_update(4, ...)`` for both
    orderings of the pair and summed. The sum is divided by
    ``small + sqrt(a * b)``, where ``a`` and ``b`` are the per-word values
    fetched through ``get_and_update(1, g.ww, ...)``.
    """
    # Look-ups are performed in the same order as they are combined below;
    # get_and_update may have side effects, so the order is kept fixed.
    forward = get_and_update(4, None, (w1, w2), g.ts)
    backward = get_and_update(4, None, (w2, w1), g.ts)
    first_value = get_and_update(1, g.ww, w1, g.ts)
    second_value = get_and_update(1, g.ww, w2, g.ts)

    # `small` is presumably a tiny constant guarding against a zero
    # denominator -- confirm at its definition.
    return (forward + backward) / (small + sqrt(first_value * second_value))
示例#2
0
def get_kw_correl(w1, w2):
    """Compute an order-independent correlation value for two keywords.

    Sums the values returned by ``get_and_update(4, ...)`` for ``(w1, w2)``
    and ``(w2, w1)``, then normalises by ``small`` plus the square root of
    the product of the two per-word values read via
    ``get_and_update(1, g.ww, ...)``.
    """
    # Keep the call sequence (pair, reversed pair, w1, w2) unchanged:
    # get_and_update may update state as it reads.
    pair_total = get_and_update(4, None, (w1, w2), g.ts)
    pair_total = pair_total + get_and_update(4, None, (w2, w1), g.ts)

    value_w1 = get_and_update(1, g.ww, w1, g.ts)
    value_w2 = get_and_update(1, g.ww, w2, g.ts)
    normaliser = small + sqrt(value_w1 * value_w2)

    return pair_total / normaliser
示例#3
0
def get_correl(words, word2, case):
    """Sum the log-correlation between ``word2`` and every word in ``words``.

    For each word an ordered pair is built with ``word2``: ``(word, word2)``
    when ``case == 1``, otherwise ``(word2, word)``. The term added to the
    total is ``log(small + pair / (small + sqrt(first * second)))``, where
    ``pair`` comes from ``get_and_update(4, ...)`` and ``first`` / ``second``
    come from ``get_and_update(1, g.ww, ...)`` for the two pair members.

    Returns:
        1 when ``words`` is empty, otherwise the accumulated sum.

    Raises:
        Whatever the scoring of a pair raises; the offending pair is printed
        first and the original exception is re-raised unchanged.
    """
    # TODO: do average here
    # (it doesn't matter in my usecase)
    if not words:
        return 1

    correl = 0
    for word1 in words:
        word_pair = (word1, word2) if case == 1 else (word2, word1)
        try:
            pair_value = get_and_update(4, None, word_pair, g.ts)
            normaliser = small + sqrt(
                get_and_update(1, g.ww, word_pair[0], g.ts)
                * get_and_update(1, g.ww, word_pair[1], g.ts)
            )
            correl += log(small + pair_value / normaliser)
        except Exception:
            # Show which pair failed, then re-raise with the original
            # traceback intact (a bare `raise` keeps it; `raise e` does not
            # under Python 2).
            print(word_pair)
            raise

    # The accumulated value was previously dropped, so callers received None
    # for any non-empty input.
    return correl
示例#4
0
def get_correl(words, word2, case):
    """Accumulate log-correlation terms between ``word2`` and ``words``.

    Each word in ``words`` is paired with ``word2`` -- as ``(word, word2)``
    if ``case == 1`` and as ``(word2, word)`` otherwise -- and contributes
    ``log(small + pair / (small + sqrt(first * second)))`` to the total.
    ``pair`` is read with ``get_and_update(4, ...)``; ``first`` and
    ``second`` are read with ``get_and_update(1, g.ww, ...)`` for the two
    members of the pair.

    Returns:
        1 for an empty ``words``; the summed terms otherwise.

    Raises:
        Any exception raised while scoring a pair. The pair is printed
        before the exception propagates.
    """
    # TODO: do average here
    # (it doesn't matter in my usecase)
    if not words:
        return 1

    correl = 0
    for word1 in words:
        if case == 1:
            word_pair = (word1, word2)
        else:
            word_pair = (word2, word1)
        try:
            numerator = get_and_update(4, None, word_pair, g.ts)
            denominator = small + sqrt(
                get_and_update(1, g.ww, word_pair[0], g.ts)
                * get_and_update(1, g.ww, word_pair[1], g.ts)
            )
            correl += log(small + numerator / denominator)
        except Exception:
            # Print the failing pair for debugging; bare `raise` preserves
            # the original traceback.
            print(word_pair)
            raise

    # Return the running total; without this the function fell off the end
    # and yielded None for non-empty input.
    return correl
示例#5
0
def build_summary(starting_summary, parent_keywords=None):
    """Grow a summary outward from ``starting_summary`` by best-first search.

    A priority queue holds partial summaries keyed by negated score. On each
    step the best partial summary is popped and extended by one token, either
    at the end (successors of its last bigram in ``g.ng``) or at the front
    (predecessors of its first bigram in ``g.ing``). Each extension is scored
    with ``get_score`` and the five best are kept.

    Args:
        starting_summary: seed token sequence; must contain at least two
            tokens because the first and last bigrams are read.
        parent_keywords: optional extra keywords passed to ``get_score``
            together with the seed tokens.

    Returns:
        The first candidate that starts with ``'_S'``, ends with ``'_E'`` and
        has more than 8 tokens, or ``None`` if the queue runs empty first.
        ``'_S'`` / ``'_E'`` are presumably start/end sentinels -- confirm
        where the graph is built.

    Side effects:
        Increments ``g.penalty[w]`` for every token of each bounded candidate
        (both the returned one and those rejected as too short) and prints
        the score of the returned summary.
    """
    # Avoid a mutable default argument shared across calls.
    if parent_keywords is None:
        parent_keywords = []

    partial_summary = (
        list(starting_summary),  # summary until this point
        0,                       # score until this point
        list(starting_summary)   # starting summary
    )

    partial_summaries = []
    heappush(partial_summaries, (-partial_summary[1], partial_summary))

    while partial_summaries:

        # generate potential summaries
        # expand them and keep the best ones
        _, (summary, summary_score, keywords) = heappop(partial_summaries)

        # add words to summary
        # Snapshot the candidate bigrams: the scoring calls below may update
        # the graph while we iterate.
        add_forward = list(g.ng.get((summary[-2], summary[-1]), {}).keys())
        add_reverse = list(g.ing.get((summary[0], summary[1]), {}).keys())

        # compute score for possible next moves
        # also updates ng
        previous_bigram = tuple(summary[-2:])
        next_options = [
            (
                summary + list(bigr)[1:],
                get_score(
                    summary,
                    bigr,
                    get_and_update(3, None, previous_bigram + (bigr[1],), g.ts),
                    previous_bigram,
                    1,
                    list(keywords) + list(parent_keywords)
                ),
                bigr[1]
            )
            for bigr in add_forward
        ]

        # repeat for reverse links (not elegant, refactor?)
        previous_bigram = tuple(summary[:2])
        next_options += [
            (
                list(bigr)[:1] + summary,
                get_score(
                    summary,
                    bigr,
                    get_and_update(3, None, (bigr[0],) + previous_bigram, g.ts),
                    previous_bigram,
                    2,
                    list(keywords) + list(parent_keywords)
                ),
                bigr[0]
            )
            for bigr in add_reverse
        ]

        next_options = nlargest(5, next_options, key=lambda x: x[1])

        for option in next_options:
            candidate = option[0]
            score = summary_score + option[1]
            is_bounded = candidate[-1] == '_E' and candidate[0] == '_S'

            if is_bounded and len(candidate) > 8:
                # this summary looks good, we keep it
                # update penalties
                for w in candidate:
                    g.penalty[w] += 1
                print(score)
                return candidate

            if is_bounded:
                # this summary is too short, discard it
                for w in candidate:
                    g.penalty[w] += 1
            else:
                # heappush (not list.append) keeps the heap invariant, so
                # heappop really returns the best-scoring partial summary.
                heappush(
                    partial_summaries,
                    (-score, (candidate, score, keywords))
                )

    # no summary could be built
    return None
示例#6
0
        if s > max_count:
            max_count = s
            max_ind = i
    if max_ind not in event_cut_points:
        event_cut_points[max_ind] = []
    event_cut_points[max_ind].append(key)

# Initialise the graph from the first batch before entering the loop.
initialize(tweets[:batch_size])

# Walk over the full batches only (1-based batch index); a trailing partial
# batch is not visited. batch_size is used as a slice bound, so the floor
# division matches the original integer division.
for i in range(1, len(tweets) // batch_size + 1):
    chunk_tweets = tweets[(i - 1) * batch_size:i * batch_size]

    if i != 1:
        add_tweets_to_graph(chunk_tweets)
    else:
        # The first batch goes through initialize() again.
        initialize(chunk_tweets)

    # Nothing to summarise unless an event was assigned to this batch.
    if i not in event_cut_points:
        continue

    # Prune the graph, then touch every g.nw entry through get_and_update
    # before summarising. Iterate over a snapshot of the items.
    g.prune(g.ts)
    for item in list(g.nw.items()):
        get_and_update(2, g.nw, item[0], g.ts)

    for kw_id in event_cut_points[i]:
        keyword_set = event_keywords[kw_id]
        print(keyword_set)

        # Time the summarisation call and report the elapsed seconds.
        started = time.time()
        summarize_keywords(keyword_set, n=1)
        runtime = time.time() - started

        print('(%s)' % runtime)
        print('----------------------------')