def process_models(models):
    """Diff every ordered pair of unequal models and log the summed 'content'.

    Each entry of ``models`` is compared against every entry for which ``!=``
    holds, so a pair is visited once in each direction.

    :param models: container of models; it is iterated repeatedly, so it must
        be re-iterable.
    :return: list of the values returned by
        ``mc_difference_logic.diff_markov_chains``, in visiting order.
    """
    differences = [
        mc_difference_logic.diff_markov_chains(left, right)
        for left in models
        for right in models
        if left != right
    ]
    # Log the sum of the 'content' fields of all collected differences.
    total_content = sum(diff['content'] for diff in differences)
    log.info(total_content)
    return differences
def big_differences():
    """Fold not-yet-loaded users into a 'main' model and plot how each differs from it.

    Steps, in order:

    1. Ask ``main_db`` for users that are not loaded yet and fetch each one
       through ``engine.scrap``; users for which nothing comes back are skipped.
    2. Mark each fetched user as loaded and feed it into the shared 'main'
       model via ``create_model(..., mc=...)``.
    3. For each fetched user, diff the 'main' model against that user's own
       model and collect a point (name, difference content, timeline count).
    4. Hand the points to ``vis.visualise`` together with one reference point
       built from the 'main' model diffed against itself, then visualise the
       'main' model.

    :return: None; the function works purely through side effects.
    """
    log.info('extract messages')
    pending_users = main_db.get_not_loaded_users()
    main_model = markov_chain('main', booster)
    log.info('---------users to find is %s-------------------------------' % len(pending_users))

    scraped_users = []
    for pending in pending_users:
        log.info('load user %s' % pending)
        scraped = engine.scrap(pending, neighbourhood=0)
        if scraped:
            main_db.set_user_loaded(pending)
            main_model = create_model(scraped, mc=main_model)
            # Second call without ``mc``; its return value is discarded.
            # NOTE(review): presumably this stores a per-user model that
            # ``markov_chain.create`` picks up below -- confirm.
            create_model(scraped)
            scraped_users.append(scraped)

    log.info('---------start process differences of models--------------')
    points = []
    for scraped in scraped_users:
        user_model = markov_chain.create(scraped.name_, booster)
        difference = diff_markov_chains(main_model, user_model)['content']
        points.append({
            'name': scraped.name_,
            'x': difference,
            'y': scraped.timeline_count,
        })
        log.info('create difference... %s' % difference)

    # Reference point: the main model diffed against itself, plotted at its
    # edges-per-node ratio.
    self_difference = diff_markov_chains(main_model, main_model)
    node_count, edge_count = main_model.get_unique_nodes_edges()
    reference_points = [
        {'x': self_difference['content'], 'y': float(edge_count) / node_count},
    ]

    vis.visualise(
        points,
        header='diff and tweets count',
        x_title='difference between this and main',
        y_title='count tweets',
        spec_symbols=reference_points,
    )
    main_model.visualise(100)
def clust(models):
    """Merge each model with its highest-scoring peer, then recurse on the merged models.

    For every model the peer with the largest ``diff_markov_chains`` value is
    selected (the first one wins on ties), printed, and summed with the model
    through ``booster.sum_models``; the time spent per model is printed too.

    :param models: sized, re-iterable collection of models exposing
        ``model_id_`` and ``print_me()``.
    :return: a list copy of ``models`` when it holds fewer than two entries;
        otherwise the result of clustering the merged models.
    :raises ValueError: if a model has no peer that compares unequal to it
        (``max`` receives an empty sequence).

    NOTE(review): every pass produces exactly as many merged models as it was
    given, so for two or more models no stopping condition is visible in this
    function -- confirm the intended termination rule.
    """
    # Nothing can be paired below two models. Without this guard an empty
    # input recursed without end and a single model made ``max`` raise on an
    # empty sequence.
    if len(models) < 2:
        return list(models)

    merged = []
    for mc in models:
        started = time.time()
        candidates = [
            (el, diff_markov_chains(mc.model_id_, el.model_id_, booster))
            for el in models
            if el != mc
        ]
        neighbour, score = max(candidates, key=lambda pair: pair[1])
        neighbour.print_me()
        # Single-argument print() emits the same text under Python 2 and 3.
        print(score)
        new_mc_id = booster.sum_models(mc.model_id_, neighbour.model_id_)
        merged.append(markov_chain(new_mc_id, booster))
        elapsed = time.time() - started
        # Same output as the former ``print 'time: ', elapsed`` statement.
        print('%s %s' % ('time: ', elapsed))
    return clust(merged)
timeline2 = tools.flush(user2.timeline, by_what=lambda x: tp.get_words(x['text'], is_normalise=True))[:10] print len(timeline1) print len(timeline2) mc1 = markov_chain_machine.create_model(timeline1, user_name1, boost) mc2 = markov_chain_machine.create_model(timeline2, user_name2, boost) return mc1, mc2


def form_timeline(user_timeline):
    """Pass a user's timeline through ``tools.flush``, keyed on each entry's normalised words.

    :param user_timeline: entries subscriptable by ``'text'``; each entry's
        text is handed to ``tp.get_words(..., is_normalise=True)``.
    :return: whatever ``tools.flush`` returns for that key function.
    """
    true_timeline = tools.flush(user_timeline, by_what=lambda x: tp.get_words(x['text'], is_normalise=True))
    return true_timeline


if __name__ == '__main__':
    # Disabled experiments kept from earlier manual runs:
    # models = get_users_data('navalny', 'MedvedevRussia')
    # print diff_markov_chains(models[0], models[1])
    # engine.get_relations_of_user('navalny')
    # user = engine.get_user_info('GoogleRussia')
    # db.save_user(user.serialise())
    # user = db.get_user({'name_':'@GoogleRussia'})
    # # print len(user.timeline)
    # print user.timeline_count

    # Manual run: load one stored user, build a model from the user's
    # timeline and print it.
    user = db.get_user({'name_': '@GoogleRussia'})
    time_line = form_timeline(user.timeline)
    mc = markov_chain_machine.create_model(time_line,user.name_,boost)
    mc.print_me()
    # Diff the model against itself; the result is discarded.
    diff_markov_chains(mc,mc)
def test_difference_logic(markov_chain_l, markov_chain_r):
    """Diff two Markov chains and log the resulting difference element.

    :param markov_chain_l: first argument passed to ``diff_markov_chains``.
    :param markov_chain_r: second argument passed to ``diff_markov_chains``.
    :return: None; the difference is only written to the log.
    """
    message = 'difference element is: %s' % diff_markov_chains(markov_chain_l, markov_chain_r)
    log.info(message)