def test_generate_model():
    """Smoke-test markov-chain generation from a sample of raw tweets.

    Reads up to 100 messages from a hard-coded dump file, feeds each
    message's words into a fresh ``markov_chain`` named 'test', and logs
    the growth of the chain's word/relation counts after every message.

    NOTE(review): assumes ``twitter_timeline_parser``, ``db_mc_handler``,
    ``markov_chain``, ``split_to_words`` and ``log`` are in scope at
    module level; reads from a hard-coded Windows path.
    """
    messages = twitter_timeline_parser.extract_messages(
        "c:/temp/tweets2009-12.txt", limit=100)
    log.info('\n'.join([message['words'] for message in messages]))
    log.info(len(messages))
    booster = db_mc_handler()
    mc_l = markov_chain('test', booster)
    # enumerate() instead of range(len(...)): same order, clearer intent.
    for i, message in enumerate(messages):
        mc_l.add_message(split_to_words(message['words']))
        log.info('appending %s words: %s relations: %s'
                 % (i, mc_l.words_count_, mc_l.relations_count_))
    # Separator marking the end of the run; original indentation was lost,
    # so placement after the loop is an assumption — confirm against VCS.
    log.info('................................')
def test_model_():
    """Build two tiny 2-gram markov chains on a truncated store, persist
    both, then dump their contents for manual inspection."""
    store = db_mc_handler(truncate=True)
    left = markov_chain('left_test', store, n_of_gram_=2)
    right = markov_chain('right_test', store, n_of_gram_=2)
    # Feed each chain its fixture messages (data-driven instead of a
    # flat run of add_message calls).
    left_fixture = [
        ['a', 'b', 'c', 'd'],
        ['a', 'b', 'c', 'd'],
        ['a1', 'b1', 'c1', 'd1'],
        ['a', 'a1', 'b', 'b1', 'c', 'c1', 'd', 'd1'],
    ]
    right_fixture = [
        ['a', 'b', 'c', 'd'],
        ['a2', 'b2', 'c2', 'd2'],
    ]
    for words in left_fixture:
        left.add_message(words)
    for words in right_fixture:
        right.add_message(words)
    # Same side-effect order as before: both saves, then both dumps.
    for chain in (left, right):
        chain.save()
    for chain in (left, right):
        chain.print_me()
import visualise.vis2d_machine as vis

__author__ = '4ikist'
# English translation of the (Russian) module doc below:
#   Experiment 1.
#   1) Build a common model for a certain class of people.
#   2) Compute people's weights from how well a given person's timeline
#      belongs to the common model.
#   3) Cluster people based on the difference.
__doc__ = """
Эксперимент 1.
1) Создание общей модели определенного класса людей.
2) Нахождение весов людей на основе принадлежности ленты определенного человека к общей модели.
3) Класстеризация на людей на основе разницы
"""

# NOTE(review): loggers, db_handler, engines, db_mc_handler and tools are
# used below but their imports are not visible in this chunk — confirm
# they are imported elsewhere in the file.
log = loggers.logger
main_db = db_handler()
engine = engines.tweepy_engine(out=main_db)
booster = db_mc_handler()


def get_users(filename):
    """
    forming users some from db or scrapping from ttr
    """
    # Collect users listed one-per-line in *filename*; each name is
    # normalised (leading '@' handling via tools.imply_dog) and looked up
    # in the main database.
    result = []
    users = open(filename).readlines()
    for user in users:
        name_ = tools.imply_dog(user, with_dog=True).strip()
        log.info('find user by name "%s"' % name_)
        m_user = main_db.get_user({'name_': name_})
        if m_user:
            log.info('user found %s' % m_user.name_)
            result.append(m_user)
    # NOTE(review): chunk appears truncated here — no `return result` or
    # not-found branch is visible; the rest of the function presumably
    # follows in the original file.
from analysing_data import markov_chain_machine from analysing_data.markov_chain_machine import markov_chain import text_proc.text_processing as tp from analysing_data.booster import db_mc_handler from model.db import db_handler from search_engine.twitter_engine import tweepy_engine from analysing_data.mc_difference_logic import diff_markov_chains import tools __author__ = '4ikist' db = db_handler(host_='localhost', port_=27017, db_name_='ttr_exp') boost = db_mc_handler() engine = tweepy_engine(out=db) def get_users_data(user_name1, user_name2): user1 = engine.get_user_info(user_name1) user2 = engine.get_user_info(user_name2) db.save_user(user1.serialise()) db.save_user(user2.serialise()) timeline1 = tools.flush(user1.timeline, by_what=lambda x: tp.get_words(x['text'], is_normalise=True))[:10] timeline2 = tools.flush(user2.timeline, by_what=lambda x: tp.get_words(x['text'], is_normalise=True))[:10] print len(timeline1) print len(timeline2) mc1 = markov_chain_machine.create_model(timeline1, user_name1, boost) mc2 = markov_chain_machine.create_model(timeline2, user_name2, boost) return mc1, mc2
from analysing_data.markov_chain_machine import markov_chain
import loggers
from model.db import db_handler
from search_engine import twitter_engine
from search_engine.twitter_engine import tweepy_engine
import tools
from visualise import vis_machine

__author__ = '4ikist'

db_ = db_handler(truncate=False)
api_engine = twitter_engine.tweepy_engine(out=db_)
# NOTE(review): db_mc_handler has no visible import in this chunk —
# presumably `from analysing_data.booster import db_mc_handler` exists
# elsewhere in the file; confirm.
booster = db_mc_handler(truncate=False)
vis_processor = vis_machine
log = loggers.logger


def model_splitter(message):
    # Tokenise a raw message on whitespace and return the token list.
    message_ = message.split()
    return message_


def process_names(file_name, class_name):
    """
    get from file ser names, scrapping saving and forming markov chains for any user timeline
    """
    # Read one user name per line from *file_name*.
    names = open(file_name).readlines()
    result = []
    # NOTE(review): chunk ends here — the loop that scrapes, saves and
    # builds a chain per name (and the use of *class_name*) is not
    # visible in this view.