def handle(self, dictionary_id, output_folder, **options):
    if not dictionary_id:
        raise CommandError("Dictionary id is required.")
    try:
        dictionary_id = int(dictionary_id)
    except ValueError:
        raise CommandError("Dictionary id must be a number.")

    if not output_folder:
        raise CommandError("Output folder path is required.")

    num_pairs = options.get('num_pairs')
    #num_conditions = options.get('num_conditions')
    #num_stages = options.get('num_stages')
    experiment_name = options.get('experiment_name')

    # make sure the folder exists
    check_or_create_dir(output_folder)

    output_filename = "%s/user_accounts.log" % output_folder
    with open(output_filename, "w") as output:
        with transaction.atomic(savepoint=False):
            # create an experiment
            experiment = experiment_models.Experiment(
                name=experiment_name,
                saved_path_root=output_folder,
                dictionary_id=dictionary_id)
            experiment.save()
            experiment.initialize_controlled_experiment(
                #num_conditions=num_conditions,
                #num_stages=num_stages,
                num_pairs=num_pairs,
                output=output)
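# check_or_create_dir() is a project helper used throughout this section but
# not defined here. This is a sketch of the behavior its call sites assume
# (ensure a directory exists before writing into it), not the repo's actual
# implementation.
import os

def check_or_create_dir(path):
    # Create the directory (including parents) if it does not already exist.
    if not os.path.exists(path):
        os.makedirs(path)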
def handle(self, dictionary_id, output_folder, **options):
    if not dictionary_id:
        raise CommandError("Dictionary id is required.")
    try:
        dictionary_id = int(dictionary_id)
    except ValueError:
        raise CommandError("Dictionary id must be a number.")

    if not output_folder:
        raise CommandError("Output folder path is required.")

    num_pairs = options.get("num_pairs")
    num_conditions = options.get("num_conditions")
    num_stages = options.get("num_stages")
    experiment_name = options.get("experiment_name")

    # make sure the folder exists
    check_or_create_dir(output_folder)

    output_filename = "%s/user_accounts.log" % output_folder
    with open(output_filename, "w") as output:
        # create an experiment
        experiment = experiment_models.Experiment(
            name=experiment_name,
            dictionary_id=dictionary_id)
        experiment.save()
        experiment.initialize_experiment(
            num_conditions=num_conditions,
            num_stages=num_stages,
            num_pairs=num_pairs,
            output=output)
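# A minimal usage sketch for a management command whose handle() looks like
# the variants above. The command name "setup_experiment" and the argument
# values are assumptions for illustration, not names taken from this repo.
from django.core.management import call_command

call_command(
    "setup_experiment",      # hypothetical command name
    "3",                     # dictionary_id; handle() validates and int()s it
    "/tmp/experiment_out",   # output_folder; created by check_or_create_dir
    num_pairs=5,
    experiment_name="pilot_run",
)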
def handle(self, dataset_id, save_path, **options):
    action = options.get('action')
    tweet_parser_path = options.get('tweet_parser_path')

    if not dataset_id:
        raise CommandError("Dataset id is required.")
    try:
        dataset_id = int(dataset_id)
    except ValueError:
        raise CommandError("Dataset id must be a number.")

    if not save_path:
        raise CommandError("File save path is required.")
    check_or_create_dir(save_path)

    if action == 'all' or action == 'dump':
        from msgvis.apps.enhance.tasks import dump_tweets
        print "Dumping messages..."
        dump_tweets(dataset_id, save_path)

    if action == 'all' or action == 'parse':
        from msgvis.apps.enhance.tasks import parse_tweets
        output_path = "%s/parsed_tweets" % save_path
        check_or_create_dir(output_path)
        print "\n=========="
        print "Parsing messages..."
        parse_tweets(tweet_parser_path, save_path, output_path)

    if action == 'all' or action == 'lemmatize':
        from msgvis.apps.enhance.tasks import lemmatize_tweets
        input_path = "%s/parsed_tweets" % save_path
        output_path = "%s/converted_tweets" % save_path
        check_or_create_dir(output_path)
        print "\n=========="
        print "Lemmatizing messages..."
        lemmatize_tweets(input_path, output_path)
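# A minimal usage sketch for the tweet-processing command above. The command
# name "prepare_tweets" and both paths are assumptions for illustration. With
# action='all', the command dumps raw messages under save_path, parses them
# into save_path/parsed_tweets, then lemmatizes into save_path/converted_tweets.
from django.core.management import call_command

call_command(
    "prepare_tweets",                       # hypothetical command name
    "1",                                    # dataset_id
    "/data/tweets",                         # save_path
    action="all",
    tweet_parser_path="/opt/tweet_parser",  # hypothetical parser location
)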
def process_stage(self, use_tfidf=False):
    experiment = self.assignment.experiment
    dictionary = experiment.dictionary

    for source in self.assignment.pair.users.all():
        sources = ["system", source]
        features = list(dictionary.get_feature_list(sources))
        messages = self.selected_messages.all()
        master_messages = dictionary.dataset.get_master_message_set().all()

        # Map each feature's dictionary index to its column in the
        # feature vectors fed to the classifier.
        feature_index_map = {}
        for idx, feature in enumerate(features):
            feature_index_map[feature.index] = idx

        model_save_path = "%s/%s_stage%d/" % (
            experiment.saved_path_root, source.username, self.order)
        check_or_create_dir(model_save_path)

        X, y, code_map_inverse = coding_utils.get_formatted_data(
            dictionary=dictionary,
            source=source,
            messages=messages,
            feature_index_map=feature_index_map,
            feature_num=len(features),
            use_tfidf=use_tfidf,
            master_messages=master_messages)

        lin_clf = coding_utils.train_model(X, y, model_save_path=model_save_path)

        svm_model = SVMModel(source=source,
                             source_stage=self,
                             saved_path=model_save_path)
        svm_model.save()

        # Persist one weight per (code, feature) pair. Binary classifiers
        # store a single coefficient row; multiclass ones store one row
        # per code.
        weights = []
        for code_index, code_id in code_map_inverse.iteritems():
            for feature_index, feature in enumerate(features):
                if lin_clf.coef_.shape[0] == 1:
                    weight = lin_clf.coef_[0][feature_index]
                else:
                    weight = lin_clf.coef_[code_index][feature_index]
                model_weight = SVMModelWeight(svm_model=svm_model,
                                              code_id=code_id,
                                              feature=feature,
                                              weight=weight)
                weights.append(model_weight)
        SVMModelWeight.objects.bulk_create(weights)

        try:
            next_stage = self.get_next_stage()
        except IndexError:
            # This is the last stage; nothing left to pre-label.
            pass
        else:
            # Pre-label the next stage's messages with this stage's model.
            next_message_set = next_stage.message_set.messages.all()
            code_assignments = []
            next_X = coding_utils.get_formatted_X(
                messages=next_message_set,
                dictionary=dictionary,
                source=source,
                feature_index_map=feature_index_map,
                feature_num=len(features),
                use_tfidf=use_tfidf)
            predict_y, prob = coding_utils.get_prediction(lin_clf, next_X)
            for idx, message in enumerate(next_message_set):
                code_index = predict_y[idx]
                code_id = code_map_inverse[code_index]
                if lin_clf.coef_.shape[0] == 1:
                    # Binary case: prob holds the positive-class score only.
                    if code_index == 1:
                        probability = prob[idx]
                    else:
                        probability = 1 - prob[idx]
                else:
                    probability = prob[idx, code_index]
                code_assignment = coding_models.CodeAssignment(
                    message=message,
                    source=source,
                    code_id=code_id,
                    is_user_labeled=False,
                    probability=probability)
                code_assignments.append(code_assignment)
            coding_models.CodeAssignment.objects.bulk_create(code_assignments)
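# Why process_stage() branches on lin_clf.coef_.shape[0]: scikit-learn's
# linear SVMs store a single coefficient row for binary problems and one row
# per class for multiclass ones. A standalone sketch on toy data, assuming
# coding_utils.train_model() wraps something LinearSVC-like:
import numpy as np
from sklearn.svm import LinearSVC

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])

binary_clf = LinearSVC().fit(X, [0, 1, 1, 0])
assert binary_clf.coef_.shape == (1, 2)  # one row: weights for the positive class

multi_clf = LinearSVC().fit(X, [0, 1, 2, 0])
assert multi_clf.coef_.shape == (3, 2)   # one row per class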