def handle(self, dictionary_id, output_folder, **options):
        """Create an experiment and initialize it as a controlled experiment.

        Validates the two positional arguments, makes sure ``output_folder``
        exists, then saves a new ``Experiment`` and calls its
        ``initialize_controlled_experiment`` inside a single atomic
        transaction, passing the opened ``user_accounts.log`` file (located
        in ``output_folder``) as ``output``.

        :param dictionary_id: id of the dictionary; must be convertible
            with ``int()``.
        :param output_folder: directory that receives the log file and is
            stored as the experiment's ``saved_path_root``.
        :param options: ``num_pairs`` and ``experiment_name`` are read.
        :raises CommandError: if an argument is missing or the dictionary
            id is not a number.
        """
        if not dictionary_id:
            raise CommandError("Dictionary id is required.")

        try:
            dictionary_pk = int(dictionary_id)
        except ValueError:
            raise CommandError("Dictionary id must be a number.")

        if not output_folder:
            raise CommandError("Output folder path is required.")

        # num_conditions / num_stages are intentionally not forwarded here.
        experiment_name = options.get('experiment_name')
        num_pairs = options.get('num_pairs')

        # The folder has to exist before the log file can be opened in it.
        check_or_create_dir(output_folder)
        log_path = "%s/user_accounts.log" % output_folder

        with open(log_path, "w") as output, transaction.atomic(savepoint=False):
            experiment_fields = dict(
                name=experiment_name,
                saved_path_root=output_folder,
                dictionary_id=dictionary_pk,
            )
            experiment = experiment_models.Experiment(**experiment_fields)
            experiment.save()

            experiment.initialize_controlled_experiment(
                num_pairs=num_pairs,
                output=output,
            )
    def handle(self, dictionary_id, output_folder, **options):
        """Build a controlled experiment for the given dictionary.

        After validating the arguments and ensuring ``output_folder``
        exists, a new ``Experiment`` is saved and its
        ``initialize_controlled_experiment`` is called within one atomic
        transaction. The file ``user_accounts.log`` inside ``output_folder``
        is opened for writing and handed to that call as ``output``.

        :param dictionary_id: dictionary id, convertible with ``int()``.
        :param output_folder: target directory; also saved on the
            experiment as ``saved_path_root``.
        :param options: supplies ``num_pairs`` and ``experiment_name``.
        :raises CommandError: on a missing argument or a non-numeric id.
        """
        if not dictionary_id:
            raise CommandError("Dictionary id is required.")
        try:
            numeric_id = int(dictionary_id)
        except ValueError:
            raise CommandError("Dictionary id must be a number.")
        if not output_folder:
            raise CommandError("Output folder path is required.")

        # Only these two options are used; num_conditions and num_stages
        # are not passed to the controlled-experiment initializer.
        pair_count = options.get('num_pairs')
        name = options.get('experiment_name')

        # make sure the folder exists
        check_or_create_dir(output_folder)

        accounts_log = "%s/user_accounts.log" % output_folder
        with open(accounts_log, "w") as output:
            with transaction.atomic(savepoint=False):
                experiment = experiment_models.Experiment(
                    name=name,
                    saved_path_root=output_folder,
                    dictionary_id=numeric_id,
                )
                experiment.save()
                experiment.initialize_controlled_experiment(
                    num_pairs=pair_count, output=output)
Пример #3
0
    def handle(self, dictionary_id, output_folder, **options):
        """Create an experiment and initialize it from the given options.

        Validates the arguments, ensures ``output_folder`` exists, saves a
        new ``Experiment`` and calls its ``initialize_experiment`` with the
        opened ``user_accounts.log`` file (inside ``output_folder``) as
        ``output``.

        :param dictionary_id: dictionary id, convertible with ``int()``.
        :param output_folder: directory in which the log file is written.
        :param options: supplies ``num_pairs``, ``num_conditions``,
            ``num_stages`` and ``experiment_name``.
        :raises CommandError: on a missing argument or a non-numeric id.
        """
        if not dictionary_id:
            raise CommandError("Dictionary id is required.")

        try:
            dictionary_pk = int(dictionary_id)
        except ValueError:
            raise CommandError("Dictionary id must be a number.")

        if not output_folder:
            raise CommandError("Output folder path is required.")

        # Settings forwarded verbatim to Experiment.initialize_experiment.
        init_kwargs = {
            "num_conditions": options.get("num_conditions"),
            "num_stages": options.get("num_stages"),
            "num_pairs": options.get("num_pairs"),
        }
        experiment_name = options.get("experiment_name")

        # The folder has to exist before the log file can be opened in it.
        check_or_create_dir(output_folder)
        log_path = "%s/user_accounts.log" % output_folder

        with open(log_path, "w") as output:
            experiment = experiment_models.Experiment(
                name=experiment_name,
                dictionary_id=dictionary_pk,
            )
            experiment.save()
            experiment.initialize_experiment(output=output, **init_kwargs)
Пример #4
0
    def handle(self, dataset_id, save_path, **options):
        """Run the dump / parse / lemmatize steps for a dataset.

        The ``action`` option selects the steps: ``'dump'``, ``'parse'`` or
        ``'lemmatize'`` runs that single step, ``'all'`` runs the three in
        that order. Any other value only creates ``save_path``.

        * dump: ``dump_tweets(dataset_id, save_path)``
        * parse: ``parse_tweets`` reads from ``save_path`` and writes to
          ``<save_path>/parsed_tweets``
        * lemmatize: ``lemmatize_tweets`` reads ``<save_path>/parsed_tweets``
          and writes to ``<save_path>/converted_tweets``

        :param dataset_id: dataset id, convertible with ``int()``.
        :param save_path: working directory; created if missing.
        :param options: ``action`` and ``tweet_parser_path`` are read.
        :raises CommandError: on a missing argument or a non-numeric id.
        """
        action = options.get('action')
        tweet_parser_path = options.get('tweet_parser_path')

        if not dataset_id:
            raise CommandError("Dataset id is required.")
        try:
            dataset_id = int(dataset_id)
        except ValueError:
            raise CommandError("Dataset id must be a number.")

        if not save_path:
            raise CommandError("File save path is required.")

        check_or_create_dir(save_path)

        run_all = action == 'all'
        # Output of the parse step and input of the lemmatize step.
        parsed_path = "%s/parsed_tweets" % save_path

        # The task imports stay local to the branch that needs them, as in
        # the original code. The single-argument print(...) form behaves the
        # same on Python 2 and Python 3.
        if run_all or action == 'dump':
            from msgvis.apps.enhance.tasks import dump_tweets
            print("Dumping messages...")
            dump_tweets(dataset_id, save_path)

        if run_all or action == 'parse':
            from msgvis.apps.enhance.tasks import parse_tweets
            check_or_create_dir(parsed_path)

            print("\n==========")
            print("Parsing messages...")
            parse_tweets(tweet_parser_path, save_path, parsed_path)

        if run_all or action == 'lemmatize':
            from msgvis.apps.enhance.tasks import lemmatize_tweets
            converted_path = "%s/converted_tweets" % save_path
            check_or_create_dir(converted_path)

            print("\n==========")
            print("Lemmatizing messages...")
            lemmatize_tweets(parsed_path, converted_path)
Пример #5
0
    def handle(self, dataset_id, save_path, **options):
        """Dump, parse and/or lemmatize the messages of a dataset.

        Which steps run depends on the ``action`` option: ``'all'`` runs
        dump, parse and lemmatize in that order; ``'dump'``, ``'parse'`` and
        ``'lemmatize'`` each run only the named step. With any other value
        nothing happens beyond creating ``save_path``.

        Directory layout used by the steps:

        * dump writes via ``dump_tweets(dataset_id, save_path)``
        * parse writes to ``<save_path>/parsed_tweets``
        * lemmatize reads ``<save_path>/parsed_tweets`` and writes to
          ``<save_path>/converted_tweets``

        :param dataset_id: dataset id, convertible with ``int()``.
        :param save_path: working directory; created if missing.
        :param options: ``action`` and ``tweet_parser_path`` are read.
        :raises CommandError: on a missing argument or a non-numeric id.
        """
        action = options.get('action')
        tweet_parser_path = options.get('tweet_parser_path')

        if not dataset_id:
            raise CommandError("Dataset id is required.")
        try:
            dataset_id = int(dataset_id)
        except ValueError:
            raise CommandError("Dataset id must be a number.")

        if not save_path:
            raise CommandError("File save path is required.")

        check_or_create_dir(save_path)

        # Shared by the parse step (as output) and lemmatize step (as input).
        parsed_path = "%s/parsed_tweets" % save_path

        # print(...) with one argument is valid on both Python 2 and 3; the
        # task imports are kept inside the branches as in the original.
        if action in ('all', 'dump'):
            from msgvis.apps.enhance.tasks import dump_tweets
            print("Dumping messages...")
            dump_tweets(dataset_id, save_path)

        if action in ('all', 'parse'):
            from msgvis.apps.enhance.tasks import parse_tweets
            check_or_create_dir(parsed_path)

            print("\n==========")
            print("Parsing messages...")
            parse_tweets(tweet_parser_path, save_path, parsed_path)

        if action in ('all', 'lemmatize'):
            from msgvis.apps.enhance.tasks import lemmatize_tweets
            converted_path = "%s/converted_tweets" % save_path
            check_or_create_dir(converted_path)

            print("\n==========")
            print("Lemmatizing messages...")
            lemmatize_tweets(parsed_path, converted_path)
Пример #6
0
    def process_stage(self, use_tfidf=False):
        """Train a model per user for this stage and pre-label the next stage.

        For every user in ``self.assignment.pair.users`` this method:

        1. collects the dictionary features for the sources
           ``["system", user]``;
        2. builds training data with ``coding_utils.get_formatted_data``
           from ``self.selected_messages`` and trains a classifier with
           ``coding_utils.train_model``, which is given the directory
           ``<saved_path_root>/<username>_stage<order>/``;
        3. saves an ``SVMModel`` row and bulk-creates one ``SVMModelWeight``
           per (code, feature) pair;
        4. if ``self.get_next_stage()`` does not raise ``IndexError``,
           predicts a code for each message of the next stage's message set
           and bulk-creates ``CodeAssignment`` rows with
           ``is_user_labeled=False``.

        :param use_tfidf: forwarded unchanged to the ``coding_utils``
            formatting helpers.
        :raises Exception: any failure has its traceback printed and is
            then re-raised. Earlier versions dropped into ``pdb`` and then
            swallowed the error, which blocks a non-interactive process
            and hides the failure from the caller.
        """
        experiment = self.assignment.experiment
        dictionary = experiment.dictionary
        try:
            for source in self.assignment.pair.users.all():
                sources = ["system", source]
                features = list(dictionary.get_feature_list(sources))
                messages = self.selected_messages.all()
                master_messages = dictionary.dataset.get_master_message_set().all()

                # Map each feature's dictionary index to its column position.
                feature_index_map = {}
                for idx, feature in enumerate(features):
                    feature_index_map[feature.index] = idx

                model_save_path = "%s/%s_stage%d/" % (experiment.saved_path_root, source.username, self.order)
                check_or_create_dir(model_save_path)

                X, y, code_map_inverse = coding_utils.get_formatted_data(
                    dictionary=dictionary,
                    source=source,
                    messages=messages,
                    feature_index_map=feature_index_map,
                    feature_num=len(features),
                    use_tfidf=use_tfidf,
                    master_messages=master_messages,
                )
                lin_clf = coding_utils.train_model(X, y, model_save_path=model_save_path)

                svm_model = SVMModel(source=source, source_stage=self, saved_path=model_save_path)
                svm_model.save()

                # A single coefficient row is handled as the two-class case:
                # every code then reads its weights from row 0.
                # NOTE(review): both codes therefore get identical weights
                # in that case - confirm this is intended.
                is_binary = lin_clf.coef_.shape[0] == 1

                weights = []
                for code_index, code_id in code_map_inverse.items():
                    coef_row = lin_clf.coef_[0 if is_binary else code_index]
                    for feature_index, feature in enumerate(features):
                        weights.append(
                            SVMModelWeight(
                                svm_model=svm_model,
                                code_id=code_id,
                                feature=feature,
                                weight=coef_row[feature_index],
                            )
                        )

                SVMModelWeight.objects.bulk_create(weights)

                try:
                    next_stage = self.get_next_stage()
                except IndexError:
                    # Presumably this is the last stage; nothing to pre-label.
                    continue

                next_message_set = next_stage.message_set.messages.all()
                next_X = coding_utils.get_formatted_X(
                    messages=next_message_set,
                    dictionary=dictionary,
                    source=source,
                    feature_index_map=feature_index_map,
                    feature_num=len(features),
                    use_tfidf=use_tfidf,
                )
                predict_y, prob = coding_utils.get_prediction(lin_clf, next_X)

                code_assignments = []
                # NOTE(review): assumes iterating the message set here yields
                # the same order get_formatted_X saw - confirm.
                for idx, message in enumerate(next_message_set):
                    code_index = predict_y[idx]
                    if not is_binary:
                        probability = prob[idx, code_index]
                    elif code_index == 1:
                        probability = prob[idx]
                    else:
                        # prob[idx] is treated as the probability of class 1.
                        probability = 1 - prob[idx]

                    code_assignments.append(
                        coding_models.CodeAssignment(
                            message=message,
                            source=source,
                            code_id=code_map_inverse[code_index],
                            is_user_labeled=False,
                            probability=probability,
                        )
                    )

                coding_models.CodeAssignment.objects.bulk_create(code_assignments)

        except Exception:
            import traceback

            # Keep the diagnostic output, but let the caller see the failure.
            traceback.print_exc()
            raise
Пример #7
0
    def process_stage(self, use_tfidf=False):
        """Train one model per user on this stage and label the next stage.

        For each user in ``self.assignment.pair.users``:

        1. the dictionary features for the sources ``["system", user]`` are
           listed;
        2. training data is built by ``coding_utils.get_formatted_data``
           from ``self.selected_messages`` and a classifier is trained by
           ``coding_utils.train_model``, which receives the directory
           ``<saved_path_root>/<username>_stage<order>/``;
        3. an ``SVMModel`` row is saved and one ``SVMModelWeight`` per
           (code, feature) pair is bulk-created;
        4. unless ``self.get_next_stage()`` raises ``IndexError``, a code is
           predicted for every message in the next stage's message set and
           stored as a ``CodeAssignment`` with ``is_user_labeled=False``.

        :param use_tfidf: passed through to the ``coding_utils`` formatting
            helpers.
        :raises Exception: failures are printed with their traceback and
            re-raised. The previous ``pdb.set_trace()`` calls were removed:
            they stall a non-interactive process and caused the error to be
            swallowed afterwards.
        """
        experiment = self.assignment.experiment
        dictionary = experiment.dictionary
        try:
            for source in self.assignment.pair.users.all():
                sources = ["system", source]
                features = list(dictionary.get_feature_list(sources))
                messages = self.selected_messages.all()
                master_messages = dictionary.dataset.get_master_message_set(
                ).all()

                # Dictionary feature index -> column position in the matrix.
                feature_index_map = {}
                for idx, feature in enumerate(features):
                    feature_index_map[feature.index] = idx

                model_save_path = "%s/%s_stage%d/" % (
                    experiment.saved_path_root, source.username, self.order)
                check_or_create_dir(model_save_path)

                X, y, code_map_inverse = coding_utils.get_formatted_data(
                    dictionary=dictionary,
                    source=source,
                    messages=messages,
                    feature_index_map=feature_index_map,
                    feature_num=len(features),
                    use_tfidf=use_tfidf,
                    master_messages=master_messages)
                lin_clf = coding_utils.train_model(
                    X, y, model_save_path=model_save_path)

                svm_model = SVMModel(source=source,
                                     source_stage=self,
                                     saved_path=model_save_path)
                svm_model.save()

                # With a single coefficient row (treated as the two-class
                # case) every code reads its weights from row 0.
                # NOTE(review): that gives both codes the same weights -
                # confirm this is intended.
                single_row = lin_clf.coef_.shape[0] == 1

                weights = []
                for code_index, code_id in code_map_inverse.items():
                    row = lin_clf.coef_[0 if single_row else code_index]
                    for feature_index, feature in enumerate(features):
                        weights.append(
                            SVMModelWeight(svm_model=svm_model,
                                           code_id=code_id,
                                           feature=feature,
                                           weight=row[feature_index]))

                SVMModelWeight.objects.bulk_create(weights)

                try:
                    next_stage = self.get_next_stage()
                except IndexError:
                    # Presumably no later stage exists; skip the prediction.
                    continue

                next_message_set = next_stage.message_set.messages.all()
                next_X = coding_utils.get_formatted_X(
                    messages=next_message_set,
                    dictionary=dictionary,
                    source=source,
                    feature_index_map=feature_index_map,
                    feature_num=len(features),
                    use_tfidf=use_tfidf)
                predict_y, prob = coding_utils.get_prediction(
                    lin_clf, next_X)

                code_assignments = []
                # NOTE(review): relies on this iteration order matching the
                # row order produced by get_formatted_X - confirm.
                for idx, message in enumerate(next_message_set):
                    code_index = predict_y[idx]
                    if not single_row:
                        probability = prob[idx, code_index]
                    elif code_index == 1:
                        probability = prob[idx]
                    else:
                        # prob[idx] is treated as the class-1 probability.
                        probability = 1 - prob[idx]

                    code_assignments.append(
                        coding_models.CodeAssignment(
                            message=message,
                            source=source,
                            code_id=code_map_inverse[code_index],
                            is_user_labeled=False,
                            probability=probability))

                coding_models.CodeAssignment.objects.bulk_create(
                    code_assignments)

        except Exception:
            import traceback
            # Print the diagnostics as before, then propagate the failure.
            traceback.print_exc()
            raise