Example #1
def main():
    # ----- settings:
    experiment_type = 1
    split_in_cross_validation_again = False
    find_ranks_in_PSA_again = False
    portion_of_test_in_dataset = 0.3
    number_of_folds = 10
    portion_of_sampled_dataset_vector = [
        0.02, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9
    ]
    classifiers_for_experiments = [
        'SVM', 'LDA', 'QDA', 'Random Forest', 'Logistic Regression',
        'Gaussian Naive Bayes'
    ]
    path_to_save = './PSA_outputs/'

    # ---- path of dataset:
    path_dataset = './dataset/Breast_cancer_dataset/wdbc_data.txt'
    # ---- read the dataset:
    print(
        '############################## Reading dataset and splitting it to K-fold train and test sets'
    )
    data = pd.read_csv(
        path_dataset, sep=",", header=None
    )  # read text file using pandas dataFrame: https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
    labels_of_classes = ['M', 'B']
    X, y = read_dataset(data=data, labels_of_classes=labels_of_classes)
    experiments = Experiments()
    # # --- saving/loading split dataset in/from folder:
    # if split_in_cross_validation_again:
    #     train_indices_in_folds, test_indices_in_folds, X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = experiments.cross_validation(X=X, y=y, n_splits=number_of_folds, test_size=portion_of_test_in_dataset)
    #     save_variable(train_indices_in_folds, 'train_indices_in_folds', path_to_save=path_to_save)
    #     save_variable(test_indices_in_folds, 'test_indices_in_folds', path_to_save=path_to_save)
    #     save_variable(X_train_in_folds, 'X_train_in_folds', path_to_save=path_to_save)
    #     save_variable(X_test_in_folds, 'X_test_in_folds', path_to_save=path_to_save)
    #     save_variable(y_train_in_folds, 'y_train_in_folds', path_to_save=path_to_save)
    #     save_variable(y_test_in_folds, 'y_test_in_folds', path_to_save=path_to_save)
    # else:
    #     file = open(path_to_save+'train_indices_in_folds.pckl','rb')
    #     train_indices_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save+'test_indices_in_folds.pckl','rb')
    #     test_indices_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save+'X_train_in_folds.pckl','rb')
    #     X_train_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save+'X_test_in_folds.pckl','rb')
    #     X_test_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save+'y_train_in_folds.pckl','rb')
    #     y_train_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save+'y_test_in_folds.pckl','rb')
    #     y_test_in_folds = pickle.load(file); file.close()

    # ----- experiments:
    if experiment_type == 1:
        experiments.multi_class_demo()
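
A minimal sketch of the save_variable helper used in the commented-out block above (the project's real implementation may differ); it simply pickles a variable under the given name so the matching load code can read it back:

import os
import pickle

def save_variable(variable, name_of_variable, path_to_save='./'):
    # hypothetical helper: serialize `variable` to <path_to_save><name_of_variable>.pckl
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    with open(path_to_save + name_of_variable + '.pckl', 'wb') as f:
        pickle.dump(variable, f)
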
Example #2
def __init__(self):

        # parameters
        self.global_planner = rospy.get_param(
            'social_experiments/global_planner', '')
        self.local_planner = rospy.get_param(
            'social_experiments/local_planner', '')
        self.world_model_name = rospy.get_param(
            'social_experiments/world_model_name', '')
        self.robot_model_name = rospy.get_param(
            'social_experiments/robot_model_name', '')
        self.max_experiments = rospy.get_param(
            'social_experiments/max_experiments', 100)
        self.path_storage = rospy.get_param('social_experiments/path_storage',
                                            '')
        self.robot_vel = rospy.get_param('social_experiments/robot_vel', 0.3)
        self.space_factor_tolerance = rospy.get_param(
            'social_experiments/space_factor_tolerance', 5)
        self.time_factor_tolerance = rospy.get_param(
            'social_experiments/time_factor_tolerance', 5)
        # self.start_service = rospy.get_param('social_experiments/start_service', '/regions/start')
        # self.goal_service = rospy.get_param('social_experiments/goal_service', '/regions/goal')
        self.checkpoint_services = rospy.get_param(
            'social_experiments/checkpoint_services', '')

        if self.checkpoint_services == '':
            self.checkpoint_services = []
        else:
            self.checkpoint_services = list(
                self.checkpoint_services.split(" "))

        # log
        rospy.loginfo('global_planner: ' + self.global_planner)
        rospy.loginfo('local_planner: ' + self.local_planner)
        rospy.loginfo('world_model_name: ' + self.world_model_name)
        rospy.loginfo('robot: ' + self.robot_model_name)
        rospy.loginfo('robot vel: ' + str(self.robot_vel))
        rospy.loginfo('space factor tolerance: ' +
                      str(self.space_factor_tolerance))
        rospy.loginfo('time factor tolerance: ' +
                      str(self.time_factor_tolerance))
        rospy.loginfo('max experiments: ' + str(self.max_experiments))
        # rospy.loginfo('start service: ' + str(self.start_service))
        # rospy.loginfo('goal service: ' + str(self.goal_service))
        # rospy.loginfo('checkpoint services: ' + str(self.checkpoint_services))
        print('')

        # data
        self.data = []

        # init experiments
        self.ex = Experiments(self.global_planner, self.local_planner,
                              self.world_model_name, self.robot_model_name)
Example #3
def shs_test_set_evals(size, method="msd_title", with_duplicates=True):
    """
    :param size: Required prune size of the results
    :param method: (string type) {default:"msd_title"}
        choose the method of experiment available modes are
        ["msd_title", "pre-msd_title", "mxm_lyrics", "title_mxm_lyrics", "pre-title_mxm_lyrics"]
    :param with_duplicates: (boolean) {default:True} include
        or exclude MSD official duplicate tracks from the experiments
    :return:
    """

    es = SearchModule(presets.uri_config)

    if with_duplicates:
        exp = Experiments(es, './data/test_shs.csv', presets.shs_msd)
    else:
        exp = Experiments(es, './data/test_shs.csv', presets.shs_msd_no_dup)

    if method == "msd_title":
        LOGGER.info("\n%s with size %s and duplicates=%s " %
                    (method, size, with_duplicates))
        results = exp.run_song_title_match_task(size=size)

    elif method == "pre-msd_title":
        LOGGER.info("\n%s with size %s and duplicates=%s" %
                    (method, size, with_duplicates))
        results = exp.run_cleaned_song_title_task(size=size)

    elif method == "mxm_lyrics":
        LOGGER.info("\n%s with size %s and duplicates=%s" %
                    (method, size, with_duplicates))
        results = exp.run_mxm_lyrics_search_task(presets.more_like_this,
                                                 size=size)

    elif method == "title_mxm_lyrics":
        LOGGER.info("\n%s with size %s and duplicates=%s" %
                    (method, size, with_duplicates))
        results = exp.run_rerank_title_with_mxm_lyrics_task(size=size,
                                                            with_cleaned=False)

    elif method == "pre-title_mxm_lyrics":
        LOGGER.info("\n%s with size %s and duplicates=%s" %
                    (method, size, with_duplicates))
        results = exp.run_rerank_title_with_mxm_lyrics_task(size=size,
                                                            with_cleaned=True)

    else:
        raise ValueError("Invalid 'method' parameter for the experiment!")

    mean_avg_precision = exp.mean_average_precision(results)
    LOGGER.info("\n Mean Average Precision (MAP) = %s" % mean_avg_precision)

    return
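
A minimal usage sketch (illustrative, not part of the original source): assuming the presets module, its Elasticsearch config, and ./data/test_shs.csv referenced above are available, the evaluation can be run at a few prune sizes like this:

if __name__ == '__main__':
    # hypothetical driver: evaluate the MSD-title method at several prune sizes
    for prune_size in (100, 500, 1000):
        shs_test_set_evals(size=prune_size, method="msd_title", with_duplicates=False)
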
Example #4
def main():
    settings = Settings()
    settings.Initalize_Global_Settings()

    preprocess = Preprocess(settings)
    preprocess.Load_Into_Dataframes()

    analysis = Analysis(preprocess)
    experiments = Experiments(analysis)

    data = analysis.Core(experiments)
    data_experimentals = experiments.Run_Experiments()

    models, best_fit, gals_df = analysis.Mocks_And_Models(experiments)

    plotting = Plotting(preprocess)
    plotting.Plot_Core(data, models, best_fit)
    plotting.Plot_Experiments(data, data_experimentals, models, best_fit)
Example #5
    def __init__(self, setting):

        self.setting = setting
        self.mallet_path = setting['malletpath']
        self.number_of_topics = setting['nooftopics']
        self.number_of_iter = setting['noofiterations']

        self.stack_importer = StackImporter(setting)
        self.lda_importer = LDAImporter(setting)
        self.experiments = Experiments(setting)

        self.model = None
        self.corpus = None
        self.dictionary = None
        self.answer_corpus = None

        directory = self.setting['lda_folder']
        file_name = 'local_lda_model' + self.setting['theme'] + '.gs'
        self.path = ''.join([directory, file_name])
Example #6
def main():

    config = get_config_from_json('config.json')
    # create an instance of the model
    model = VAE(config)
    # create experiments instance
    experiments = Experiments(config, model)
    # create trainer instance
    trainer = Trainer(config, model, experiments)
    # train the model
    trainer.train()
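
The get_config_from_json helper is not shown above; a minimal sketch of what it might look like, assuming it just exposes the JSON keys as attributes (the project's real helper may differ):

import json
from types import SimpleNamespace

def get_config_from_json(json_file):
    # hypothetical helper: load a JSON config and expose its keys as attributes,
    # e.g. config.latent_dim, config.learning_rate
    with open(json_file) as f:
        config_dict = json.load(f)
    return SimpleNamespace(**config_dict)
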
Example #7
    def __init__(self, setting):

        self.setting = setting

        self.idf_values = None

        self.wiki_corpus = None
        self.wiki_dictionary = None
        self.wiki_vectors = []
        self.wiki_processor = WikiPreprocessor(setting)
        self.wiki_importer = WikiImporter(setting, self.wiki_processor)

        self.stack_corpus = None
        self.answer_vectors = {}
        self.question_vectors = {}
        self.user_vectors = {}
        self.user_content = {}
        self.stack_importer = StackImporter(setting)

        self.esa_importer = ESAImporter(setting)
        self.inverted_index = defaultdict(list)
        self.number_of_concepts = 0

        self.experiments = Experiments(setting)
Example #8
	def __init__(self, setting):

		self.setting          = setting
		self.mallet_path      = setting['malletpath']
		self.number_of_topics = setting['nooftopics']
		self.number_of_iter   = setting['noofiterations']

		self.stack_importer   = StackImporter(setting)
		self.lda_importer     = LDAImporter(setting)
		self.experiments      = Experiments(setting)

		self.model            = None
		self.corpus           = None
		self.dictionary       = None
		self.answer_corpus    = None

		directory = self.setting['lda_folder']
		file_name = 'local_lda_model' + self.setting['theme'] + '.gs'
		self.path = ''.join([directory, file_name])
Example #9
	def __init__(self, setting):

		self.setting          = setting

		self.idf_values       = None

		self.wiki_corpus      = None
		self.wiki_dictionary  = None
		self.wiki_vectors     = []
		self.wiki_processor   = WikiPreprocessor(setting)
		self.wiki_importer    = WikiImporter(setting, self.wiki_processor)

		self.stack_corpus        = None
		self.answer_vectors      = {}
		self.question_vectors    = {}
		self.user_vectors        = {}
		self.user_content        = {}
		self.stack_importer      = StackImporter(setting)

		self.esa_importer        = ESAImporter(setting)
		self.inverted_index      = defaultdict(list)
		self.number_of_concepts  = 0

		self.experiments         = Experiments(setting)
Example #10
class ESA(object):
    """ ESA - Explicit Semantic Analysis """
    def __init__(self, setting):

        self.setting = setting

        self.idf_values = None

        self.wiki_corpus = None
        self.wiki_dictionary = None
        self.wiki_vectors = []
        self.wiki_processor = WikiPreprocessor(setting)
        self.wiki_importer = WikiImporter(setting, self.wiki_processor)

        self.stack_corpus = None
        self.answer_vectors = {}
        self.question_vectors = {}
        self.user_vectors = {}
        self.user_content = {}
        self.stack_importer = StackImporter(setting)

        self.esa_importer = ESAImporter(setting)
        self.inverted_index = defaultdict(list)
        self.number_of_concepts = 0

        self.experiments = Experiments(setting)

    ###############################################################################
    # Clean and load data
    ###############################################################################
    def clean_and_load_data(self):
        """ Cleans the data and saves it in a database """

        self.wiki_importer.import_wiki_data()

    ###############################################################################
    # Create and manage data used by ESA algorithm
    ###############################################################################

    def build_esa_db(self):
        """ Initializes the ESA database """

        logging.info("\nCreating ESA database ...")

        self.esa_importer.open_esa_db()

        # Initialize database
        self.esa_importer.create_esa_db()

        # Save the dictionary and corpus of the Wikipedia data
        self.wiki_dictionary = self.wiki_importer.build_wiki_kb()

        # Save the inverse document frequencies in the ESA database
        number_of_documents = self.wiki_dictionary.num_docs  #self.wiki_importer.get_number_of_concepts()
        self.esa_importer.save_wiki_inverse_document_frequencies(
            number_of_documents)

        self.esa_importer.close_esa_db()

    def load_esa_index(self):
        """ Gets the inverted index from the database """

        self.esa_importer.open_esa_db()

        self.esa_importer.get_pruned_inverted_index(self.inverted_index)
        logging.info("\nDone")

        self.esa_importer.close_esa_db()

    ###############################################################################
    # Build TF-IDF Vectors
    ###############################################################################

    def create_tf_idf_vectors(self):
        """ Creates them if not already in database """

        self.esa_importer.open_esa_db()

        # Calculate tfidf vectors for the Wikipedia articles
        self.create_tf_idf_wiki_vectors()

        # Save terms and vectors to ESA db
        #self.esa_importer.save_inverted_index(self.wiki_vectors)

        logging.info("\nDone")

        self.esa_importer.close_esa_db()

    def create_tf_idf_wiki_vectors(self):
        """ Keeping only non-zero entries of the vectors """

        wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary(
        )

        logging.info("Retrieving idf values ...")
        inv_doc_freq = {}
        self.esa_importer.get_wiki_inverse_document_frequencies(inv_doc_freq)

        logging.info("Building the tfidf vectors and the inverse index ...")
        tfidf_model = TfidfModel(self.wiki_dictionary, inv_doc_freq)
        inverted_index = defaultdict(list)

        for document in wiki_corpus:
            vector = tfidf_model[document]

            for term_id, value in vector:
                inverted_index[term_id].append((document.document_id, value))

            #print "Added " + str(document.document_id)

        logging.info("\n\tDone.")
        self.esa_importer.save_inverted_index(inverted_index)

        self.save_index_to_file(inverted_index)

    def _create_tf_idf_stack_vectors(self, only_questions=False):
        """ Create the tfidf vectors for the Stackexchange data. """

        # Load question and answer corpus
        logging.info("Loading stack corpus and dictionary ...")
        question_corpus = self.stack_importer.get_question_corpus()
        answer_corpus = self.stack_importer.get_answer_corpus()

        corpus = question_corpus + answer_corpus
        dictionary = self.stack_importer.get_dictionary_from_corpora(
            [question_corpus, answer_corpus])
        dict_size = len(dictionary)

        # Save stack dictionary
        stack_dict = {}
        for word_id, word in enumerate(dictionary.token2id):
            stack_dict[unicode(word)] = word_id

        self.idf_values = zeros(dict_size)

        logging.info("Determining question vectors ...")
        questions = StackCorpus(self.stack_importer.connection, "question")
        for question in questions:
            question_vector = zeros(dict_size)

            for word in question.body:
                word_id = stack_dict.get(unicode(word), -1)

                if word_id != -1:
                    question_vector[word_id] = self.tf_idf(
                        word, word_id, question.body, corpus)

            self.question_vectors[question.id] = question_vector

        logging.info("\n\tDone.")

        if only_questions:  # Skip the answers
            return stack_dict

        logging.info("Determining answer vectors ...")
        answers = StackCorpus(self.stack_importer.connection, "answer")

        for answer in answers:
            answer_vector = zeros(dict_size)

            for word in answer.body:
                word_id = stack_dict.get(unicode(word), -1)

                if word_id != -1:
                    tf_idf = self.tf_idf(word, word_id, answer.body, corpus)
                    answer_vector[word_id] = tf_idf

            self.answer_vectors[answer.id] = answer_vector

        logging.info("\n\tDone.")

        return stack_dict

    def _create_local_tf_idf_stack_vectors(self, user_id):
        """ Create the tfidf vectors for the local Stackexchange data of the given user """

        # Load question and answer corpus
        #logging.info("Loading stack corpus and dictionary ...")
        question_corpus = self.stack_importer.get_user_question_corpus(user_id)
        answer_corpus = self.stack_importer.get_user_answer_corpus(user_id)

        corpus = question_corpus + answer_corpus
        dictionary = self.stack_importer.get_dictionary_from_corpora(
            [question_corpus, answer_corpus])
        dict_size = len(dictionary)

        # Save stack dictionary
        stack_dict = {}
        for word_id, word in enumerate(dictionary.token2id):
            stack_dict[unicode(word)] = word_id

        self.idf_values = zeros(dict_size)

        #logging.info("Determining question vectors ...")
        questions = self.stack_importer.get_user_local_questions(user_id)

        for question in questions:
            question_vector = zeros(dict_size)

            for word in question.body:
                word_id = stack_dict.get(unicode(word), -1)

                if word_id != -1:
                    question_vector[word_id] = self.tf_idf(
                        word, word_id, question.body, corpus)

            self.question_vectors[question.id] = question_vector

        #logging.info("\n\tDone.")

        #logging.info("Determining answer vectors ...")
        answers = self.stack_importer.get_user_local_answers(user_id)

        for answer in answers:
            answer_vector = zeros(dict_size)

            for word in answer.body:
                word_id = stack_dict.get(unicode(word), -1)

                if word_id != -1:
                    tf_idf = self.tf_idf(word, word_id, answer.body, corpus)
                    answer_vector[word_id] = tf_idf

            self.answer_vectors[answer.id] = answer_vector

        #logging.info("\n\tDone.")

        return stack_dict

    def _create_user_tf_idf_stack_vector(self, user_id, stack_dict):
        """ Create the tfidf vector representation of a user, based on her answers"""

        aux = self.user_content.get(user_id, None)
        if aux is not None:
            return aux

        user_corpus = []
        user_words = []
        answers = self.stack_importer.get_user_answers_to_questions(user_id)
        for answer in answers:
            user_corpus.append(answer.body)
            for word in answer.body:
                user_words.append(word)

        self.user_content[user_id] = user_words

        dict_size = len(stack_dict)
        user_vector = zeros(dict_size)

        for word in set(user_words):
            word_id = stack_dict.get(unicode(word), -1)

            if word_id != -1:
                tf_idf = self.tf_idf(word, word_id, user_words, user_corpus)
                user_vector[word_id] = tf_idf

        self.user_vectors[user_id] = user_vector

        return user_words

    @staticmethod
    def tf(word, document):
        """ Returns the normalized frequency of the word in the given document """

        word_count = document.count(unicode(word))
        return float(word_count) / len(document)

    @staticmethod
    def df(word, corpus):
        """ Returns the number of documents in the collection that contain the given word """

        return sum(1 for document in corpus if unicode(word) in document)

    #@staticmethod
    def idf(self, word, corpus):
        """ Returns the inverse document frequency of the word in the documents collection """

        # idf = log(N / df), where N is the number of documents in the corpus
        return math.log(len(corpus) / self.df(word, corpus))

    def tf_idf(self, word, word_index, document, corpus):
        """ Returns the TF-IDF value for the given 
		word in the document of the corpus """

        # Calculate the term frequency value (tf)
        tf = self.tf(word, document)
        if tf == 0.0:
            return 0.0

        # Calculate the inverse document frequency value (idf)
        if self.idf_values[word_index] == 0.0:
            self.idf_values[word_index] = self.idf(word, corpus)

        return float(tf * self.idf_values[word_index])
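
    # Illustrative worked example (not from the original source): in a corpus of
    # 4 documents where "beer" occurs in 2 of them and appears 3 times in a
    # 12-token document, tf = 3/12 = 0.25, df = 2, idf = log(4/2) ~= 0.69, so
    # tf_idf ~= 0.25 * 0.69 ~= 0.17.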

    ###############################################################################
    # Associations and Similarities of Stackexchange questions/answers using
    # Wikipedia's articles as concepts.
    ###############################################################################

    def calculate_similarities(self):
        """ Applies the ESA algorithm to the global stack data """

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_concept_doc_relation()
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        #print "Has beer " + str(self.inverted_index.get(unicode("beer"), None))

        logging.info("Calculating stack tfidf vectors ...")
        stack_dictionary = self._create_tf_idf_stack_vectors()

        # For each question calculate similarity with each answer
        logging.info("\nCalculating questions-answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:
            q_vector = self.get_esa_vector(question.id, question.body,
                                           self.question_vectors[question.id],
                                           stack_dictionary, 1)
            q_vector_norm = norm(q_vector)
            similarities = []

            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")

            for answer in answer_corpus:
                a_vector = self.get_esa_vector(answer.id, answer.body,
                                               self.answer_vectors[answer.id],
                                               stack_dictionary, 2)
                sim = self.similarity(q_vector, q_vector_norm, a_vector)
                similarities.append((question.id, answer.id, sim))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

    def calculate_tf_idf_similarities(self):
        """Applies the TF-IDF algorithm to the global stack data"""

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_similarities_table()

        logging.info("Calculating stack tfidf vectors ...")
        stack_dictionary = self._create_tf_idf_stack_vectors()

        # For each question calculate similarity with each answer
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        logging.info("\nCalculating questions-answers similarities ...")
        for question in question_corpus:
            q_vector = self.question_vectors[question.id]
            q_vector_norm = norm(q_vector)
            similarities = []

            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")
            for answer in answer_corpus:
                a_vector = self.answer_vectors[answer.id]
                sim = self.similarity(q_vector, q_vector_norm, a_vector)
                similarities.append((question.id, answer.id, sim))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

    def calculate_local_tfidf_similarities(self):
        """ Applies TF-IDF to the local stack data, in order
		to calculate questions/answers similarities. The local
		data is measured per user.
		Returns the list of users that were filtered. """

        # Keep filtered users
        filtered_users = []

        # Open database connections
        self.esa_importer.open_esa_db()
        self.stack_importer.open_stack_db()

        # Clean similarity table
        self.esa_importer.create_clean_similarities_table()

        # For each question calculate its similarity with the all the answers given
        # by the users who answered the given question
        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            print "Question " + str(question.id)
            similarities = []

            # Get the users that gave an answer to the question
            users = self.stack_importer.get_users_from_question(question.id)
            print "Users that replied: " + str(len(users))

            # Calculate the similarities of question with all
            # answers from the given users (related or not to question)
            for user_id in users:
                user_answers = self.stack_importer.get_user_answers_to_questions(
                    user_id)

                # Only consider users with more than 5 answers
                if len(user_answers) > 5:

                    print "User " + str(user_id)
                    a = []
                    for answer in user_answers:
                        a.append(answer.id)
                    print a

                    # Calculate tf_idf vectors for the given user
                    self.question_vectors.clear()
                    self.answer_vectors.clear()
                    stack_dictionary = self._create_local_tf_idf_stack_vectors(
                        user_id)

                    q_vector = self.question_vectors[question.id]
                    q_vector_norm = norm(q_vector)

                    for answer in user_answers:
                        a_vector = self.answer_vectors[answer.id]
                        sim = self.similarity(q_vector, q_vector_norm,
                                              a_vector)
                        similarities.append((question.id, answer.id, sim))

                else:
                    filtered_users.append(user_id)

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        # Close database connections
        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()
        logging.info("\nDone")

        return filtered_users

    def calculate_local_esa_similarities(self):
        """ Applies the ESA algorithm to the local stack data.
		This local data is measured per user. Returns the list
		of filtered users """

        # Keep filtered users
        filtered_users = []

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        #self.esa_importer.create_clean_concept_doc_relation()
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        # For each question calculate its similarity with all the answers given
        # by the users who answered the given question
        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            print "Question " + str(question.id)
            similarities = []

            # Get the users that gave an answer to the question
            users = self.stack_importer.get_users_from_question(question.id)
            print "Users that replied: " + str(len(users))

            # Calculate the similarities of question with all
            # answers from the given users (related or not to question)
            for user_id in users:
                user_answers = self.stack_importer.get_user_answers_to_questions(
                    user_id)

                # Only consider users with more than 5 answers
                if len(user_answers) > 5:
                    print "User " + str(user_id)

                    # Calculate tf_idf vectors for the given user
                    self.question_vectors.clear()
                    self.answer_vectors.clear()
                    stack_dictionary = self._create_local_tf_idf_stack_vectors(
                        user_id)

                    q_vector = self.get_esa_vector(
                        question.id, question.body,
                        self.question_vectors[question.id], stack_dictionary,
                        1)
                    q_vector_norm = norm(q_vector)

                    for answer in user_answers:
                        a_vector = self.get_esa_vector(
                            answer.id, answer.body,
                            self.answer_vectors[answer.id], stack_dictionary,
                            2)
                        sim = self.similarity(q_vector, q_vector_norm,
                                              a_vector)
                        similarities.append((question.id, answer.id, sim))

                else:
                    filtered_users.append(user_id)

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

        return filtered_users

    def get_esa_vector(self, id, document, tfidf_vector, dictionary, type):
        """ Creates the interpretation vector of the given document.
		- The document should be a set of tokens, already preprocessed
		- The vector represents the relatedness of the document
		with all the Wikipedia articles
		- Type indicates the type of document: question (1) or answer (2) """

        # Interpretation vector with dimensions = Wikipedia articles
        interpretation = zeros(2080905)

        for token in set(document):
            documents = self.inverted_index.get(unicode(token), None)
            word_id = dictionary.get(unicode(token), -1)

            if documents is not None and word_id != -1:
                #print str(len(documents))
                for document_id, value in documents:
                    interpretation[document_id] += (value *
                                                    tfidf_vector[word_id])

        return interpretation
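
    # Illustration (not from the original source): if the token "beer" has a
    # tfidf weight of 0.2 in the input document and the inverted index maps
    # "beer" to [(1234, 0.5), (98765, 0.1)], then interpretation[1234] is
    # incremented by 0.5 * 0.2 and interpretation[98765] by 0.1 * 0.2.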

    def similarity(self, vector1, norm_vector1, vector2):
        """ Calculates the cosine similarity between the given vectors """

        # Cosine similarity
        sim = float(dot(vector1, vector2) / (norm_vector1 * norm(vector2)))
        return sim

    def save_relatedness_to_file(self, file_name):

        self.esa_importer.open_esa_db()
        self.esa_importer.write_relatedness_to_file(file_name)
        self.esa_importer.close_esa_db()

    ### EXTRA ###
    def save_index_to_file(self,
                           index=None,
                           file_name='../data/ESA/index.txt'):

        if index is None:
            # No index was passed in: extract it from the DB
            index = defaultdict(list)
            self.esa_importer.open_esa_db()
            self.esa_importer.get_pruned_inverted_index(index)
            self.esa_importer.close_esa_db()

        # Copy to file
        logging.info("Saving them in a file ...")
        with open(file_name, 'a') as f:
            for word, doc_list in index.iteritems():
                #print word
                f.write(word + '\n')
                f.write(' '.join([str(x) for x in doc_list]))
                f.write('\n')

    def testing_beer_concept(self):

        tfidf_norm_values = []
        tfidf_values = []
        append_values = tfidf_values.append
        append_norm_values = tfidf_norm_values.append

        self.esa_importer.open_esa_db()
        wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary(
        )

        # IDF is fixed
        idf = 4.8225774331876625
        df = 0

        for document in wiki_corpus:

            content = document.content.split(' ')

            if unicode("beer") in content:

                doc_tf = defaultdict(float)
                size = 0  # length of the document
                df += 1

                # Faster than Counter
                for word in content:
                    doc_tf[word] += 1.0
                    size += 1

                # Calculate tfidf value for word "beer" in Wiki data
                norm_value = (doc_tf[unicode("beer")] / size) * idf
                value = doc_tf[unicode("beer")] * idf

                append_values((document.document_id, value))
                append_norm_values((document.document_id, norm_value))

        print "DF : " + str(df)

        # Sort each list in descending order of the tfidf value
        sorted_norm_values = sorted(tfidf_norm_values, key=itemgetter(1), reverse=True)
        sorted_values = sorted(tfidf_values, key=itemgetter(1), reverse=True)

        # Print top 10 in each list
        print "Normalized : "
        print ' , '.join(
            [str(id) + " " + str(value) for id, value in sorted_norm_values])

        print "\nNot normalized"
        print ' , '.join(
            [str(id) + " " + str(value) for id, value in sorted_values])

        self.esa_importer.close_esa_db()

    def prun_inverted_index(self):
        """ Prun the inverted index """

        self.esa_importer.open_esa_db()

        index = EsaIndex(self.esa_importer.connection)
        result = []
        append = result.append

        for term, vector in index:
            append((term, vector))

        self.esa_importer.save_pruned_index(result)

        self.esa_importer.close_esa_db()

    ###############################################################################
    # Find the right person
    # Then, following a naive strong tie strategy, we could check for each question
    # which other users would have been asked following two strategies: (a) based
    # on the social network ties (the ones with strongest ties) and (b) based on
    # the content similarity (which answer is most similar to the question using
    # TF-IDF or ESA, whatever you like best). Finally, we can compare both results
    # with the ground truth (which users got actually asked in the dataset).
    ###############################################################################

    def calculate_esa_similarities_to_users(self):

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        logging.info("Calculating questions tfidf vectors ...")
        stack_dictionary = self._create_tf_idf_stack_vectors(
            only_questions=True)

        # For each question determine which other users would have been asked
        logging.info("Calculating questions/users similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        users = self.stack_importer.get_active_users()

        for question in question_corpus:
            print "Question " + str(question.id)
            q_vector = self.get_esa_vector(question.id, question.body,
                                           self.question_vectors[question.id],
                                           stack_dictionary, 1)
            q_vector_norm = norm(q_vector)
            similarities = []

            for user_id in users:
                user_body = self._create_user_tf_idf_stack_vector(
                    user_id, stack_dictionary)
                u_vector = self.get_esa_vector(user_id, user_body,
                                               self.user_vectors[user_id],
                                               stack_dictionary, 2)
                sim = self.similarity(q_vector, q_vector_norm, u_vector)
                similarities.append((question.id, user_id, sim))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

    def calculate_tfidf_similarities_to_users(self):

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        #self.esa_importer.create_clean_concept_doc_relation()
        self.esa_importer.create_clean_similarities_table()

        logging.info("Calculating questions tfidf vectors ...")
        stack_dictionary = self._create_tf_idf_stack_vectors(
            only_questions=True)

        # For each question determine which other users would have been asked
        logging.info("Calculating questions/users similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        users = self.stack_importer.get_active_users()

        for question in question_corpus:
            print "Question " + str(question.id)
            q_vector = self.question_vectors[question.id]
            q_vector_norm = norm(q_vector)
            similarities = []

            for user_id in users:
                user_body = self._create_user_tf_idf_stack_vector(
                    user_id, stack_dictionary)
                u_vector = self.user_vectors[user_id]
                sim = self.similarity(q_vector, q_vector_norm, u_vector)
                similarities.append((question.id, user_id, sim))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

    ###############################################################################
    # Experiments - Calculate statistics on the data
    ###############################################################################
    def initialize_experiments(self):

        self.experiments.open_experiment_db()
        self.experiments.create_experiments_db()
        self.experiments.close_experiment_db()

    def run_experiment_1(self):

        self.experiments.open_experiment_db()
        self.experiments.run_experiment_1(True)
        self.experiments.close_experiment_db()

    def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'):

        self.experiments.open_experiment_db()
        self.esa_importer.open_esa_db()
        self.stack_importer.open_stack_db()

        total_answers = self.stack_importer.get_number_of_answers()

        # Get number of answers for each question
        number_of_answers = self.stack_importer.get_number_of_original_answers(
        )

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_answers = {}
        original_answers = {}

        for question in question_corpus:
            original_answers[question.id] = \
                self.stack_importer.get_question_original_answers(question.id)
            similar_answers[question.id] = \
                self.esa_importer.load_similarities_for_question(
                    question.id, -1, False)

        self.stack_importer.close_stack_db()
        self.esa_importer.close_esa_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, total_answers + 1):
            logging.info("Calculating with limit %s", str(limit))

            avg_precision, avg_recall = self.experiments.run_experiment_1_avg(
                number_of_answers, original_answers, similar_answers,
                experiment_type, limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them to a file
        output_file = (self.setting["experiments_folder"] + experiment_type +
                       '_' + algorithm + '.dat')
        self.experiments.write_pr_curve(experiment_type, output_file)

        self.experiments.close_experiment_db()

        logging.info("\nDone")

    def run_experiment_2_avg(self, algorithm='esa'):
        """ Same as run_experiment_1_avg but similarities were 
		calculated with local data per user """

        self.run_experiment_1_avg('2_avg', algorithm)

    def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'):
        """ Similar to experiment_1, but checking users instead of answers """

        self.experiments.open_experiment_db()
        self.esa_importer.open_esa_db()
        self.stack_importer.open_stack_db()

        # Get the number of active users
        active_users = len(self.stack_importer.get_active_users())

        # Get the users that gave an answer to each question
        asked_users = self.stack_importer.get_original_users()

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_users = {}
        original_users = {}

        for question in question_corpus:

            aux = asked_users.get(question.id, None)
            if aux is not None:
                original_users[question.id] = aux
                similar_users[question.id] = \
                    self.esa_importer.load_similarities_for_question(
                        question.id, -1, False)

        self.stack_importer.close_stack_db()
        self.esa_importer.close_esa_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, active_users + 1):
            #print "Calculating with limit " + str(limit)
            logging.info("Calculating with limit %s", str(limit))

            avg_precision, avg_recall = self.experiments.run_experiment_3_avg(
                asked_users, original_users, similar_users, experiment_type,
                limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them to a file
        output_file = (self.setting["experiments_folder"] + experiment_type +
                       '_' + algorithm + '.dat')
        self.experiments.write_pr_curve(experiment_type, output_file)

        self.experiments.close_experiment_db()

        logging.info("\nDone")
Example #11
import pandas as pd
from model_training import ModelTraining
from preprocessing import Preprocessing
from metrics import Metrics
from data_source import DataSource
from experiments import Experiments
from model_inference import ModelInference

model = Experiments().run_experiment()

ModelTraining().model_training()

ModelInference().predict()
Example #12
class SocialExperimentsNode():
    def __init__(self):

        # parameters
        self.global_planner = rospy.get_param(
            'social_experiments/global_planner', '')
        self.local_planner = rospy.get_param(
            'social_experiments/local_planner', '')
        self.world_model_name = rospy.get_param(
            'social_experiments/world_model_name', '')
        self.robot_model_name = rospy.get_param(
            'social_experiments/robot_model_name', '')
        self.max_experiments = rospy.get_param(
            'social_experiments/max_experiments', 100)
        self.path_storage = rospy.get_param('social_experiments/path_storage',
                                            '')
        self.robot_vel = rospy.get_param('social_experiments/robot_vel', 0.3)
        self.space_factor_tolerance = rospy.get_param(
            'social_experiments/space_factor_tolerance', 5)
        self.time_factor_tolerance = rospy.get_param(
            'social_experiments/time_factor_tolerance', 5)
        # self.start_service = rospy.get_param('social_experiments/start_service', '/regions/start')
        # self.goal_service = rospy.get_param('social_experiments/goal_service', '/regions/goal')
        self.checkpoint_services = rospy.get_param(
            'social_experiments/checkpoint_services', '')

        if self.checkpoint_services == '':
            self.checkpoint_services = []
        else:
            self.checkpoint_services = list(
                self.checkpoint_services.split(" "))

        # log
        rospy.loginfo('global_planner: ' + self.global_planner)
        rospy.loginfo('local_planner: ' + self.local_planner)
        rospy.loginfo('world_model_name: ' + self.world_model_name)
        rospy.loginfo('robot: ' + self.robot_model_name)
        rospy.loginfo('robot vel: ' + str(self.robot_vel))
        rospy.loginfo('space factor tolerance: ' +
                      str(self.space_factor_tolerance))
        rospy.loginfo('time factor tolerance: ' +
                      str(self.time_factor_tolerance))
        rospy.loginfo('max experiments: ' + str(self.max_experiments))
        # rospy.loginfo('start service: ' + str(self.start_service))
        # rospy.loginfo('goal service: ' + str(self.goal_service))
        # rospy.loginfo('checkpoint services: ' + str(self.checkpoint_services))
        print('')

        # data
        self.data = []

        # init experiments
        self.ex = Experiments(self.global_planner, self.local_planner,
                              self.world_model_name, self.robot_model_name)

    def start_experiments(self):
        # experiments loop
        for i in range(0, self.max_experiments):
            rospy.loginfo('Preparing experiment %i/%i' %
                          (i + 1, self.max_experiments))
            self.data.append(Data())

            rospy.loginfo('Finding checkpoints...')
            self.data[-1].checkpoints = self.ex.get_checkpoints_random(
                "/regions/path")
            self.data[-1].path_executed.append(
                self.data[-1].checkpoints[0].pose.position)
            for n, cp in enumerate(self.data[-1].checkpoints):
                rospy.loginfo('checkpoint ' + str(n) + ': ' + '(x=' +
                              str(cp.pose.position.x) + ',y=' +
                              str(cp.pose.position.y) + ',ang=' +
                              str(cp.pose.orientation.z) + ')')

            rospy.loginfo('Finding a path plan...')
            for n in range(1, len(self.data[-1].checkpoints)):
                plan = self.ex.find_new_path(
                    self.data[-1].checkpoints[n - 1],
                    self.data[-1].checkpoints[n]).poses
                rospy.loginfo('Path plan from checkpoint ' + str(n - 1) +
                              ' to ' + str(n) + ': ' + str(len(plan)))
                self.data[-1].path_plan += plan
            rospy.loginfo('Total path plan size: ' +
                          str(len(self.data[-1].path_plan)))

            self.ex.reset_world()
            rospy.loginfo('Resetting world model')
            self.ex.reset_model(self.world_model_name)
            rospy.loginfo('Resetting robot model')
            self.ex.reset_model(self.robot_model_name,
                                self.data[-1].checkpoints[0].pose)

            rospy.loginfo("setting min dist and time to reach destination")
            (self.data[-1].space_min,
             self.data[-1].time_min) = self.ex.get_min_dist_time(
                 self.data[-1].path_plan, self.robot_vel)
            rospy.loginfo('Space min: ' + str(self.data[-1].space_min) +
                          ' meters')
            rospy.loginfo('Time min: ' + str(self.data[-1].time_min) +
                          ' seconds')

            rospy.loginfo("setting max dist and time to reach destination")
            self.data[-1].space_max = self.data[
                -1].space_min * self.space_factor_tolerance
            self.data[-1].time_max = self.data[
                -1].time_min * self.time_factor_tolerance
            rospy.loginfo('Space max: ' + str(self.data[-1].space_max) +
                          ' meters')
            rospy.loginfo('Time max: ' + str(self.data[-1].time_max) +
                          ' seconds')

            self.ex.robot_update(self.data[-1].checkpoints[0])
            rospy.loginfo('Start experiment %i/%i' %
                          (i + 1, self.max_experiments))
            self.ex.send_move_base_command(self.data[-1].checkpoints[1])
            rospy.loginfo('Experiment in progress...')

            self.data[-1].delta_space.append(0)
            self.data[-1].delta_time.append(rospy.Time.now())
            self.data[-1].total_space = 0
            self.data[-1].total_time = 0

            self.ex.start(self.data[-1])
            self.ex.cancel_all_goals()

            rospy.loginfo('Space elapsed: ' + str(self.data[-1].total_space) +
                          ' meters')
            rospy.loginfo('Time elapsed: ' + str(self.data[-1].total_time) +
                          ' seconds')
            rospy.loginfo('Status: ' + self.data[-1].status)
            rospy.loginfo('Finish experiment ' + str(i + 1) + '/' +
                          str(self.max_experiments))
            print('')

    def generate_csv(self):
        # print params
        file_params = open(self.path_storage + "/params.yaml", "w+")
        file_params.write("environment: " + str(self.world_model_name) + "\n")
        file_params.write("robot_name: " + str(self.robot_model_name) + "\n")
        file_params.write("robot_vel: " + str(self.robot_vel) + "\n")
        file_params.write("space_factor_tolerance: " +
                          str(self.space_factor_tolerance) + "\n")
        file_params.write("time_factor_tolerance: " +
                          str(self.time_factor_tolerance) + "\n")
        file_params.write("max_experiments: " + str(self.max_experiments) +
                          "\n")
        file_params.close()

        # print real time factor
        file_factor = open(self.path_storage + "/real_time_factor.json", "w+")
        i = 0
        list_f = []
        for e1 in self.data:
            list_f.append('"' + str(i) + '":[' +
                          ','.join([str(x) for x in e1.factor_array]) + ']')
            i += 1
        file_factor.write('{' + ',\n'.join([str(x) for x in list_f]) + '}')
        file_factor.close()

        # print localization error
        file_loc_err = open(self.path_storage + "/localization_error.json",
                            "w+")
        i = 0
        list_e = []
        for e1 in self.data:
            list_e.append(
                '"' + str(i) + '":[' +
                ','.join([str(x) for x in e1.localization_error_array]) + ']')
            i += 1
        file_loc_err.write('{' + ',\n'.join([str(x) for x in list_e]) + '}')
        file_loc_err.close()

        # print path plan
        file_path_min_x = open(self.path_storage + "/path_plan_x.json", "w+")
        file_path_min_y = open(self.path_storage + "/path_plan_y.json", "w+")
        i = 0
        list_ex = []
        list_ey = []
        for e1 in self.data:
            list_x = []
            list_y = []
            for e2 in e1.path_plan:
                list_x.append(e2.pose.position.x)
                list_y.append(e2.pose.position.y)
            list_ex.append('"' + str(i) + '":[' +
                           ','.join([str(x) for x in list_x]) + ']')
            list_ey.append('"' + str(i) + '":[' +
                           ','.join([str(y) for y in list_y]) + ']')
            i += 1
        file_path_min_x.write('{' + ',\n'.join([str(x)
                                                for x in list_ex]) + '}')
        file_path_min_y.write('{' + ',\n'.join([str(y)
                                                for y in list_ey]) + '}')
        file_path_min_x.close()
        file_path_min_y.close()

        # print path executed
        file_path_elapsed_x = open(self.path_storage + "/path_executed_x.json",
                                   "w+")
        file_path_elapsed_y = open(self.path_storage + "/path_executed_y.json",
                                   "w+")
        i = 0
        list_ex = []
        list_ey = []
        for e1 in self.data:
            list_x = []
            list_y = []
            for e2 in e1.path_executed:
                list_x.append(e2.x)
                list_y.append(e2.y)
            list_ex.append('"' + str(i) + '":[' +
                           ','.join([str(x) for x in list_x]) + ']')
            list_ey.append('"' + str(i) + '":[' +
                           ','.join([str(y) for y in list_y]) + ']')
            i += 1
        file_path_elapsed_x.write('{' + ',\n'.join([str(x)
                                                    for x in list_ex]) + '}')
        file_path_elapsed_y.write('{' + ',\n'.join([str(y)
                                                    for y in list_ey]) + '}')
        file_path_elapsed_x.close()
        file_path_elapsed_y.close()

        # print people
        file_people = open(self.path_storage + "/people.json", "w+")
        i = 0
        list_1 = []
        for e1 in self.data:
            list_2 = []
            for e2 in e1.people_array:
                list_3 = []
                for e3 in e2:
                    list_3.append('[' + str(e3.position.x) + ',' +
                                  str(e3.position.y) + ']')
                list_2.append('[' + ','.join([str(x) for x in list_3]) + ']')
            list_1.append('"' + str(i) + '":[' +
                          ','.join([str(x) for x in list_2]) + ']')
            i += 1
        file_people.write('{' + ',\n'.join([str(x) for x in list_1]) + '}')
        file_people.close()

        # print result
        file_result = open(self.path_storage + "/result.csv", "w+")
        file_result.write(
            "i,start_x,start_y,start_ang,goal_x,goal_y,goal_ang," +
            "space_min,time_min,space_elapsed,time_elapsed,status\n")
        i = 0
        for e1 in self.data:
            (_, _, start_yaw) = tf.transformations.euler_from_quaternion([
                e1.checkpoints[0].pose.orientation.x,
                e1.checkpoints[0].pose.orientation.y,
                e1.checkpoints[0].pose.orientation.z,
                e1.checkpoints[0].pose.orientation.w
            ])
            (_, _, goal_yaw) = tf.transformations.euler_from_quaternion([
                e1.checkpoints[-1].pose.orientation.x,
                e1.checkpoints[-1].pose.orientation.y,
                e1.checkpoints[-1].pose.orientation.z,
                e1.checkpoints[-1].pose.orientation.w
            ])
            file_result.write(str(i) + ",")
            file_result.write(str(e1.checkpoints[0].pose.position.x) + ",")
            file_result.write(str(e1.checkpoints[0].pose.position.y) + ",")
            file_result.write(str(start_yaw) + ",")
            file_result.write(str(e1.checkpoints[-1].pose.position.x) + ",")
            file_result.write(str(e1.checkpoints[-1].pose.position.y) + ",")
            file_result.write(str(goal_yaw) + ",")
            file_result.write(str(e1.space_min) + ",")
            file_result.write(str(e1.time_min) + ",")
            file_result.write(str(e1.total_space) + ",")
            file_result.write(str(e1.total_time) + ",")
            file_result.write(str(e1.status) + "\n")
            i += 1
        file_result.close()
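Editorial aside: the writer above assembles each JSON file by hand with string concatenation. Below is a minimal sketch of the same idea using Python's standard json module; the helper name dump_indexed_lists and the example data are illustrative assumptions, not part of the original node.

import json

def dump_indexed_lists(path, rows):
    # rows: one list of numbers per experiment; keys are the experiment
    # indices as strings, mirroring the '"i":[...]' layout written above.
    with open(path, "w") as f:
        json.dump({str(i): row for i, row in enumerate(rows)}, f)

# e.g. dump_indexed_lists("/tmp/localization_error.json", [[0.1, 0.2], [0.05]])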
Example #13
class ESA (object):
	""" ESA - Explicit Semantic Analysis """

	def __init__(self, setting):

		self.setting          = setting

		self.idf_values       = None

		self.wiki_corpus      = None
		self.wiki_dictionary  = None
		self.wiki_vectors     = []
		self.wiki_processor   = WikiPreprocessor(setting)
		self.wiki_importer    = WikiImporter(setting, self.wiki_processor)

		self.stack_corpus        = None
		self.answer_vectors      = {}
		self.question_vectors    = {}
		self.user_vectors        = {}
		self.user_content        = {}
		self.stack_importer      = StackImporter(setting)

		self.esa_importer        = ESAImporter(setting)
		self.inverted_index      = defaultdict(list)
		self.number_of_concepts  = 0

		self.experiments         = Experiments(setting)


	###############################################################################
	# Clean and load data
	###############################################################################
	def clean_and_load_data(self):
		""" Cleans the data and saves it in a database """

		self.wiki_importer.import_wiki_data()


	###############################################################################
	# Create and manage data used by ESA algorithm
	###############################################################################

	def build_esa_db(self):
		""" Initializes the ESA database """

		logging.info("\nCreating ESA database ...")

		self.esa_importer.open_esa_db()
	
		# Initialize database
		self.esa_importer.create_esa_db()

		# Save the dictionary and corpus of the Wikipedia data
		self.wiki_dictionary = self.wiki_importer.build_wiki_kb()

		# Save the inverse document frequencies in the ESA database
		number_of_documents = self.wiki_dictionary.num_docs #self.wiki_importer.get_number_of_concepts()
		self.esa_importer.save_wiki_inverse_document_frequencies(number_of_documents)

		self.esa_importer.close_esa_db()


	def load_esa_index(self):
		""" Gets the inverted index from the database """

		self.esa_importer.open_esa_db()

		self.esa_importer.get_pruned_inverted_index(self.inverted_index)
		logging.info("\nDone")

		self.esa_importer.close_esa_db()


	###############################################################################
	# Build TF-IDF Vectors
	###############################################################################

	def create_tf_idf_vectors(self):
		""" Creates them if not already in database """

		self.esa_importer.open_esa_db()

		# Calculate tfidf vectors for the Wikipedia articles
		self.create_tf_idf_wiki_vectors()

		# Save terms and vectors to ESA db
		#self.esa_importer.save_inverted_index(self.wiki_vectors)

		logging.info("\nDone")

		self.esa_importer.close_esa_db()


	def create_tf_idf_wiki_vectors(self):
		""" Keeping only non-zero entries of the vectors """

		wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary()
		
		logging.info("Retrieving idf values ...")
		inv_doc_freq = {}
		self.esa_importer.get_wiki_inverse_document_frequencies(inv_doc_freq)

		logging.info("Building the tfidf vectors and the inverse index ...")
		tfidf_model    = TfidfModel(self.wiki_dictionary, inv_doc_freq)
		inverted_index = defaultdict(list)
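		# inverted_index maps term_id -> [(document_id, tfidf weight), ...], i.e. for each
		# term the Wikipedia concepts that contain it and the term's weight in each concept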

		for document in wiki_corpus:
			vector = tfidf_model[document]
			
			for term_id, value in vector:
				inverted_index[term_id].append( (document.document_id, value) )

			#print "Added " + str(document.document_id)
		
		logging.info("\n\tDone.")
		self.esa_importer.save_inverted_index(inverted_index)

		self.save_index_to_file(inverted_index)


	def _create_tf_idf_stack_vectors(self, only_questions=False):
		""" Create the tfidf vectors for the Stackexchange data. """

		# Load question and answer corpus
		logging.info("Loading stack corpus and dictionary ...")
		question_corpus = self.stack_importer.get_question_corpus()
		answer_corpus   = self.stack_importer.get_answer_corpus()

		corpus     = question_corpus + answer_corpus
		dictionary = self.stack_importer.get_dictionary_from_corpora([question_corpus, answer_corpus])
		dict_size  = len(dictionary)

		# Save stack dictionary
		stack_dict = {}
		for word_id, word in enumerate(dictionary.token2id):
			stack_dict[unicode(word)] = word_id

		self.idf_values = zeros(dict_size)

		logging.info("Determining question vectors ...")
		questions = StackCorpus(self.stack_importer.connection, "question")
		for question in questions:
			question_vector = zeros(dict_size)

			for word in question.body:
				word_id = stack_dict.get(unicode(word), -1)

				if word_id != -1:
					question_vector[word_id] = self.tf_idf(word, word_id, question.body, corpus)

			self.question_vectors[question.id] = question_vector

		logging.info("\n\tDone.")

		if only_questions: # Skip the answers
			return stack_dict

		logging.info("Determining answer vectors ...")
		answers   = StackCorpus(self.stack_importer.connection, "answer")
		
		for answer in answers:
			answer_vector = zeros(dict_size)

			for word in answer.body:
				word_id = stack_dict.get(unicode(word), -1)

				if word_id != -1:
					tf_idf = self.tf_idf(word, word_id, answer.body, corpus)
					answer_vector[word_id] = tf_idf

			self.answer_vectors[answer.id] = answer_vector

		logging.info("\n\tDone.")

		return stack_dict


	def _create_local_tf_idf_stack_vectors(self, user_id):
		""" Create the tfidf vectors for the local Stackexchange data of the given user """

		# Load question and answer corpus
		#logging.info("Loading stack corpus and dictionary ...")
		question_corpus = self.stack_importer.get_user_question_corpus(user_id)
		answer_corpus   = self.stack_importer.get_user_answer_corpus(user_id)

		corpus     = question_corpus + answer_corpus
		dictionary = self.stack_importer.get_dictionary_from_corpora([question_corpus, answer_corpus])
		dict_size  = len(dictionary)

		# Save stack dictionary
		stack_dict = {}
		for word_id, word in enumerate(dictionary.token2id):
			stack_dict[unicode(word)] = word_id

		self.idf_values = zeros(dict_size)

		#logging.info("Determining question vectors ...")
		questions = self.stack_importer.get_user_local_questions(user_id)

		for question in questions:
			question_vector = zeros(dict_size)

			for word in question.body:
				word_id = stack_dict.get(unicode(word), -1)

				if word_id != -1:
					question_vector[word_id] = self.tf_idf(word, word_id, question.body, corpus)

			self.question_vectors[question.id] = question_vector

		#logging.info("\n\tDone.")


		#logging.info("Determining answer vectors ...")
		answers = self.stack_importer.get_user_local_answers(user_id)

		for answer in answers:
			answer_vector = zeros(dict_size)

			for word in answer.body:
				word_id = stack_dict.get(unicode(word), -1)

				if word_id != -1:
					tf_idf = self.tf_idf(word, word_id, answer.body, corpus)
					answer_vector[word_id] = tf_idf

			self.answer_vectors[answer.id] = answer_vector

		#logging.info("\n\tDone.")

		return stack_dict


	def _create_user_tf_idf_stack_vector(self, user_id, stack_dict):
		""" Create the tfidf vector representation of a user, based on her answers"""
		
		aux = self.user_content.get(user_id, None)
		if aux is not None:
			return aux

		user_corpus = []
		user_words  = []
		answers = self.stack_importer.get_user_answers_to_questions(user_id)
		for answer in answers:
			user_corpus.append(answer.body)
			for word in answer.body:
				user_words.append(word)

		self.user_content[user_id] = user_words
		
		dict_size   = len(stack_dict)
		user_vector = zeros(dict_size)

		for word in set(user_words):
			word_id = stack_dict.get(unicode(word), -1)

			if word_id != -1:
				tf_idf = self.tf_idf(word, word_id, user_words, user_corpus)
				user_vector[word_id] = tf_idf

		self.user_vectors[user_id] = user_vector

		return user_words



	@staticmethod
	def tf(word, document):
		""" Returns the normalized frequency of the word in the given document """

		word_count = document.count(unicode(word))
		return float(word_count) / len(document)


	@staticmethod
	def df(word, corpus):
		""" Returns the number of documents in the collection that contain the given word """

		return sum(1 for document in corpus if unicode(word) in document)


	#@staticmethod
	def idf(self, word, corpus):
		""" Returns the inverse document frequency of the word in the documents collection """

		# the whole N/df ratio belongs inside the logarithm: idf = log(N / df)
		return math.log(float(len(corpus)) / self.df(word, corpus))


	def tf_idf(self, word, word_index, document, corpus):
		""" Returns the TF-IDF value for the given 
		word in the document of the corpus """

		# Calculate the term frequency value (tf)
		tf = self.tf(word, document)
		if tf == 0.0:
			return 0.0

		# Calculate the inverse document frequency value (idf)
		if self.idf_values[word_index] == 0.0:
			self.idf_values[word_index] = self.idf(word, corpus)

		return float(tf * self.idf_values[word_index])
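	# Editorial note: together the helpers above implement the standard weighting
	# tf-idf(w, d) = tf(w, d) * idf(w), with tf(w, d) = count(w, d) / |d| and
	# idf(w) = log(N / df(w)); idf values are cached in self.idf_values so each
	# word's corpus scan happens at most once per vector-building pass.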



	###############################################################################
	# Associations and Similarities of Stackexchange questions/answers using
	# Wikipedia's articles as concepts.
	###############################################################################

	def calculate_similarities(self):
		""" Applies the ESA algorithm to the global stack data """

		# Open database connections
		self.stack_importer.open_stack_db()
		self.esa_importer.open_esa_db()

		# Clean tables
		logging.info("Cleaning similarity tables ...")
		self.esa_importer.create_clean_concept_doc_relation()
		self.esa_importer.create_clean_similarities_table()

		logging.info("Loading the inverted index ...")
		self.esa_importer.get_pruned_inverted_index(self.inverted_index)

		#print "Has beer " + str(self.inverted_index.get(unicode("beer"), None))

		logging.info("Calculating stack tfidf vectors ...")
		stack_dictionary = self._create_tf_idf_stack_vectors()

		# For each question calculate similarity with each answer
		logging.info("\nCalculating questions-answers similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")
		
		for question in question_corpus:
			q_vector      = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1)
			q_vector_norm = norm(q_vector)
			similarities  = []

			answer_corpus = StackCorpus(self.stack_importer.connection, "answer")

			for answer in answer_corpus:
				a_vector  = self.get_esa_vector(answer.id, answer.body, self.answer_vectors[answer.id], stack_dictionary, 2)
				sim       = self.similarity(q_vector, q_vector_norm, a_vector)
				similarities.append( (question.id, answer.id, sim) )
			
			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()

		logging.info("\nDone")


	def calculate_tf_idf_similarities(self):
		"""Applies the TF-IDF algorithm to the global stack data"""

		# Open database connections
		self.stack_importer.open_stack_db()
		self.esa_importer.open_esa_db()

		# Clean tables
		logging.info("Cleaning similarity tables ...")
		self.esa_importer.create_clean_similarities_table()

		logging.info("Calculating stack tfidf vectors ...")
		stack_dictionary = self._create_tf_idf_stack_vectors()

		# For each question calculate similarity with each answer
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		logging.info("\nCalculating questions-answers similarities ...")
		for question in question_corpus:
			q_vector      = self.question_vectors[question.id]
			q_vector_norm = norm(q_vector)
			similarities  = []

			answer_corpus = StackCorpus(self.stack_importer.connection, "answer")
			for answer in answer_corpus:
				a_vector  = self.answer_vectors[answer.id]
				sim       = self.similarity(q_vector, q_vector_norm, a_vector)
				similarities.append( (question.id, answer.id, sim) )
			
			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()

		logging.info("\nDone")


	def calculate_local_tfidf_similarities(self):
		""" Applies TF-IDF to the local stack data, in order
		to calculate questions/answers similarities. The local
		data is measured per user.
		Returns the list of users that were filtered. """

		# Keep filtered users
		filtered_users = []

		# Open database connections
		self.esa_importer.open_esa_db()
		self.stack_importer.open_stack_db()

		# Clean similarity table
		self.esa_importer.create_clean_similarities_table()

		# For each question calculate its similarity with all the answers given
		# by the users who answered the given question
		logging.info("Calculating questions/answers similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		for question in question_corpus:

			print "Question " + str(question.id)
			similarities  = []

			# Get the users that gave an answer to the question
			users = self.stack_importer.get_users_from_question(question.id)
			print "Users that replied: " + str(len(users))

			# Calculate the similarities of question with all
			# answers from the given users (related or not to question)
			for user_id in users:
				user_answers = self.stack_importer.get_user_answers_to_questions(user_id)

				# Only consider users with more than 5 answers
				if len(user_answers) > 5:

					print "User " + str(user_id)
					a = []
					for answer in user_answers:
						a.append(answer.id)
					print a

					# Calculate tf_idf vectors for the given user
					self.question_vectors.clear()
					self.answer_vectors.clear()
					stack_dictionary = self._create_local_tf_idf_stack_vectors(user_id)

					q_vector      = self.question_vectors[question.id]
					q_vector_norm = norm(q_vector)

					for answer in user_answers:
						a_vector = self.answer_vectors[answer.id]
						sim      = self.similarity(q_vector, q_vector_norm, a_vector)
						similarities.append( (question.id, answer.id, sim) )

				else:
					filtered_users.append(user_id)


			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		# Close database connections
		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()
		logging.info("\nDone")

		return filtered_users


	def calculate_local_esa_similarities(self):
		""" Applies the ESA algorithm to the local stack data.
		This local data is measured per user. Returns the list
		of filtered users """

		# Keep filtered users
		filtered_users = []

		# Open database connections
		self.stack_importer.open_stack_db()
		self.esa_importer.open_esa_db()

		# Clean tables
		logging.info("Cleaning similarity tables ...")
		#self.esa_importer.create_clean_concept_doc_relation()
		self.esa_importer.create_clean_similarities_table()

		logging.info("Loading the inverted index ...")
		self.esa_importer.get_pruned_inverted_index(self.inverted_index)

		# For each question calculate its similarity with all the answers given
		# by the users who answered the given question
		logging.info("Calculating questions/answers similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		for question in question_corpus:

			print "Question " + str(question.id)
			similarities  = []

			# Get the users that gave an answer to the question
			users = self.stack_importer.get_users_from_question(question.id)
			print "Users that replied: " + str(len(users))

			# Calculate the similarities of question with all
			# answers from the given users (related or not to question)
			for user_id in users:
				user_answers = self.stack_importer.get_user_answers_to_questions(user_id)

				# Only consider users with more than 5 answers
				if len(user_answers) > 5:
					print "User " + str(user_id)

					# Calculate tf_idf vectors for the given user
					self.question_vectors.clear()
					self.answer_vectors.clear()
					stack_dictionary = self._create_local_tf_idf_stack_vectors(user_id)

					q_vector      = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1)
					q_vector_norm = norm(q_vector)

					for answer in user_answers:
						a_vector  = self.get_esa_vector(answer.id, answer.body, self.answer_vectors[answer.id], stack_dictionary, 2)
						sim       = self.similarity(q_vector, q_vector_norm, a_vector)
						similarities.append( (question.id, answer.id, sim) )

				else:
					filtered_users.append(user_id)


			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()

		logging.info("\nDone")

		return filtered_users


	def get_esa_vector(self, id, document, tfidf_vector, dictionary, type):
		""" Creates the interpretation vector of the given document.
		- The document should be a set of tokens, already preprocessed
		- The vector represents the relatedness of the document
		with all the Wikipedia articles
		- Type indicates the type of document: question (1) or answer (2) """

		# Interpretation vector with dimensions = Wikipedia articles
		interpretation = zeros(2080905)

		for token in set(document):
			documents = self.inverted_index.get(unicode(token), None)
			word_id   = dictionary.get(unicode(token), -1)

			if documents is not None and word_id != -1:
				#print str(len(documents))
				for document_id, value in documents:
					interpretation[document_id] += (value * tfidf_vector[word_id])

		return interpretation
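	# Editorial note: each concept's score above is the sum, over the document's distinct
	# tokens, of the token's weight for that concept (taken from the inverted index)
	# multiplied by the token's tfidf weight within the document itself.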


	def similarity(self, vector1, norm_vector1, vector2):
		""" Calculates the cosine similarity between the given vectors """

		# Cosine similarity
		sim = float(dot(vector1, vector2) / (norm_vector1 * norm(vector2)))
		return sim
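	# Editorial note: if either vector has zero norm (e.g. an empty document), the division
	# above yields nan/inf rather than a usable similarity; callers may want to guard this.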


	def save_relatedness_to_file(self, file_name):

		self.esa_importer.open_esa_db()
		self.esa_importer.write_relatedness_to_file(file_name)
		self.esa_importer.close_esa_db()



	### EXTRA ###
	def save_index_to_file(self, index=None, file_name='../data/ESA/index.txt'):

		# If no index was passed in, extract it from the DB
		if index is None:
			index = defaultdict(list)
			self.esa_importer.open_esa_db()
			self.esa_importer.get_pruned_inverted_index(index)
			self.esa_importer.close_esa_db()

		# Copy to file
		logging.info("Saving them in a file ...")
		with open(file_name, 'a') as f:
			for word, doc_list in index.iteritems():
				#print word
				f.write(word + '\n')
				f.write(' '.join([str(x) for x in doc_list]))
				f.write('\n')


	def testing_beer_concept(self):

		tfidf_norm_values  = []
		tfidf_values       = []
		append_values      = tfidf_values.append
		append_norm_values = tfidf_norm_values.append

		self.esa_importer.open_esa_db()
		wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary()
		
		# IDF is fixed
		idf = 4.8225774331876625
		df  = 0

		for document in wiki_corpus:

			content = document.content.split(' ')

			if unicode("beer") in content:

				doc_tf  = defaultdict(float)
				size    = 0 # length of the document
				df     += 1

				# Faster than Counter
				for word in content:
					doc_tf[word] += 1.0
					size         += 1

				# Calculate tfidf value for word "beer" in Wiki data
				norm_value = (doc_tf[unicode("beer")] / size) * idf
				value      = doc_tf[unicode("beer")] * idf

				append_values( (document.document_id, value) )
				append_norm_values( (document.document_id, norm_value) )

		print "DF : " + str(df)

		# Sort each list
		sorted_norm_values = sorted(tfidf_norm_values, key=itemgetter(1))
		sorted_norm_values = sorted_norm_values[::-1]
		sorted_values      = sorted(tfidf_values, key=itemgetter(1))
		sorted_values      = sorted_values[::-1]

		# Print each list, sorted by tfidf value in descending order
		print "Normalized : "
		print ' , '.join([str(id) + " " + str(value) for id,value in sorted_norm_values])

		print "\nNot normalized"
		print ' , '.join([str(id) + " " + str(value) for id,value in sorted_values])

		self.esa_importer.close_esa_db()


	def prun_inverted_index(self):
		""" Prun the inverted index """

		self.esa_importer.open_esa_db()

		index  = EsaIndex(self.esa_importer.connection)
		result = []
		append = result.append

		for term, vector in index:
			append( (term, vector) )

		self.esa_importer.save_pruned_index(result)

		self.esa_importer.close_esa_db()


	###############################################################################
	# Find the right person
	# Then, following a naive strong tie strategy, we could check for each question
	# which other users would have been asked following two strategies: (a) based 
	# on the social network ties (the ones with strongest ties) and (b) based on 
	# the content similarity (which answer is most similar to the question using 
	# TF-IDF or ESA, whatever you like best). Finally, we can compare both results 
	# with the ground truth (which users got actually asked in the dataset).
	###############################################################################

	def calculate_esa_similarities_to_users(self):

		# Open database connections
		self.stack_importer.open_stack_db()
		self.esa_importer.open_esa_db()

		# Clean tables
		logging.info("Cleaning similarity tables ...")
		self.esa_importer.create_clean_similarities_table()

		logging.info("Loading the inverted index ...")
		self.esa_importer.get_pruned_inverted_index(self.inverted_index)

		logging.info("Calculating questions tfidf vectors ...")
		stack_dictionary = self._create_tf_idf_stack_vectors(only_questions=True)

		# For each question determine which other users would have been asked
		logging.info("Calculating questions/users similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		users = self.stack_importer.get_active_users()

		for question in question_corpus:
			print "Question " + str(question.id)
			q_vector      = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1)
			q_vector_norm = norm(q_vector)
			similarities  = []

			for user_id in users:
				user_body = self._create_user_tf_idf_stack_vector(user_id, stack_dictionary)
				u_vector  = self.get_esa_vector(user_id, user_body, self.user_vectors[user_id], stack_dictionary, 2)
				sim       = self.similarity(q_vector, q_vector_norm, u_vector)
				similarities.append( (question.id, user_id, sim) )

			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()

		logging.info("\nDone")


	def calculate_tfidf_similarities_to_users(self):

		# Open database connections
		self.stack_importer.open_stack_db()
		self.esa_importer.open_esa_db()

		# Clean tables
		logging.info("Cleaning similarity tables ...")
		#self.esa_importer.create_clean_concept_doc_relation()
		self.esa_importer.create_clean_similarities_table()

		logging.info("Calculating questions tfidf vectors ...")
		stack_dictionary = self._create_tf_idf_stack_vectors(only_questions=True)

		# For each question determine which other users would have been asked
		logging.info("Calculating questions/users similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		users = self.stack_importer.get_active_users()
	
		for question in question_corpus:
			print "Question " + str(question.id)
			q_vector      = self.question_vectors[question.id]
			q_vector_norm = norm(q_vector)
			similarities  = []

			for user_id in users:
				user_body = self._create_user_tf_idf_stack_vector(user_id, stack_dictionary)
				u_vector  = self.user_vectors[user_id]
				sim       = self.similarity(q_vector, q_vector_norm, u_vector)
				similarities.append( (question.id, user_id, sim) )

			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.esa_importer.save_similarities(similarities)

		self.esa_importer.close_esa_db()
		self.stack_importer.close_stack_db()

		logging.info("\nDone")




	###############################################################################
	# Experiments - Calculate statistics on the data
	###############################################################################
	def initialize_experiments(self):

		self.experiments.open_experiment_db()
		self.experiments.create_experiments_db()
		self.experiments.close_experiment_db()

	def run_experiment_1(self):

		self.experiments.open_experiment_db()
		self.experiments.run_experiment_1(True)
		self.experiments.close_experiment_db()


	def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'):

		self.experiments.open_experiment_db()
		self.esa_importer.open_esa_db()
		self.stack_importer.open_stack_db()

		total_answers = self.stack_importer.get_number_of_answers()

		# Get number of answers for each question
		number_of_answers = self.stack_importer.get_number_of_original_answers()

		# Load similarities for each question
		logging.info("Loading similarities ...")
		question_corpus  = StackCorpus(self.stack_importer.connection, "question")
		similar_answers  = {}
		original_answers = {}
		
		for question in question_corpus:
			original_answers[question.id] = self.stack_importer.get_question_original_answers(question.id)
			similar_answers[question.id]  = self.esa_importer.load_similarities_for_question(question.id, -1, False)

		self.stack_importer.close_stack_db()
		self.esa_importer.close_esa_db()


		# Calculate avg precision and recall for each case
		precision = {}
		recall    = {}
		for limit in xrange(1,total_answers+1):
			logging.info("Calculating with limit %s", str(limit))

			avg_precision, avg_recall = self.experiments.run_experiment_1_avg(number_of_answers,
				original_answers, similar_answers, experiment_type, limit)
			precision[limit] = avg_precision
			recall[limit]    = avg_recall
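		# Editorial note: the loop above sweeps the cut-off "limit" from 1 to the total number
		# of answers, so precision[limit] / recall[limit] trace out the precision-recall curve
		# that write_pr_curve writes to disk below.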

		# Save into the database
		self.experiments.save_experiment_results(experiment_type, precision, recall)

		# Write them in a file
		folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
		self.experiments.write_pr_curve(experiment_type, folder)

		self.experiments.close_experiment_db()

		logging.info("\nDone")


	def run_experiment_2_avg(self, algorithm='esa'):
		""" Same as run_experiment_1_avg but similarities were 
		calculated with local data per user """

		self.run_experiment_1_avg('2_avg', algorithm)


	def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'):
		""" Similar to experiment_1, but checking users instead of answers """

		self.experiments.open_experiment_db()
		self.esa_importer.open_esa_db()
		self.stack_importer.open_stack_db()

		# Get the number of active users
		active_users = len(self.stack_importer.get_active_users())

		# Get the users that gave an answer to each question
		asked_users = self.stack_importer.get_original_users()

		# Load similarities for each question
		logging.info("Loading similarities ...")
		question_corpus  = StackCorpus(self.stack_importer.connection, "question")
		similar_users  = {}
		original_users = {}

		for question in question_corpus:

			aux = asked_users.get(question.id, None)
			if aux is not None:
				original_users[question.id] = aux
				similar_users[question.id]  = self.esa_importer.load_similarities_for_question(question.id, -1, False)

		self.stack_importer.close_stack_db()
		self.esa_importer.close_esa_db()


		# Calculate avg precision and recall for each case
		precision = {}
		recall    = {}
		for limit in xrange(1,active_users+1):
			#print "Calculating with limit " + str(limit)
			logging.info("Calculating with limit %s", str(limit))

			avg_precision, avg_recall = self.experiments.run_experiment_3_avg(asked_users,
				original_users, similar_users, experiment_type, limit)
			precision[limit] = avg_precision
			recall[limit]    = avg_recall

		# Save into the database
		self.experiments.save_experiment_results(experiment_type, precision, recall)

		# Write them in a file
		folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
		self.experiments.write_pr_curve(experiment_type, folder)

		self.experiments.close_experiment_db()

		logging.info("\nDone")
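Editorial aside: the class above exposes its pipeline as separate methods. The minimal driver sketched below shows one plausible call order, inferred from the code; the contents of the setting dictionary are an assumption and depend on the importers' configuration keys.

# Hypothetical driver for the ESA pipeline above; the `setting` keys are assumed.
setting = {"experiments_folder": "../data/experiments/"}  # plus importer-specific keys

esa = ESA(setting)
esa.clean_and_load_data()       # import and preprocess the Wikipedia dump
esa.build_esa_db()              # create the ESA database and save the idf values
esa.create_tf_idf_vectors()     # build the concept tfidf vectors / inverted index
esa.calculate_similarities()    # ESA similarities between stack questions and answers
esa.initialize_experiments()    # create the experiments database
esa.run_experiment_1_avg(algorithm='esa')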
Example #14
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
myapp runs planout as a service
"""
from flask import Flask, jsonify, request
from experiments import Experiments

experiments = Experiments()

# Create the application, elastic beanstalk expects the name "application"
app = Flask(__name__)

@app.route("/")
def get_experiments_for_team():
    """Return JSON for team's experiments

    get_experiments_for_team returns experiments JSON for all experiments
    associated with a team

    Args:
        team_name: name of the team (group_id)
        unit: unique identifier for user
Example #15
    if args.plot:
        plot_times_by_batch(args.database)
    else:
        if args.load_database:
            exps = pkl.load(open(args.database))
        else:
            ## Determine the type of sparsity layer to use
            if args.layer_class == 'HiddenRandomBlockLayer':
                layer_class = HiddenRandomBlockLayer
            else:
                layer_class = HiddenBlockLayer

            ## Create experiments
            exps = Experiments(
                input_dim=784,  # data.train_set_x.shape[-1].eval(),
                num_classes=10)

            # Add descriptions of models
            exps.add_layers_description(
                0, {
                    'n_hids': (25, ),
                    'n_units_per': args.units_per_block,
                    'k_pers': (1, 1),
                    'activations': (T.tanh, None),
                    'layer_classes': [
                        HiddenBlockLayer,
                        HiddenBlockLayer,
                    ],
                })
            exps.add_layers_description(
Example #16
class LDA(object):
    """ LDA - Latent Dirichlet Porcesses """
    def __init__(self, setting):

        self.setting = setting
        self.mallet_path = setting['malletpath']
        self.number_of_topics = setting['nooftopics']
        self.number_of_iter = setting['noofiterations']

        self.stack_importer = StackImporter(setting)
        self.lda_importer = LDAImporter(setting)
        self.experiments = Experiments(setting)

        self.model = None
        self.corpus = None
        self.dictionary = None
        self.answer_corpus = None

        directory = self.setting['lda_folder']
        file_name = 'local_lda_model' + self.setting['theme'] + '.gs'
        self.path = ''.join([directory, file_name])

    def __iter__(self):

        for document in self.corpus:
            yield self.dictionary.doc2bow(document)

    def calculate_similarities(self):

        # Open database connections
        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        # Clean similarity table
        self.lda_importer.create_clean_similarities_table()

        self._learn_model()

        logging.info("Loading dictionary ...")
        self._load_dictionary()

        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            print "Question " + str(question.id)
            similarities = []
            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")

            # Get topics in the question
            bow = self.dictionary.doc2bow(question.body)
            question_topics = self.model[bow]

            for answer in answer_corpus:

                # Get topics in the answer
                bow = self.dictionary.doc2bow(answer.body)
                answer_topics = self.model[bow]

                # Similarities
                similarities.append(
                    (question.id, answer.id,
                     self._compare_documents(question_topics, answer_topics)))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.lda_importer.save_similarities(similarities)

        # Close database connections
        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

    def _learn_model(self):
        self.model = models.wrappers.LdaMallet(
            self.mallet_path,
            corpus=self,
            num_topics=self.number_of_topics,
            id2word=self.dictionary,
            iterations=self.number_of_iter)

    def _load_dictionary(self):

        self.stack_importer.open_stack_db()

        # Load dictionary
        question_corpus = self.stack_importer.get_question_corpus()
        answer_corpus = self.stack_importer.get_answer_corpus()
        corpus = question_corpus + answer_corpus
        self.dictionary = self.stack_importer.get_dictionary_from_corpora(
            [question_corpus, answer_corpus])

        self.stack_importer.close_stack_db()

    def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'):

        self.experiments.open_experiment_db()
        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        total_answers = self.stack_importer.get_number_of_answers()

        # Get number of answers for each question
        number_of_answers = self.stack_importer.get_number_of_original_answers(
        )

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_answers = {}
        original_answers = {}

        for question in question_corpus:
            original_answers[
                question.
                id] = self.stack_importer.get_question_original_answers(
                    question.id)
            similar_answers[
                question.
                id] = self.lda_importer.load_similarities_for_question(
                    question.id, -1, False)

        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, total_answers + 1):
            logging.info("Calculating with limit %s", str(limit))

            avg_precision, avg_recall = self.experiments.run_experiment_1_avg(
                number_of_answers, original_answers, similar_answers,
                experiment_type, limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them in a file
        folder = self.setting[
            "experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
        self.experiments.write_pr_curve(experiment_type, folder)

        self.experiments.close_experiment_db()

        logging.info("\nDone")

    ###############################################################################
    # Create the local model
    ###############################################################################

    def calculate_local_similarities(self):
        """ Calculates similarities between local questions/answers.
			Returns the list of filtered users """

        # Keep filtered users
        filtered_users = []

        # Open database connections
        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        # Clean similarity table
        self.lda_importer.create_clean_similarities_table()

        # For each question calculate its similarity with all the answers given
        # by the users who answered the given question
        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            print "Question " + str(question.id)

            similarities = []

            # Get the users that gave an answer to the question
            users = self.stack_importer.get_users_from_question(question.id)
            print "Users that replied: " + str(len(users))

            # Calculate the similarities of question with all
            # answers from the given users (related or not to question)
            for user_id in users:
                user_answers = self.stack_importer.get_user_answers_to_questions(
                    user_id)

                # Only consider users with more than 5 answers
                if len(user_answers) > 5:
                    print "User " + str(user_id)

                    self._learn_local_model(user_id)

                    # Get topics in the question
                    bow = self.dictionary.doc2bow(question.body)
                    question_topics = self.model[bow]

                    # Get topics in the answers and calculate similarities with current question
                    for answer in user_answers:
                        bow = self.dictionary.doc2bow(answer.body)
                        answer_topics = self.model[bow]

                        # Similarities
                        similarities.append(
                            (question.id, answer.id,
                             self._compare_documents(question_topics,
                                                     answer_topics)))
                else:
                    filtered_users.append(user_id)

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.lda_importer.save_similarities(similarities)

        # Close database connections
        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

        return filtered_users

    def _learn_local_model(self, user_id):
        """ Learns the LDA model with local knowledge """

        # Load question and answer corpus
        question_corpus = self.stack_importer.get_user_question_corpus(user_id)
        self.answer_corpus = self.stack_importer.get_user_answer_corpus(
            user_id)
        self.corpus = question_corpus + self.answer_corpus
        self.dictionary = self.stack_importer.get_dictionary_from_corpora(
            [question_corpus, self.answer_corpus])

        # Create model
        self.model = models.wrappers.LdaMallet(
            self.mallet_path,
            corpus=self,
            num_topics=self.number_of_topics,
            id2word=self.dictionary,
            iterations=self.number_of_iter)

    @staticmethod
    def _compare_documents(document1, document2):
        """ Calculates the distance between the given documents """

        doc1_topic_description = []
        doc2_topic_description = []

        for (topic, weight) in document1:
            doc1_topic_description.append(weight)

        for (topic, weight) in document2:
            doc2_topic_description.append(weight)

        return Metric.js_distance(doc1_topic_description,
                                  doc2_topic_description)
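        # Editorial note: Metric.js_distance is a project helper; from its use here it
        # presumably returns the Jensen-Shannon distance between the two topic-weight
        # lists, so smaller values indicate more similar topic distributions.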

    def run_experiment_2_avg(self,
                             experiment_type='2_avg',
                             algorithm='lda_local_2'):

        self.experiments.open_experiment_db()

        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        total_answers = self.stack_importer.get_number_of_answers()

        # Get number of answers for each question
        number_of_answers = self.stack_importer.get_number_of_original_answers(
        )

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_answers = {}
        original_answers = {}

        for question in question_corpus:
            original_answers[
                question.
                id] = self.stack_importer.get_question_original_answers(
                    question.id)
            similar_answers[
                question.
                id] = self.lda_importer.load_similarities_for_question(
                    question.id, -1, False)

        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, total_answers + 1):
            print "Calculating with limit " + str(limit)

            avg_precision, avg_recall = self.experiments.run_experiment_1_avg(
                number_of_answers, original_answers, similar_answers,
                experiment_type, limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them in a file
        folder = self.setting[
            "experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
        self.experiments.write_pr_curve(experiment_type, folder)

        self.experiments.close_experiment_db()

        logging.info("\nDone")
Example #17
                                         classification_col=2)
#	test
test_data = np.loadtxt("data/test.txt", delimiter=",")
x_test, y_test = timestamped_to_vector(test_data,
                                       timestamp_col=0,
                                       time_start=1,
                                       classification_col=2)
#	all data
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

# random search of hyperparameters
expt = Experiments.Experiment(Configs.get_all(),
                              folds=10,
                              search_algorithm="random",
                              data=(x_train, y_train),
                              folder_name="random_search",
                              thresholding=True,
                              threshold=0.5)
expt.run_experiments(num_experiments=400)

# Config A with separate test set
params_A = Configs.get_A()
params_A["sequence_length"] = list(range(1, 31))  # total real time length

expt = Experiments.Experiment(params_A,
                              search_algorithm="grid",
                              x_test=x_test,
                              y_test=y_test,
                              x_train=x_train,
                              y_train=y_train,
Example #18
class LDA (object):
	""" LDA - Latent Dirichlet Porcesses """

	def __init__(self, setting):

		self.setting          = setting
		self.mallet_path      = setting['malletpath']
		self.number_of_topics = setting['nooftopics']
		self.number_of_iter   = setting['noofiterations']

		self.stack_importer   = StackImporter(setting)
		self.lda_importer     = LDAImporter(setting)
		self.experiments      = Experiments(setting)

		self.model            = None
		self.corpus           = None
		self.dictionary       = None
		self.answer_corpus    = None

		directory = self.setting['lda_folder']
		file_name = 'local_lda_model' + self.setting['theme'] + '.gs'
		self.path = ''.join([directory, file_name])


	def __iter__(self):

		for document in self.corpus:
			yield self.dictionary.doc2bow(document)


	def calculate_similarities(self):

		# Open database connections
		self.lda_importer.open_lda_db()
		self.stack_importer.open_stack_db()

		# Clean similarity table
		self.lda_importer.create_clean_similarities_table()

		self._learn_model()

		logging.info("Loading dictionary ...")
		self._load_dictionary()

		logging.info("Calculating questions/answers similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		for question in question_corpus:

			print "Question " + str(question.id)
			similarities  = []
			answer_corpus = StackCorpus(self.stack_importer.connection, "answer")

			# Get topics in the question
			bow = self.dictionary.doc2bow(question.body)
			question_topics = self.model[bow]

			for answer in answer_corpus:

				# Get topics in the answer
				bow = self.dictionary.doc2bow(answer.body)
				answer_topics = self.model[bow]

				# Similarities
				similarities.append((question.id, answer.id, self._compare_documents(question_topics, answer_topics)))

			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.lda_importer.save_similarities(similarities)

		# Close database connections
		self.stack_importer.close_stack_db()
		self.lda_importer.close_lda_db()


	def _learn_model(self):
		self.model = models.wrappers.LdaMallet(self.mallet_path, corpus=self, num_topics=self.number_of_topics,
					id2word=self.dictionary, iterations=self.number_of_iter)


	def _load_dictionary(self):

		self.stack_importer.open_stack_db()

		# Load dictionary
		question_corpus = self.stack_importer.get_question_corpus()
		answer_corpus   = self.stack_importer.get_answer_corpus()
		corpus          = question_corpus + answer_corpus
		self.dictionary = self.stack_importer.get_dictionary_from_corpora([question_corpus, answer_corpus])

		self.stack_importer.close_stack_db()


	def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'):

		self.experiments.open_experiment_db()
		self.lda_importer.open_lda_db()
		self.stack_importer.open_stack_db()

		total_answers = self.stack_importer.get_number_of_answers()

		# Get number of answers for each question
		number_of_answers = self.stack_importer.get_number_of_original_answers()

		# Load similarities for each question
		logging.info("Loading similarities ...")
		question_corpus  = StackCorpus(self.stack_importer.connection, "question")
		similar_answers  = {}
		original_answers = {}
		
		for question in question_corpus:
			original_answers[question.id] = self.stack_importer.get_question_original_answers(question.id)
			similar_answers[question.id]  = self.lda_importer.load_similarities_for_question(question.id, -1, False)

		self.stack_importer.close_stack_db()
		self.lda_importer.close_lda_db()

		# Calculate avg precision and recall for each case
		precision = {}
		recall    = {}
		for limit in xrange(1,total_answers+1):
			logging.info("Calculating with limit %s", str(limit))

			avg_precision, avg_recall = self.experiments.run_experiment_1_avg(number_of_answers,
				original_answers, similar_answers, experiment_type, limit)
			precision[limit] = avg_precision
			recall[limit]    = avg_recall

		# Save into the database
		self.experiments.save_experiment_results(experiment_type, precision, recall)

		# Write them in a file
		folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
		self.experiments.write_pr_curve(experiment_type, folder)

		self.experiments.close_experiment_db()

		logging.info("\nDone")


	###############################################################################
	# Create the local model
	###############################################################################

	def calculate_local_similarities(self):
		""" Calculates similarities between local questions/answers.
			Returns the list of filtered users """

		# Keep filtered users
		filtered_users = []

		# Open database connections
		self.lda_importer.open_lda_db()
		self.stack_importer.open_stack_db()

		# Clean similarity table
		self.lda_importer.create_clean_similarities_table()

		# For each question calculate its similarity with all the answers given
		# by the users who answered the given question
		logging.info("Calculating questions/answers similarities ...")
		question_corpus = StackCorpus(self.stack_importer.connection, "question")

		for question in question_corpus:

			print "Question " + str(question.id)

			similarities  = []

			# Get the users that gave an answer to the question
			users = self.stack_importer.get_users_from_question(question.id)
			print "Users that replied: " + str(len(users))

			# Calculate the similarities of question with all
			# answers from the given users (related or not to question)
			for user_id in users:
				user_answers = self.stack_importer.get_user_answers_to_questions(user_id)

				# Only consider users with more than 5 answers
				if len(user_answers) > 5:
					print "User " + str(user_id)

					self._learn_local_model(user_id)

					# Get topics in the question
					bow = self.dictionary.doc2bow(question.body)
					question_topics = self.model[bow]

					# Get topics in the answers and calculate similarities with current question
					for answer in user_answers:
						bow = self.dictionary.doc2bow(answer.body)
						answer_topics = self.model[bow]

						# Similarities
						similarities.append((question.id, answer.id, self._compare_documents(question_topics, answer_topics)))
				else:
					filtered_users.append(user_id)

			# Save similarities to database
			logging.info("\nSaving similarities to database ...")
			self.lda_importer.save_similarities(similarities)

		# Close database connections
		self.stack_importer.close_stack_db()
		self.lda_importer.close_lda_db()

		return filtered_users


	def _learn_local_model(self, user_id):
		""" Learns the LDA model with local knowledge """

		# Load question and answer corpus
		question_corpus    = self.stack_importer.get_user_question_corpus(user_id)
		self.answer_corpus = self.stack_importer.get_user_answer_corpus(user_id)
		self.corpus        = question_corpus + self.answer_corpus
		self.dictionary    = self.stack_importer.get_dictionary_from_corpora([question_corpus, self.answer_corpus])

		# Create model
		self.model = models.wrappers.LdaMallet(self.mallet_path, corpus=self, num_topics=self.number_of_topics,
					id2word=self.dictionary, iterations=self.number_of_iter)


	@staticmethod
	def _compare_documents(document1, document2):
		""" Calculates the distance between the given documents """

		doc1_topic_description = []
		doc2_topic_description = []

		for (topic, weight) in document1:
			doc1_topic_description.append(weight)

		for (topic, weight) in document2:
			doc2_topic_description.append(weight)

		return Metric.js_distance(doc1_topic_description, doc2_topic_description)



	def run_experiment_2_avg(self, experiment_type='2_avg', algorithm='lda_local_2'):

		self.experiments.open_experiment_db()

		self.lda_importer.open_lda_db()
		self.stack_importer.open_stack_db()

		total_answers = self.stack_importer.get_number_of_answers()

		# Get number of answers for each question
		number_of_answers = self.stack_importer.get_number_of_original_answers()

		# Load similarities for each question
		logging.info("Loading similarities ...")
		question_corpus  = StackCorpus(self.stack_importer.connection, "question")
		similar_answers  = {}
		original_answers = {}
		
		for question in question_corpus:
			original_answers[question.id] = self.stack_importer.get_question_original_answers(question.id)
			similar_answers[question.id]  = self.lda_importer.load_similarities_for_question(question.id, -1, False)

		self.stack_importer.close_stack_db()
		self.lda_importer.close_lda_db()

		# Calculate avg precision and recall for each case
		precision = {}
		recall    = {}
		for limit in xrange(1,total_answers+1):
			print "Calculating with limit " + str(limit)

			avg_precision, avg_recall = self.experiments.run_experiment_1_avg(number_of_answers,
				original_answers, similar_answers, experiment_type, limit)
			precision[limit] = avg_precision
			recall[limit]    = avg_recall

		# Save into the database
		self.experiments.save_experiment_results(experiment_type, precision, recall)

		# Write them in a file
		folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
		self.experiments.write_pr_curve(experiment_type, folder)

		self.experiments.close_experiment_db()

		logging.info("\nDone")
Example #19
x_test, y_test = timestamped_to_vector(test,
                                       vector_col=v,
                                       time_start=0,
                                       classification_col=c)
x_train, y_train = timestamped_to_vector(train,
                                         vector_col=v,
                                         time_start=0,
                                         classification_col=c)

# Random search with thresholding
rand_params = Configs.get_all()

expt = Experiments.Experiment(rand_params,
                              search_algorithm="random",
                              data=(x_train, y_train),
                              folds=10,
                              folder_name="random_search_reults",
                              thresholding=True,
                              threshold=0.5)

# parameter configurations
A_B_C = Configs.get_A_B_C()  # called like get_all()/get_A() above so .values() works below

# Ensemble model
ensemble_config = Experiments.Ensemble_configurations(
    list(A_B_C.values()),
    x_test=x_test,
    y_test=y_test,
    x_train=x_train,
    y_train=y_train,
    folder_name="test_train_results",
Example #20
    if args.plot:
        plot_times_by_batch(args.database)
    else:
        if args.load_database:
            exps = pkl.load(open(args.database))
        else:
            ## Determine the type of sparsity layer to use
            if args.layer_class == 'HiddenRandomBlockLayer':
                layer_class = HiddenRandomBlockLayer
            else:
                layer_class = HiddenBlockLayer

            ## Create experiments
            exps = Experiments(
                input_dim=784,  # data.train_set_x.shape[-1].eval(),
                num_classes=10
            )

            # Add descriptions of models
            exps.add_layers_description(
                0,
                {
                    'n_hids': (25,),
                    'n_units_per': args.units_per_block,
                    'k_pers': (1, 1),
                    'activations': (T.tanh, None),
                    'layer_classes': [
                        HiddenBlockLayer,
                        HiddenBlockLayer,
                    ],
                }
Example #21
	def test_init(self):
		experiments = Experiments()
		self.assertEqual( experiments.getNumOfExperiments(), 0 )
		self.assertEqual( experiments.getExperiments(), {} )

		try:
			experiments.runAllExperiments()

			fail(self)
		except ValueError as ve:
			self.assertEqual( str(ve), 'Experiments object has no models to run!')

		try:
			experiments.addExperiment('random forest')

			fail(self)
		except ValueError as ve:
			self.assertEqual( str(ve), 'Object must be Experiment object: random forest')

		try:
			experiments.addExperiment( Experiment(1) )

			fail(self)
		except ValueError as ve:
			self.assertEqual( str(ve), 'Experiment name attribute must be string, not <class \'int\'>' )
		self.assertEqual( experiments.getNumOfExperiments(), 0 )

		experiments.addExperiment( Experiment('1') )
		experiments.addExperiment( Experiment('2') )
		experiments.addExperiment( Experiment('3') )
		experiments.addExperiment( Experiment('4') )

		self.assertEqual( experiments.getNumOfExperiments(), 4 )
		self.assertEqual( experiments.getExperimentNames(), ['1', '2', '3', '4'] )