def create_max_freq_term_by_index_types(document_term_map, index_types):
    """Find the highest term frequency per document for each index type.

    Args:
        document_term_map: Mapping from document id to that document's terms,
            in whatever form the frequency-distribution functions accept.
        index_types: Iterable of index type identifiers accepted by
            ``n_gram_handler.get_to_freq_dist_function``.

    Returns:
        A nested dict ``{document_id: {index_type: max_freq}}`` where
        ``max_freq`` is the largest value in the document's frequency
        distribution for that index type, or 0 when the distribution is empty.
    """
    # Resolve each frequency-distribution function once instead of once per
    # document. Doing it up front also keeps a one-shot iterable of index
    # types usable for every document, not just the first one.
    to_freq_dist_by_type = {
        index_type: n_gram_handler.get_to_freq_dist_function(index_type)
        for index_type in index_types
    }

    max_freq_map = {}
    for document_id in document_term_map:
        document_terms = document_term_map[document_id]
        max_freq_by_type = {}
        for index_type, to_freq_dist in to_freq_dist_by_type.items():
            freq_dist = to_freq_dist(document_terms)
            max_freq_by_type[index_type] = max(
                (freq_dist[term] for term in freq_dist), default=0
            )
        max_freq_map[document_id] = max_freq_by_type
    return max_freq_map
def __get_all_dice_neighbours(posting_list, index, trainingdata_handler, document_filters):
    """Rank index terms that co-occur with a posting list by Dice coefficient.

    Each document referenced by ``posting_list`` is loaded, filtered and
    turned into index terms. Every index term that is present in the index
    and not yet recorded is scored with ``__calculate_dice_coefficient``;
    only strictly positive scores are kept.

    Args:
        posting_list: Postings whose first element identifies a document file.
        index: Dict providing the keys ``"index_type"`` and ``"index"``.
        trainingdata_handler: Object exposing
            ``get_training_data_file_string(document_file_id)``.
        document_filters: Filters forwarded to
            ``preprocessing_filters.apply_filters_to_document``.

    Returns:
        A list of ``(index_term, dice_coefficient)`` tuples sorted by
        coefficient, highest first.
    """
    index_type = index["index_type"]
    postings_by_term = index["index"]
    as_index_term = n_gram_handler.get_to_index_term_function(index_type)
    as_freq_dist = n_gram_handler.get_to_freq_dist_function(index_type)

    scores = {}
    for posting in posting_list:
        file_id = str(posting[0])
        text = trainingdata_handler.get_training_data_file_string(file_id)
        terms = preprocessing_filters.apply_filters_to_document(text, document_filters)
        candidates = list(map(as_index_term, as_freq_dist(terms)))
        for candidate in candidates:
            # Skip terms that already have a score or are unknown to the index.
            if candidate in scores or candidate not in postings_by_term:
                continue
            coefficient = __calculate_dice_coefficient(posting_list, candidate, postings_by_term)
            if coefficient > 0:
                scores[candidate] = coefficient

    return sorted(scores.items(), key=itemgetter(1), reverse=True)
def create_index(index_specification):
    """Build an inverted index for the dataset named in the specification.

    Args:
        index_specification: Dict providing at least the keys
            ``"dataset_id"``, ``"index_type"`` and ``"filters"``. Every entry
            is copied into the returned index.

    Returns:
        A dict containing the specification entries plus:

        * ``"id"``: result of ``get_index_id(index_specification)``, unless
          the specification has its own ``"id"`` entry, which overrides it.
        * ``"index"``: dict mapping each index term to a list of
          ``(document_id, frequency)`` postings.
        * ``"n_terms"``: number of distinct index terms.
        * ``"n_documents"``: number of documents read from the dataset.
        * ``"max_frequency"``: result of
          ``__get_frequency_of_most_common_term`` for the built index.
    """
    dataset_id = index_specification["dataset_id"]
    index_type = index_specification["index_type"]
    filter_names = index_specification["filters"]

    # Meta info: the generated id first, then every specification entry.
    index = {"id": get_index_id(index_specification)}
    index.update(index_specification)

    # Training data handler and the term helpers that match the index type.
    training_dataset_handler = TrainingDatasetHandler(dataset_id)
    to_index_term = n_gram_handler.get_to_index_term_function(index_type)
    to_freq_dist = n_gram_handler.get_to_freq_dist_function(index_type)

    # Build the term -> postings mapping.
    postings_by_term = index["index"] = {}
    n_documents = 0
    for n_documents, document_data in enumerate(training_dataset_handler, start=1):
        document_id = document_data[0]
        document = document_data[1]
        document_terms = preprocessing_filters.apply_filters_to_document(document, filter_names)
        freq_dist = to_freq_dist(document_terms)
        for document_term in freq_dist:
            index_term = to_index_term(document_term)
            postings = postings_by_term.setdefault(index_term, [])
            postings.append((document_id, freq_dist[document_term]))

    index["n_terms"] = len(postings_by_term)
    index["n_documents"] = n_documents
    index["max_frequency"] = __get_frequency_of_most_common_term(postings_by_term)

    return index