def extend_vocabulary(root, stats_dir, knowledge_dir, freq_vocabulary, query, min_num_images, save_description, do_skip):
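    """Expand a frequency-ranked vocabulary with multi-word phrases.

    Candidate phrases come from adjacent Freebase concepts and from concept
    pairs with high pointwise mutual information (PMI); two-word candidates
    are confirmed against the tag co-occurrence matrix, longer ones by
    recounting their frequency. Returns a list of (phrase, frequency) tuples
    sorted by descending frequency.
    """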
    num_vocabulary = len(freq_vocabulary)
    freq_vocabulary_list = [t[0] for t in freq_vocabulary]
    freq_vocabulary_nospace = ["".join(w.split()) for w in freq_vocabulary_list]
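    #Map both the spaced and the concatenated form of each phrase to the same index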
    freq_vocabulary_dict = dict(zip(freq_vocabulary_list + freq_vocabulary_nospace,
                                    list(range(len(freq_vocabulary_list))) * 2))
    freq_vocabulary_set = set(freq_vocabulary_list)

    #Check adjacent Freebase concepts for related vocabulary phrases
    #----------------------------------------------------------------
    from structured_knowledge.download_structured_knowledge import download_structured_knowledge
    from structured_knowledge.parse_cache import find_candidates
    download_structured_knowledge(knowledge_dir, freq_vocabulary_list, do_skip)
    freq_vocabulary_edit, candidate_vocabulary_list, errors = find_candidates(knowledge_dir, "Freebase", freq_vocabulary, save_description)
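    #candidate_vocabulary_list holds the new multi-word phrases suggested by Freebase;
    #freq_vocabulary_edit is the (possibly adjusted) original vocabulary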

    #Find the frequency of two word phrases using co-occurrence matrix
    comatrix_path = os.path.join(stats_dir, save_description + '_flickr_comatrix_approx.mat')
    comatrix_dict = spio.loadmat(comatrix_path)
    comatrix = comatrix_dict['comatrix_images']
    comatrix[np.eye(np.shape(comatrix)[0], dtype=bool)] = 0 #Mask diagonal

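    #A two-word candidate is confirmed if its constituent tags co-occur in more than
    #min_num_images images, or if its concatenated form is already a known tag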
    two_word_phrases = [w for w in candidate_vocabulary_list if len(w.split())==2]
    two_word_indices = [(freq_vocabulary_dict[t[0]], freq_vocabulary_dict[t[1]]) for t in (w.split() for w in two_word_phrases)]
    confirmed_new_vocab_pair = [(two_word_phrases[i], comatrix[two_word_indices[i]])
                                for i in range(len(two_word_phrases))
                                if comatrix[two_word_indices[i]] > min_num_images
                                or "".join(two_word_phrases[i].split()) in freq_vocabulary_set]

    #Longer phrases remain in the candidate_vocabulary
    candidate_vocabulary = [(w, 0) for w in candidate_vocabulary_list if len(w.split()) > 2]

    #Count the frequency of the longer candidate phrases; keep those above min_num_images
    from database_builder.get_photo_meta_multiprocess import freq_pattern_count
    confirmed_new_vocab_long = freq_pattern_count(knowledge_dir, root, stats_dir, query, save_description, candidate_vocabulary, min_num_images)
    #confirmed_new_vocab_long = [('hard rock cafe', 262.0), ('empire state building', 221.0)]

    #Add concept pairs with high PMI to concepts list
    #----------------------------------------------------------------
    from analysis.python_pmi import high_pmi
    tic()
    score_threshold = 0.8
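    #Only the lower triangle is scored so that each unordered concept pair is considered once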
    new_vocab = high_pmi(comatrix_path, score_threshold, min_num_images, 'image', np.tril(np.ones((num_vocabulary,num_vocabulary)), -1), freq_vocabulary_list)

    #Exclude vocab that has already been discovered using Freebase
    vocab_split_set = set(t[2] for t in freq_vocabulary_edit)
    new_vocab = [w for w in new_vocab if w[0] not in two_word_phrases and w[0] not in vocab_split_set]
    new_vocab_list = [t[0] for t in new_vocab]
    new_vocab_set = set(new_vocab_list)

    #Don't use more than num_extra_terms new vocab entries, to limit the number of irrelevant terms downloaded
    num_extra_terms = 500
    new_vocab = new_vocab[:num_extra_terms]
    new_vocab_list = [t[0] for t in new_vocab]

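    #Vet the remaining PMI candidates against Freebase before accepting them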
    from structured_knowledge.parse_cache import vet_candidates
    download_structured_knowledge(knowledge_dir, new_vocab_list, do_skip)
    confirmed_new_vocab_pmi = vet_candidates(knowledge_dir, "Freebase", new_vocab)
    toc()

    # Extend vocabulary and parse Knowledgebase with complete vocabulary
    #--------------------------------------------------------------------
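    #Append the concatenated (no-space) form of each new phrase as a merge key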
    confirmed_new_vocab_pair = [(t[0], t[1], "".join(t[0].split())) for t in confirmed_new_vocab_pair]
    confirmed_new_vocab_long = [(t[0], t[1], "".join(t[0].split())) for t in confirmed_new_vocab_long]
    confirmed_new_vocab_pmi = [(t[0], t[1], "".join(t[0].split())) for t in confirmed_new_vocab_pmi]
    complete_vocabulary = freq_vocabulary_edit + confirmed_new_vocab_pair + confirmed_new_vocab_long + confirmed_new_vocab_pmi

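    #Merge entries that share a key: exact duplicates are kept once, while differing
    #surface forms have their frequencies summed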
    complete_vocabulary_dict = {}
    for tup in complete_vocabulary:
        if tup[2] in complete_vocabulary_dict:
            if tup[0] != complete_vocabulary_dict[tup[2]][0]:
                complete_vocabulary_dict[tup[2]] = (tup[0], tup[1] + complete_vocabulary_dict[tup[2]][1])
        else:
            complete_vocabulary_dict[tup[2]] = (tup[0], tup[1])

    complete_vocabulary = sorted(complete_vocabulary_dict.values(), key=lambda x: x[1], reverse=True)
    return complete_vocabulary


def main():
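    """Build the extended concept vocabulary for a query and derive the
    associated statistics: tag frequencies, co-occurrence counts, structured
    knowledge (ConceptNet and Freebase), adjacency matrices, linguistic masks
    and GloVe features.
    """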
    arg_helper = CmdArgumentsHelper()
    arg_helper.add_argument('query', 'q', 'query', 1)
    arg_helper.add_argument('root_dir', 'r', 'root', 1)
    arg_helper.add_argument('stats_dir', 's', 'stats', 1)
    arg_helper.add_argument('knowledge_dir', 'k', 'knowledge', 1)
    arg_helper.add_argument('min_num_images', 'n', 'min_num_images', 1)
    args = arg_helper.read_arguments()
    print(args)

    root = args['root_dir']
    min_num_images = int(args['min_num_images'])
    query = args['query']
    stats_dir = args['stats_dir']
    knowledge_dir = args['knowledge_dir']

    # Read settings from the config file
    argv = cfg.vars
    numberOfThreads = int(argv["numberOfThreads"])

    save_description = "{}_{}".format(query, min_num_images)

    # If do_skip is True, items for which data has already been generated are
    # skipped. Only enable this if you are certain the data has not changed;
    # for example, if you add or remove photos from the image id list, set it
    # to False so that all related data is regenerated.
    do_skip = True


    # Find the most frequent concepts
    # The concept list is written to concept_file
    concept_dir = os.path.join(root, "concepts")
    if not os.path.exists(concept_dir):
        os.mkdir(concept_dir)

    concept_file = os.path.join(concept_dir, '{}_owner_per_concept.txt'.format(save_description))

    from database_builder.get_photo_meta_multiprocess import find_vocabulary
    tic()
    find_vocabulary(root, stats_dir, query, min_num_images, save_description)
    toc()

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    all_concepts_list, all_concepts_freq = zip(*all_concepts)
    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    #Remove vocabulary entries that match the filter lists
    filter_vocab_dir = os.path.join(root, 'filter_lists')
    all_concepts = filter_vocabulary(filter_vocab_dir, all_concepts)

    save_description = "{}_{}_extended".format(query, min_num_images)
    concept_file = os.path.join(concept_dir, '{}_filtered_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    #Break concatenated word pairs
    all_concepts = break_pairs(all_concepts)

    concept_file = os.path.join(concept_dir, '{0}_split_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    #Find approximate statistics for concept pairs
    #For initial vocabulary expansion
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    from database_builder.get_photo_meta_multiprocess import find_approximate_concept_pairs
    tic()
    find_approximate_concept_pairs(root, stats_dir, query, save_description, all_concepts)
    toc()

    # Expand vocabulary
    all_concepts = extend_vocabulary(root, stats_dir, knowledge_dir, all_concepts, query, min_num_images, save_description, do_skip)

    concept_file = os.path.join(concept_dir, '{}_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')] if len(x)>1]

    all_concepts = merge_pairs(all_concepts)
    all_concepts_list, all_concepts_freq = zip(*all_concepts)

    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    #Recount tag co-occurrence with the final vocabulary
    from database_builder.get_photo_meta_multiprocess import find_concept_pairs
    #if total_new_concepts > 0:
    tic()
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    find_concept_pairs(root, stats_dir, web_dir, query, all_concepts)
    toc()

    #Download and parse structured knowledge (ConceptNet and Freebase) for the final vocabulary
    from structured_knowledge.parse_cache import parse_cache
    from structured_knowledge.download_structured_knowledge import download_structured_knowledge
    download_structured_knowledge(knowledge_dir, all_concepts_list, do_skip)
    parse_cache(knowledge_dir, "ConceptNet", all_concepts, save_description, stats_dir)
    parse_cache(knowledge_dir, "Freebase", all_concepts, save_description, stats_dir)

    #Generate adjacency matrices
    from structured_knowledge.build_adjacency_matrices import build_adjacency_matrices
    build_adjacency_matrices(knowledge_dir, stats_dir, all_concepts_list, save_description)

    from database_builder.gen_concept_structure import task_gen_synonym_mask
    from database_builder.gen_concept_structure import task_gen_lemma_mask

    task_gen_synonym_mask(all_concepts_list, stats_dir, save_description)
    task_gen_lemma_mask(all_concepts_list, stats_dir, save_description)

    from structured_knowledge.parts_of_speech import parse_language
    from structured_knowledge.parts_of_speech import parse_proper_nouns
    from structured_knowledge.parts_of_speech import parse_parts_of_speech
    from structured_knowledge.parse_object_scene import parse_object_concepts
    from structured_knowledge.parse_object_scene import parse_scene_concepts

    parse_language(all_concepts_list, stats_dir, save_description)
    parse_proper_nouns(all_concepts_list, stats_dir, save_description)
    parse_parts_of_speech(all_concepts_list, knowledge_dir, stats_dir, save_description)
    parse_scene_concepts(knowledge_dir, stats_dir, all_concepts_list, save_description)
    parse_object_concepts(knowledge_dir, stats_dir, all_concepts_list, save_description)

    gen_phrase_mask(all_concepts_list, stats_dir, save_description)

    from database_builder.get_vocab_features import get_glove
    print("Start GloVe Feature")
    model_file = 'E:\data\GloVe\glove.42B.300d.txt'
    save_model_file = ''
    dim = 300
    save_feature_file = "E:\data\Iconic\data\word2vec_features\\{}_extended_feature_glove.42B.300d.mat".format(save_description)
    get_glove(dim, model_file, save_model_file, save_feature_file, concept_file)
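

if __name__ == '__main__':
    # Assumed entry point: run the full pipeline when this module is executed
    # directly; omit or adapt if main() is invoked from elsewhere.
    main()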