Example #1
def main():
  # Parse command-line arguments: search query, data root directory, and
  # the division of the search to download.
  arg_helper = CmdArgumentsHelper()
  arg_helper.add_argument('query', 'q', 'query', 1)
  arg_helper.add_argument('root_dir', 'r', 'root', 1)
  arg_helper.add_argument('division_id', 'd', 'division', 1)
  args = arg_helper.read_arguments()
  print(args)

  query_string = args['query']
  division_id = int(args['division_id'])

  # Prepare the config and PCDB files for this query, then download the
  # raw data for the requested division.
  dbHelper = DBHelper()
  dbHelper.init(args['root_dir'])
  dbHelper.gen_config_file(query_string)
  configFileName = dbHelper.get_config_filepath(query_string)
  dbHelper.genPCDBFile(query_string)
  pcdbFileName = dbHelper.getPCDBPath(query_string)
  outputDir = dbHelper.rawdataDir
  downloadQuery(query_string, configFileName, pcdbFileName, outputDir, division_id)
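CmdArgumentsHelper, DBHelper, and downloadQuery come from the surrounding project and are not shown in this listing. As a rough, hypothetical sketch of the interface the example relies on (only the call signatures above are taken from the example; the class body is an assumption), an argument helper built on Python's getopt module could look like this:

import getopt
import sys


class CmdArgumentsHelper(object):
    # Hypothetical sketch: register (name, short flag, long flag, takes_value)
    # tuples, then parse sys.argv into a {name: value} dict with getopt.
    def __init__(self):
        self._specs = []

    def add_argument(self, name, short_flag, long_flag, has_value):
        self._specs.append((name, short_flag, long_flag, bool(has_value)))

    def read_arguments(self):
        short_opts = ''.join(s + (':' if v else '') for _, s, _, v in self._specs)
        long_opts = [l + ('=' if v else '') for _, _, l, v in self._specs]
        opts, _ = getopt.getopt(sys.argv[1:], short_opts, long_opts)
        args = {}
        for opt, value in opts:
            for name, s, l, has_value in self._specs:
                if opt in ('-' + s, '--' + l):
                    args[name] = value if has_value else True
        return args

With the three arguments registered in Example #1, passing -q <keyword> -r <dir> -d <n> on the command line would produce a dict with the keys 'query', 'root_dir', and 'division_id'.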
Example #2
def main():
  arg_helper = CmdArgumentsHelper()
  arg_helper.add_argument('query', 'q', 'query', 1)
  arg_helper.add_argument('root_dir', 'r', 'root', 1)
  args = arg_helper.read_arguments()
  print(args)

  # Load the keyword for querying Flickr.
  query_string = args['query']

  # Prepare the config and PCDB files for this query.
  dbHelper = DBHelper()
  dbHelper.init(args['root_dir'])
  dbHelper.gen_config_file(query_string)
  configFileName = dbHelper.get_config_filepath(query_string)
  dbHelper.genPCDBFile(query_string)
  pcdbFileName = dbHelper.getPCDBPath(query_string)
  outputDir = dbHelper.rawdataDir

  # Split each division further into config.numThreadPerDivisions divisions and
  # download the data for each division sequentially. Floor division keeps the
  # result an integer so it can be passed to range().
  total_division = config.numSearchDivisions // config.numThreadPerDivisions
  for division_id in range(total_division):
    downloadQuery(query_string, configFileName, pcdbFileName, outputDir, division_id)
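One detail worth noting in this example: under Python 3 the / operator performs true division and returns a float, which range() rejects, so the loop above uses floor division (//). A small self-contained illustration with made-up division counts:

# Made-up values standing in for config.numSearchDivisions and
# config.numThreadPerDivisions.
num_search_divisions = 64
num_thread_per_divisions = 8

total_division = num_search_divisions // num_thread_per_divisions  # 8 (int)
# num_search_divisions / num_thread_per_divisions would be 8.0 (float),
# and range(8.0) raises TypeError under Python 3.
for division_id in range(total_division):
    print(division_id)  # each division would be passed to downloadQuery here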
Example #3
def main():
    arg_helper = CmdArgumentsHelper()
    arg_helper.add_argument('query', 'q', 'query', 1)
    arg_helper.add_argument('root_dir', 'r', 'root', 1)
    arg_helper.add_argument('stats_dir', 's', 'stats', 1)
    arg_helper.add_argument('knowledge_dir', 'k', 'knowledge', 1)
    arg_helper.add_argument('min_num_images', 'n', 'min_num_images', 1)
    args = arg_helper.read_arguments()
    print(args)

    root = args['root_dir']
    min_num_images = int(args['min_num_images'])
    query = args['query']
    stats_dir = args['stats_dir']
    knowledge_dir = args['knowledge_dir']

    # Settings read from the config file.
    argv = cfg.vars
    numberOfThreads = int(argv["numberOfThreads"])

    save_description = "{}_{}".format(query, min_num_images)

    # If do_skip is True, items for which data has already been generated are
    # skipped. Only enable this when you are certain the data will not change;
    # for example, if you remove or add photos to the image id list, set it to
    # False so that all related data is regenerated.
    do_skip = True


    # Find the most frequent concepts; the concept list is saved to a file.
    concept_dir = os.path.join(root, "concepts")
    if not os.path.exists(concept_dir):
        os.mkdir(concept_dir)

    concept_file = os.path.join(concept_dir, '{}_owner_per_concept.txt'.format(save_description))

    from database_builder.get_photo_meta_multiprocess import find_vocabulary
    tic()
    find_vocabulary(root, stats_dir, query, min_num_images, save_description)
    toc()

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    all_concepts_list, all_concepts_freq = zip(*all_concepts)
    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    # Remove vocabulary entries that match certain filter criteria.
    filter_vocab_dir = os.path.join(root, 'filter_lists')
    all_concepts = filter_vocabulary(filter_vocab_dir, all_concepts)

    save_description = "{}_{}_extended".format(query, min_num_images)
    concept_file = os.path.join(concept_dir, '{}_filtered_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    # Break concatenated word pairs.
    all_concepts = break_pairs(all_concepts)

    concept_file = os.path.join(concept_dir, '{0}_split_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')]]

    # Find approximate statistics for concept pairs, used for the initial
    # vocabulary expansion.
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    from database_builder.get_photo_meta_multiprocess import find_approximate_concept_pairs
    tic()
    find_approximate_concept_pairs(root, stats_dir, query, save_description, all_concepts)
    toc()

    # Expand vocabulary
    all_concepts = extend_vocabulary(root, stats_dir, knowledge_dir, all_concepts, query, min_num_images, save_description, do_skip)

    concept_file = os.path.join(concept_dir, '{}_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))

    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1]))) for x in [t.split('\t') for t in f.read().split('\n')] if len(x)>1]

    all_concepts = merge_pairs(all_concepts)
    all_concepts_list, all_concepts_freq = zip(*all_concepts)

    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    # Recount tag co-occurrence with the final vocabulary.
    from database_builder.get_photo_meta_multiprocess import find_concept_pairs
    #if total_new_concepts > 0:
    tic()
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    find_concept_pairs(root, stats_dir, web_dir, query, all_concepts)
    toc()

    # Process structured knowledge sources (ConceptNet, Freebase).
    from structured_knowledge.parse_cache import parse_cache
    from structured_knowledge.download_structured_knowledge import download_structured_knowledge
    download_structured_knowledge(knowledge_dir, all_concepts_list, do_skip)
    parse_cache(knowledge_dir, "ConceptNet", all_concepts, save_description, stats_dir)
    parse_cache(knowledge_dir, "Freebase", all_concepts, save_description, stats_dir)

    # Generate adjacency matrices.
    from structured_knowledge.build_adjacency_matrices import build_adjacency_matrices
    build_adjacency_matrices(knowledge_dir, stats_dir, all_concepts_list, save_description)

    from database_builder.gen_concept_structure import task_gen_synonym_mask
    from database_builder.gen_concept_structure import task_gen_lemma_mask

    task_gen_synonym_mask(all_concepts_list, stats_dir, save_description)
    task_gen_lemma_mask(all_concepts_list, stats_dir, save_description)

    from structured_knowledge.parts_of_speech import parse_language
    from structured_knowledge.parts_of_speech import parse_proper_nouns
    from structured_knowledge.parts_of_speech import parse_parts_of_speech
    from structured_knowledge.parse_object_scene import parse_object_concepts
    from structured_knowledge.parse_object_scene import parse_scene_concepts

    parse_language(all_concepts_list, stats_dir, save_description)
    parse_proper_nouns(all_concepts_list, stats_dir, save_description)
    parse_parts_of_speech(all_concepts_list, knowledge_dir, stats_dir, save_description)
    parse_scene_concepts(knowledge_dir, stats_dir, all_concepts_list, save_description)
    parse_object_concepts(knowledge_dir, stats_dir, all_concepts_list, save_description)

    gen_phrase_mask(all_concepts_list, stats_dir, save_description)

    from database_builder.get_vocab_features import get_glove
    print("Start GloVe Feature")
    model_file = r'E:\data\GloVe\glove.42B.300d.txt'
    save_model_file = ''
    dim = 300
    save_feature_file = r"E:\data\Iconic\data\word2vec_features\{}_extended_feature_glove.42B.300d.mat".format(save_description)
    get_glove(dim, model_file, save_model_file, save_feature_file, concept_file)
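Example #3 writes and re-reads the tab-separated concept lists (one concept and its owner count per line) at several points. The repeated inline read/write code could be captured by a pair of small helpers; the function names below are made up, but the file format matches what the example writes:

def write_concept_file(path, concepts):
    # concepts is a list of (concept, count) tuples; each line is
    # "concept<TAB>count", the same format Example #3 writes.
    with open(path, 'w') as f:
        f.write("\n".join("{}\t{}".format(c, n) for c, n in concepts))


def read_concept_file(path):
    # Inverse of write_concept_file. Blank or malformed lines are skipped,
    # mirroring the "if len(x) > 1" guard used in the example.
    with open(path, 'r') as f:
        rows = [line.split('\t') for line in f.read().split('\n')]
    return [(x[0], int(float(x[1]))) for x in rows if len(x) > 1]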