def __init__(self, *args, **kwargs):
    """Initialise the job and build the entity representations.

    All positional and keyword arguments are forwarded unchanged to
    ``MRJob.__init__``.  The entity list is then fetched as JSON from a
    fixed S3 URL and handed to ``toy_kba_algorithm.prepare_entities``;
    whatever that returns is stored on ``self.entity_representations``.

    Note that constructing an instance therefore performs a network
    request.
    """
    ## function-scope import so this method carries its own dependency
    from contextlib import closing

    MRJob.__init__(self, *args, **kwargs)

    ## load entities from json file
    log("loading entity list")

    ## closing() releases the HTTP response even when json.load raises,
    ## and works whether or not the response object is itself a context
    ## manager.
    with closing(urllib.urlopen("https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json")) as response:
        entities = json.load(response)

    self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
## get our filter algorithm
import toy_kba_algorithm

## load entities; the with-block closes the file handle as soon as the
## JSON has been parsed instead of leaving it to the garbage collector
with open(args.entities) as entities_file:
    filter_topics = json.load(entities_file)

## set the topic set identifier in filter_run
filter_run["topic_set_id"] = filter_topics["topic_set_id"]

## init our toy algorithm
entities = filter_topics["targets"]

## recall filters are optional: fall back to an empty mapping when no
## path was given on the command line
if args.recall_filters:
    with open(args.recall_filters) as recall_filters_file:
        recall_filters = json.load(recall_filters_file)
else:
    recall_filters = {}

entity_representations = toy_kba_algorithm.prepare_entities(entities, recall_filters)

logger.info(json.dumps(entity_representations, indent=4, sort_keys=True))

## set the corpus identifier in filter_run: the last path component,
## or the one before it when the path ends with a trailing "/"
corpus_id_parts = args.corpus.split("/")
filter_run["corpus_id"] = corpus_id_parts[-1] or corpus_id_parts[-2]

## prepare to iterate over all hours in corpus in chronological order
if args.date_hour:
    ## for parallel mode, we read a single date_hour dir from this
    ## argument
    date_hour_list = [args.date_hour]
    print_comments = False
else:
    ## NOTE(review): os.listdir returns entries in arbitrary order, so
    ## chronological iteration needs a sort -- presumably done right
    ## after this fragment; confirm.
    date_hour_list = os.listdir(args.corpus)
## NOTE(review): this loop reads ``target_id`` and ``data``, which are
## bound outside the visible fragment -- presumably by an enclosing loop
## over the targets; confirm the nesting against the full file.
for slot_name, values in data['slots'].iteritems():
    if slot_name.isupper() and args.mode == 'slots':
        ## 'slots' mode: every value of an upper-case slot becomes a
        ## recall filter string
        for val in values:
            recall_filters[target_id].append(val['value'])
    elif args.mode == 'simple' and slot_name == 'canonical_name':
        ## 'simple' mode: use the canonical name as a whole plus each
        ## of its whitespace-separated tokens
        recall_filters[target_id].append(values)
        recall_filters[target_id] += values.split()

## parenthesised form prints the same text as the Python 2 print
## statement for a single argument, and also parses under Python 3
print(recall_filters)

## slot names are optional: keep the empty mapping when no path is given
slot_names = {}
if args.slot_names:
    ## the with-block closes the file handle once the JSON is parsed
    with open(args.slot_names) as slot_names_file:
        slot_names = json.load(slot_names_file)

entity_representations = toy_kba_algorithm.prepare_entities(
    entities,
    recall_filters=recall_filters,
    slot_names=slot_names,
)

logger.info(json.dumps(entity_representations, indent=4, sort_keys=True))

## set the corpus identifier in filter_run: the last path component,
## or the one before it when the path ends with a trailing "/"
corpus_id_parts = args.corpus.split("/")
filter_run["corpus_id"] = corpus_id_parts[-1] or corpus_id_parts[-2]

## store some non-required run info of our own design to the
## filter_run dict to store in our submission... not too much, just a
## bit of context for humans.
filter_run["run_info"] = {
    "num_entities": len(entities),
}

print_comments = False