def generate_confidence_intervals(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    if not args.input.endswith('.tab'):
        logger.record_event('DEFAULT_CRITICAL_ERROR', 'input filename should be a *.tab.')
    # parse the aggregate specification into a dict mapping each key to a list of values
    aggregate = {}
    for element in args.aggregate.split(','):
        key, value = element.split(':')
        if key not in aggregate:
            aggregate[key] = []
        aggregate[key].append(value)
    confidence_interval = ConfidenceIntervals(logger,
                                              macro=args.macro,
                                              input=args.input,
                                              primary_key_col=args.primary_key,
                                              score=args.score,
                                              aggregate=aggregate,
                                              document_id_col=args.document_id,
                                              run_id_col=args.run_id,
                                              sizes=args.sizes,
                                              seed_value=args.seed)
    # write the confidence intervals in both supported output formats
    output = {'pretty': args.pretty_output, 'tab': args.tab_output}
    for output_format in output:
        with open(output[output_format], 'w') as fh:
            fh.write(confidence_interval.get('output', output_format))
    exit(ALLOK_EXIT_CODE)
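# A self-contained sketch of the --aggregate parsing above, assuming (as the loop
# implies) a specification of the form 'key:value[,key:value...]' in which repeated
# keys accumulate values; the example key names below are illustrative only.
def parse_aggregate_spec(spec):
    """Parse 'key:value[,key:value...]' into a dict mapping each key to a list of values."""
    aggregate = {}
    for element in spec.split(','):
        key, value = element.split(':')
        aggregate.setdefault(key, []).append(value)
    return aggregate

# parse_aggregate_spec('language:ENG,language:SPA,metatype:Entity')
# -> {'language': ['ENG', 'SPA'], 'metatype': ['Entity']}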
def align_clusters(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)
    os.mkdir(args.similarities)
    os.mkdir(args.alignment)
    for entry in sorted(os.scandir(args.gold), key=lambda entry: entry.name):
        # each per-document KB is a directory of SPARQL output named '<document_id>.ttl'
        if entry.is_dir() and entry.name.endswith('.ttl'):
            kb = entry.name
            message = 'aligning clusters in {}'.format(entry.name)
            logger.record_event('DEFAULT_INFO', message)
            print('At {}: {}'.format(
                time.strftime("%m/%d/%Y %H:%M:%S", time.localtime()), message))
            document_id = kb.replace('.ttl', '')
            gold_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.gold, kb)
            gold_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.gold, kb)
            system_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.system, kb)
            system_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.system, kb)
            # a missing query-output file is treated as absent rather than as an error
            gold_mentions = gold_mentions if os.path.exists(gold_mentions) else None
            gold_edges = gold_edges if os.path.exists(gold_edges) else None
            system_mentions = system_mentions if os.path.exists(system_mentions) else None
            system_edges = system_edges if os.path.exists(system_edges) else None
            similarities = '{}/{}.tab'.format(args.similarities, document_id)
            alignment = '{}/{}.tab'.format(args.alignment, document_id)
            check_for_paths_non_existance([similarities, alignment])
            clusters = Clusters(logger, document_mappings, document_boundaries,
                                annotated_regions, gold_mentions, gold_edges,
                                system_mentions, system_edges)
            clusters.print_similarities(similarities)
            clusters.print_alignment(alignment)
    exit(ALLOK_EXIT_CODE)
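# The four '<path> if os.path.exists(<path>) else None' expressions above share one
# pattern: a missing per-document query-output file is treated as absent, not fatal.
# A hypothetical helper capturing that pattern (not part of the original code):
import os

def path_or_none(path):
    """Return path unchanged if it exists on disk, else None."""
    return path if os.path.exists(path) else None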
def filter_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries, args.input, args.runid)
    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)
    run_filter_on_all_responses(responses, annotated_regions,
                                document_mappings, document_boundaries)
    os.mkdir(args.output)
    for input_filename in responses:
        # mirror the input directory structure under args.output
        output_filename = input_filename.replace(responses.get('path'), args.output)
        dirname = os.path.dirname(output_filename)
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        output_fh = open(output_filename, 'w')
        header_printed = False
        for linenum in sorted(responses.get(input_filename), key=int):
            entry = responses.get(input_filename).get(str(linenum))
            if not header_printed:
                output_fh.write('{}\n'.format(entry.get('header').get('line')))
                header_printed = True
            if not entry.get('valid'):
                logger.record_event('EXPECTING_VALID_ENTRY', entry.get('where'))
                continue
            if entry.get('passes_filter'):
                output_fh.write(str(entry))
        output_fh.close()
    exit(ALLOK_EXIT_CODE)
def __init__(self, log, batch_id, kit_size, previous_pools, log_specifications,
             encodings, core_documents, parent_children, sentence_boundaries,
             image_boundaries, keyframe_boundaries, video_boundaries,
             runs_to_pool, queries, input_dir, output_dir):
    check_for_paths_existance([
        log_specifications,
        encodings,
        core_documents,
        parent_children,
        sentence_boundaries,
        image_boundaries,
        keyframe_boundaries,
        video_boundaries,
        runs_to_pool,
        queries,
        input_dir
        ])
    # use the batch_id parameter here: self.batch_id has not been assigned yet,
    # so self.get('batch_id') would not return the intended value
    check_for_paths_non_existance(['{}-{}'.format(output_dir, batch_id)])
    self.log_filename = log
    self.batch_id = batch_id
    self.kit_size = kit_size
    self.previous_pools = previous_pools
    self.log_specifications = log_specifications
    self.encodings = encodings
    self.core_documents = core_documents
    self.parent_children = parent_children
    self.sentence_boundaries = sentence_boundaries
    self.image_boundaries = image_boundaries
    self.keyframe_boundaries = keyframe_boundaries
    self.video_boundaries = video_boundaries
    self.runs_to_pool = runs_to_pool
    self.queries = queries
    self.input = input_dir
    self.output = output_dir
    self.logger = Logger(self.get('log_filename'),
                         self.get('log_specifications'),
                         sys.argv)
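# The self.get('name') calls used throughout these classes imply a shared accessor
# convention. A minimal sketch of one plausible base class behind it -- assuming
# get() prefers a get_<name>() method and falls back to the plain attribute; the
# actual base class in this codebase may differ.
class Object(object):
    def get(self, key, *args):
        # prefer a custom getter method, e.g. get_log_filename()
        method = getattr(self, 'get_{}'.format(key), None)
        if method is not None:
            return method(*args)
        # otherwise fall back to the attribute itself, None if absent
        return getattr(self, key, None)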
def main(args):
    """
    The main program for generating AIF.
    """
    check_paths(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    core_documents = CoreDocuments(logger, args.core_documents_filename)
    encodings = Encodings(logger, args.encodings_filename)
    document_mappings = DocumentMappings(logger, args.parent_children_filename,
                                         encodings, core_documents)
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries_filename)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries_filename)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries_filename)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries_filename)
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.type_mappings_filename):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    slot_mappings = SlotMappings(logger, args.slot_mappings_filename)
    annotations = Annotations(logger, slot_mappings, document_mappings,
                              text_boundaries, image_boundaries, video_boundaries,
                              keyframe_boundaries, type_mappings, args.annotations,
                              load_video_time_offsets_flag=args.notime)
    generator = AIFGenerator(logger, annotations, args.nochannel, args.reference_kb_id)
    generator.write_output(args.output)
    exit(ALLOK_EXIT_CODE)
def score_submission(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    gold_responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                                 document_mappings, document_boundaries,
                                 args.gold, 'gold')
    system_responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                                   document_mappings, document_boundaries,
                                   args.system, args.runid)
    cluster_alignment = ClusterAlignment(logger, args.alignment)
    cluster_self_similarities = ClusterSelfSimilarities(logger, args.similarities)
    scores = ScoresManager(logger, gold_responses, system_responses,
                           cluster_alignment, cluster_self_similarities,
                           args.separator)
    scores.print_scores(args.scores)
    exit(ALLOK_EXIT_CODE)
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    os.mkdir(args.sparql)
    columns = ['query_id', 'entrypoint_type', 'entrypoint', 'num_clusters', 'depth']
    queries_fh = open(args.queries, 'w')
    queries_fh.write('{}\n'.format('\t'.join(columns)))
    query_num = 0
    for entry in FileHandler(logger, args.input):
        query_num += 1
        values = {
            'depth': args.depth,
            'entrypoint_type': entry.get('entrypoint_type'),
            'entrypoint': entry.get('entrypoint'),
            'num_clusters': entry.get('num_clusters'),
            'query_id': '{prefix}{query_num}'.format(prefix=args.prefix,
                                                     query_num=augment(query_num))
            }
        line = '\t'.join([values[column] for column in columns])
        queries_fh.write('{}\n'.format(line))
        sparql_query_fh = open('{dir}/{query_id}.rq'.format(dir=args.sparql,
                                                            query_id=values['query_id']), 'w')
        sparql_query_fh.write(get_sparql(logger,
                                         values['query_id'],
                                         values['entrypoint_type'],
                                         values['entrypoint']))
        sparql_query_fh.close()
    queries_fh.close()
    exit(ALLOK_EXIT_CODE)
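# augment(query_num) above is defined elsewhere in the codebase; judging by its use
# in building fixed-width query IDs, it plausibly zero-pads the query number. A
# hypothetical stand-in (the name, width, and behavior are assumptions, not the
# original implementation):
def augment(query_num, width=3):
    """Zero-pad a query number, e.g. augment(7) -> '007'."""
    return str(query_num).zfill(width)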
def main(args):
    check_if_path_exists(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    ontology = Ontology(logger,
                        args.entities_ontology_filename,
                        args.relations_ontology_filename,
                        args.events_ontology_filename)
    mapping = {}
    for ere_container in [ontology.get('entities'),
                          ontology.get('relations'),
                          ontology.get('events')]:
        for spec in ere_container.values():
            full_type = spec.get('cleaned_full_type')
            full_type_ov = spec.get('cleaned_full_type_ov')
            if full_type is None or full_type_ov is None:
                continue
            mapping[full_type_ov] = full_type
    program_output = open(args.output_filename, 'w')
    program_output.write('full_type_ov\tfull_type\n')
    for full_type_ov, full_type in mapping.items():
        program_output.write('{}\t{}\n'.format(full_type_ov, full_type))
    program_output.close()
    exit(ALLOK_EXIT_CODE)
def validate_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    logger.record_event('DEFAULT_INFO', 'validation started')
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    queries = TA3QuerySet(logger, args.queries) if args.queries else None
    responses = ResponseSet(logger, document_mappings, document_boundaries,
                            args.input, args.runid, args.task, queries=queries)
    responses.write_valid_responses(args.output)
    num_warnings, num_errors = logger.get_stats()
    closing_message = 'validation finished (warnings:{}, errors:{})'.format(
        num_warnings, num_errors)
    logger.record_event('DEFAULT_INFO', closing_message)
    print(closing_message)
    if num_errors > 0:
        exit(ERROR_EXIT_CODE)
    exit(ALLOK_EXIT_CODE)
def __init__(self, log, log_specifications, queries, output):
    check_for_paths_existance([log_specifications, queries, output])
    self.log_filename = log
    self.log_specifications = log_specifications
    self.queries = queries
    self.output = output
    self.logger = Logger(self.get('log_filename'),
                         self.get('log_specifications'),
                         sys.argv)
def __init__(self, log_filename, log_specifications, task, input_dir, output_dir):
    check_for_paths_existance([log_specifications, input_dir])
    check_for_paths_non_existance([output_dir])
    self.log_filename = log_filename
    self.log_specifications = log_specifications
    self.task = task
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.logger = Logger(self.get('log_filename'),
                         self.get('log_specifications'),
                         sys.argv)
def __init__(self, log, batch_id, previous_pools, log_specifications,
             queries_to_pool, runs_to_pool, input_dir, output_dir):
    check_for_paths_existance([log_specifications, runs_to_pool,
                               queries_to_pool, input_dir])
    check_for_paths_non_existance(['{}-{}'.format(output_dir, batch_id)])
    self.log_filename = log
    self.batch_id = batch_id
    self.previous_pools = previous_pools
    self.log_specifications = log_specifications
    self.runs_to_pool = runs_to_pool
    self.queries_to_pool = queries_to_pool
    self.input = input_dir
    self.output = output_dir
    self.logger = Logger(self.get('log_filename'),
                         self.get('log_specifications'),
                         sys.argv)
def clean_sparql_output(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    # collect all SPARQL output files (*.tsv) under the input directory
    filenames = []
    for root, dirs, files in os.walk(args.input):
        filenames.extend([os.path.join(root, file)
                          for file in files
                          if file.endswith('.tsv')])
    os.mkdir(args.output)
    for input_filename in filenames:
        # recreate the file's subdirectory (relative to args.input) under args.output
        output_root = args.output
        output_basename = os.path.basename(input_filename)
        output_subdir = input_filename.replace(args.input, '').replace(
            output_basename, '').rstrip('/').lstrip('/')
        output_directory = '{}/{}'.format(output_root, output_subdir)
        output_filename = '{}/{}'.format(output_directory, output_basename)
        os.makedirs(output_directory, exist_ok=True)
        clean_a_sparql_output_file(logger, input_filename, output_filename)
    exit(ALLOK_EXIT_CODE)
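# The replace/strip chain above derives each file's subdirectory relative to
# args.input. A sketch of the same computation using os.path.relpath (a suggested
# equivalent, not the original code):
import os

def relative_subdir(input_filename, input_root):
    """Return the directory of input_filename relative to input_root ('' at the top level)."""
    subdir = os.path.relpath(os.path.dirname(input_filename), input_root)
    return '' if subdir == '.' else subdir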
def validate_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries, args.input, args.runid)
    responses.write_valid_responses(args.output)
    exit(ALLOK_EXIT_CODE)
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
        }
    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        full_type = '{type}.{subtype}.{subsubtype}'.format(type=type,
                                                           subtype=subtype,
                                                           subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            # expand to the full boundary of the document element
            document_boundary = document_boundaries.get(modality).get(document_element_id)
            span_string = str(document_boundary)
        elif '-' in span_string:
            # a text span like '12-34' becomes '(12,0)-(34,0)'
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            # a keyframe ID of the form '<document_element_id>_<keyframe_num>'
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(keyframe_id)
            span_string = str(document_boundary)
        else:
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
            }
        output.append(output_object)
    printed = {}
    fh = open(args.output, 'w')
    header = ['document_id', 'document_element_or_keyframe_id',
              'modality', 'region', 'type']
    fh.write('{}\n'.format('\t'.join(header)))
    # write each unique line once, in multi-key sorted order
    for output_object in multisort(output, (('document_id', False),
                                            ('modality', False),
                                            ('document_element_id', False),
                                            ('keyframe_num', False),
                                            ('region', False),
                                            ('type', False))):
        line = get_line(output_object, header)
        if line not in printed:
            fh.write('{}\n'.format(line))
            printed[line] = 1
    fh.close()
    exit(ALLOK_EXIT_CODE)
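# multisort() above is defined elsewhere in the codebase; its (field, reverse_flag)
# tuples suggest a stable multi-key sort with the most significant key first. A
# minimal sketch of that behavior under those assumptions: sort repeatedly from the
# least significant key to the most significant, relying on sort stability.
def multisort(items, specs):
    """Sort dicts by several (field, reverse) specs, most significant field first."""
    for field, reverse in reversed(specs):
        items = sorted(items, key=lambda item: item[field], reverse=reverse)
    return items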