Exemplo n.º 1
0
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored in the evaluation tarball.

    Only regular files whose archive path starts with
    ``GuidedSumm2011_eval/manual/pyramids`` are read. The part of the file name
    before the first ``.`` is split on ``-``: the first field (lowercased) is
    the instance id and the second field is the group.

    Args:
        eval_tar: Path to the tar archive to read.

    Returns:
        A mapping ``instance_id -> group -> Pyramid``. Pyramids whose group is
        ``AB`` are skipped (see the comment in the body).
    """
    pyramid_prefix = 'GuidedSumm2011_eval/manual/pyramids'
    loaded = defaultdict(dict)

    with tarfile.open(eval_tar, 'r') as archive:
        for entry in archive.getmembers():
            if not (entry.isfile() and entry.name.startswith(pyramid_prefix)):
                continue

            basename = entry.name.rsplit('/', 1)[-1]
            stem_fields = basename.partition('.')[0].split('-')
            instance_id = stem_fields[0].lower()
            group = stem_fields[1]

            # The AB pyramids are skipped. They appear to be broken, probably
            # because some characters were mis-encoded: the files contain a
            # strange sequence (b'\xef\xbf\xbds') for which
            # len(b'\xef\xbf\xbds') == 4 but len(b'\xef\xbf\xbds'.decode()) == 2,
            # which likely throws off locating the summary index from the
            # offsets. Fixing this would mean replacing that character in the
            # summaries (probably with "'") and recomputing all of the offsets.
            if group == 'AB':
                continue

            contents = tar_contents = archive.extractfile(entry).read().decode()
            loaded[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', contents)

    return loaded
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Read every pyramid under ``UpdateSumm09_eval/manual/pyramids`` in a tarball.

    For each regular file below that prefix, the file name up to its first
    ``.`` is split on ``-``; the first piece (lowercased) becomes the instance
    id and the second piece becomes the group.

    Args:
        eval_tar: Path to the tar archive to read.

    Returns:
        A mapping ``instance_id -> group -> Pyramid``.
    """
    result = defaultdict(dict)
    with tarfile.open(eval_tar, 'r') as archive:
        pyramid_members = (
            entry for entry in archive.getmembers()
            if entry.isfile()
            and entry.name.startswith('UpdateSumm09_eval/manual/pyramids')
        )
        for entry in pyramid_members:
            # Only the final path component carries the instance id and group
            basename = entry.name.split('/')[-1]
            id_and_group = basename.split('.')[0].split('-')
            instance_id = id_and_group[0].lower()
            group = id_and_group[1]

            contents = archive.extractfile(entry).read().decode()
            result[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', contents)

    return result
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per file found under ``allpyramids/`` in a tarball.

    The instance id is the part of the file name before the first ``.``,
    lowercased.

    Args:
        pyramid_tar: Path to the tar archive to read.

    Returns:
        A flat mapping ``instance_id -> Pyramid``. If two files yield the same
        instance id, the later one in the archive wins.
    """
    # The pattern contains real newline characters around a regex fragment.
    # The fragment is written as a raw string so that ``\s`` and ``\.`` are not
    # treated as (invalid) string escapes; the resulting pattern text is
    # identical to what the non-raw literal used to produce.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile() and member.name.startswith('allpyramids/'):
                filename = member.name.split('/')[-1]
                instance_id = filename.split('.')[0].lower()

                xml = tar.extractfile(member).read().decode()
                pyramids[instance_id] = Pyramid.from_xml(
                    f'{instance_id}',
                    xml,
                    default_document_regex=document_regex)

    return pyramids
Exemplo n.º 4
0
def load_update_pyramids(update_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids found under ``updateEval/Pyramid/allpyramids`` in a tarball.

    Each file name is split on ``-``: the first field (lowercased) is the
    instance id and the second field is the group.

    Args:
        update_tar: Path to the tar archive to read.

    Returns:
        A mapping ``instance_id -> group -> Pyramid``.
    """
    # The pattern contains real newline characters around a regex fragment.
    # The fragment is written as a raw string so that ``\.`` is not treated as
    # an (invalid) string escape; the resulting pattern text is identical to
    # what the non-raw literal used to produce.
    document_regex = (
        '[-]*\n'
        r' D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = defaultdict(dict)
    with tarfile.open(update_tar, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile() and member.name.startswith(
                    'updateEval/Pyramid/allpyramids'):
                filename = member.name.split('/')[-1]
                parts = filename.split('-')
                instance_id = parts[0].lower()
                group = parts[1]

                xml = tar.extractfile(member).read().decode()
                pyramids[instance_id][group] = Pyramid.from_xml(
                    f'{instance_id}-{group}',
                    xml,
                    default_document_regex=document_regex)

    return pyramids
Exemplo n.º 5
0
    def score(self, annotation: PyramidAnnotation,
              pyramid: Pyramid) -> MetricsDict:
        """Compute the modified pyramid score of an annotated summary.

        The score is the total weight of the annotation's SCUs divided by the
        weight of an ideal summary. The ideal summary contains as many SCUs as
        the pyramid's summaries have on average (rounded up), taken greedily
        from the heaviest weights down.

        Args:
            annotation: The annotated summary; each of its ``scus`` is looked
                up in the pyramid by ``scu_id``.
            pyramid: The pyramid supplying the SCU weights and the
                per-summarizer SCU id sets.

        Returns:
            A ``MetricsDict`` with one entry keyed by ``self.name``. The value
            is ``0.0`` when the pyramid has no summarizers or when the ideal
            weight is zero, instead of failing with ``ZeroDivisionError``.
        """
        # Create a mapping from the SCU id to its weight and count how many are at each weight
        scu_id_to_weight = {}
        weight_to_num_scus = Counter()
        for scu in pyramid.scus:
            weight = scu.get_weight()
            scu_id_to_weight[scu.scu_id] = weight
            weight_to_num_scus[weight] += 1

        # Calculate the total weight of the SCUs in the annotation. It's possible the SCU id
        # isn't in the Pyramid, for example, if we are doing jackknifing and the reference
        # corresponding to an SCU of weight 1 was removed; such SCUs contribute nothing.
        total_weight = sum(
            scu_id_to_weight.get(scu.scu_id, 0) for scu in annotation.scus)

        # Without any pyramid summaries there is no average to compute
        num_summarizers = len(pyramid.summarizer_ids)
        if num_summarizers == 0:
            return MetricsDict({self.name: 0.0})

        # Calculate the average number of SCUs in the pyramid summaries
        total_scus = sum(
            len(pyramid.get_scu_id_set(i)) for i in range(num_summarizers))
        average_num_scus = total_scus / num_summarizers

        # Calculate the weight of an ideal summary with `average_num_scus` SCUs
        ideal_weight = 0
        scus_remaining = int(math.ceil(average_num_scus))
        for weight in sorted(weight_to_num_scus, reverse=True):
            if scus_remaining <= 0:
                break
            num_scus_taken = min(scus_remaining, weight_to_num_scus[weight])
            ideal_weight += num_scus_taken * weight
            scus_remaining -= num_scus_taken

        # An ideal weight of zero (e.g. a pyramid with no SCUs) would make the ratio undefined
        if ideal_weight == 0:
            return MetricsDict({self.name: 0.0})

        # The modified pyramid score is the ratio of the weight to the ideal weight
        return MetricsDict({self.name: total_weight / ideal_weight})
Exemplo n.º 6
0
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per instance from the files under ``pans/`` in a tarball.

    The instance id is the second ``.``-separated field of the file name,
    lowercased.

    Args:
        pyramid_tar: Path to the tar archive to read.

    Returns:
        A mapping ``instance_id -> Pyramid``, built from the first file
        encountered for each instance id.
    """
    # The pattern contains real newline characters around a regex fragment.
    # The fragment is written as a raw string so that ``\s`` and ``\.`` are not
    # treated as (invalid) string escapes; the resulting pattern text is
    # identical to what the non-raw literal used to produce.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile() and member.name.startswith('pans/'):
                filename = member.name.split('/')[-1]
                instance_id = filename.split('.')[1].lower()

                # For this dataset, the pyramid and annotations are all in the same file, one per
                # annotation. Therefore, we only need to load the pyramid once from the first file
                if instance_id in pyramids:
                    continue

                xml = tar.extractfile(member).read().decode()
                pyramids[instance_id] = Pyramid.from_xml(
                    instance_id,
                    xml,
                    default_document_regex=document_regex,
                    is_combined_file=True)

    return pyramids
Exemplo n.º 7
0
 def _get_scu_intersection(self, annotation: PyramidAnnotation, pyramid: Pyramid, index: int) -> Set[int]:
     """Return the SCU ids shared by the annotation and one pyramid summary.

     Args:
         annotation: The annotation whose SCU id set is taken.
         pyramid: The pyramid queried for an SCU id set.
         index: The index passed to ``pyramid.get_scu_id_set``.

     Returns:
         The intersection of the two SCU id sets.
     """
     return annotation.get_scu_id_set() & pyramid.get_scu_id_set(index)