Example #1
File: gui.py Project: SEVEZ/skonverter
    def run_apply_weighting_command(self):
        data = methods.load_json(self.file_path_field.getText())
        if not data:
            self.warning('Data is invalid')
            return False

        methods.apply_weighting(self.transform, data=data)
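In this snippet, load_json evidently returns a falsy value (e.g. None) when the file at the given path cannot be read or parsed; that is what triggers the warning and the early return before apply_weighting is called.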
def annotate_data(source_file, annotations_file, tag=False, compounds=False):
    "Function to annotate existing coco data"
    data = load_json(source_file)
    for entry in data:
        raw_description = entry['caption']
        doc = nlp.tokenizer(raw_description)
        entry['tokenized'] = [tok.orth_ for tok in doc]
        if tag:
            # Call the tagger on the document.
            nlp.tagger(doc)
            entry['tagged'] = [(tok.orth_, tok.tag_) for tok in doc]
        if compounds:
            list_of_compounds = compounds_from_doc(doc)
            entry['compounds'] = list_of_compounds
    save_json(data, annotations_file)
    return data
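A minimal usage sketch (the annotated.json path appears elsewhere on this page; the source path, and the assumption that the spaCy nlp object plus load_json/save_json are already in scope, are hypothetical):

annotated = annotate_data('./Data/Systems/Dai-et-al-2017/Val/generated.json',   # hypothetical source path
                          './Data/Systems/Dai-et-al-2017/Val/annotated.json',
                          tag=True,
                          compounds=True)
print(annotated[0]['tokenized'][:10])  # first ten tokens of the first caption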
def coverage(name, target):
    """
    Compute coverage for a specific system.
    
    This function is agnostic to whether you want coverage over entire Val or only
    the set of learnable types.
    """
    base = './Data/Systems/'
    path = base + name + '/Val/stats.json'
    system = load_json(path)
    gen = set(system['types'])
    recalled = gen & target
    return {
        "recalled": recalled,
        "score": len(recalled) / len(target),
        "not_in_val": gen - target
    }
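A minimal usage sketch (assumes load_json and the processed stats files referenced elsewhere on this page; the system name is one of those listed below):

# Coverage against the full Val vocabulary vs. only the learnable types
# (Train & Val), as the docstring allows.
train_stats = load_json('./Data/COCO/Processed/train_stats.json')
val_stats = load_json('./Data/COCO/Processed/val_stats.json')
val_types = set(val_stats['types'])
learnable = set(train_stats['types']) & val_types

print(coverage('Dai-et-al-2017', val_types)['score'])
print(coverage('Dai-et-al-2017', learnable)['score'])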
def run_all(args):
    "Run all metrics on the data and save JSON files with the results."
    # Annotate generated data.
    annotated = annotate_data(args.source_file,
                              args.annotations_file,
                              tag=True,
                              compounds=True)

    # Load training data. (For computing novelty.)
    train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
    train_descriptions = [
        entry['caption'] for entry in train_data['annotations']
    ]

    # Load annotated data.
    sentences = sentences_from_file(args.annotations_file)

    # Analyze the data.
    stats = system_stats(sentences)

    # Get raw descriptions.
    gen_descriptions = [
        entry['caption'] for entry in load_json(args.source_file)
    ]
    extra_stats = sentence_stats(train_descriptions, gen_descriptions)
    stats.update(extra_stats)

    # Save statistics data.
    save_json(stats, args.stats_file)

    ################################
    # Global recall

    train_stats = load_json('./Data/COCO/Processed/train_stats.json')
    val_stats = load_json('./Data/COCO/Processed/val_stats.json')

    train = set(train_stats['types'])
    val = set(val_stats['types'])
    learnable = train & val

    gen = set(stats['types'])
    recalled = gen & val

    coverage = {
        "recalled": recalled,
        "score": len(recalled) / len(learnable),
        "not_in_val": gen - learnable
    }

    coverage['omissions'] = most_frequent_omissions(
        coverage['recalled'],
        val_stats,  # Use validation set as reference.
        n=None)
    val_count_list = get_count_list(val_stats)
    coverage['percentiles'] = percentiles(val_count_list, recalled)
    save_json(coverage, args.global_coverage_file)

    ####################################
    # Local recall

    val_index = index_from_file('./Data/COCO/Processed/tagged_val2014.json',
                                tagged=True,
                                lower=True)
    generated = {entry['image_id']: entry['tokenized'] for entry in annotated}
    local_recall_res = dict(scores=local_recall_scores(generated, val_index),
                            counts=local_recall_counts(generated, val_index))
    save_json(local_recall_res, args.local_coverage_file)

    ##################################
    # Nouns pps
    npdata = {
        'pp_data': pp_stats(annotated),
        'compound_data': compound_stats(annotated)
    }
    save_json(npdata, args.noun_pp_file)
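A minimal invocation sketch (the attribute names on args come from run_all above; the argparse wiring itself is an assumption, not part of the original script):

import argparse

parser = argparse.ArgumentParser(description='Run all metrics for one system.')
parser.add_argument('--source_file')
parser.add_argument('--annotations_file')
parser.add_argument('--stats_file')
parser.add_argument('--global_coverage_file')
parser.add_argument('--local_coverage_file')
parser.add_argument('--noun_pp_file')
run_all(parser.parse_args())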
if __name__ == "__main__":
    system2label = {
        'Dai-et-al-2017': 'Dai et al. 2017',
        'Liu-et-al-2017': 'Liu et al. 2017',
        'Mun-et-al-2017': 'Mun et al. 2017',
        'Shetty-et-al-2016': 'Shetty et al. 2016',
        'Shetty-et-al-2017': 'Shetty et al. 2017',
        'Tavakoli-et-al-2017': 'Tavakoli et al. 2017',
        'Vinyals-et-al-2017': 'Vinyals et al. 2017',
        'Wu-et-al-2016': 'Wu et al. 2016',
        'Zhou-et-al-2017': 'Zhou et al. 2017'
    }

    system2color = dict(zip(sorted(system2label), my_palette))

    train_stats = load_json('./Data/COCO/Processed/train_stats.json')
    val_stats = load_json('./Data/COCO/Processed/val_stats.json')

    train = set(train_stats['types'])
    val = set(val_stats['types'])
    learnable = train & val

    limit = len(learnable) / len(val)
    size_limit = len(val) - len(learnable)
    print(
        f'The limit is: {limit}. This means {size_limit} words in Val cannot be learned.'
    )
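
    # For illustration (hypothetical numbers): if Val contained 10,000 types and
    # 7,500 of them also occurred in Train, the limit would be 0.75, i.e. 2,500
    # Val types could never be recalled by a system trained only on Train.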

    ################################################################################
    # Run the script.
def load_system_stats(name):
    "Load system stats based on the system name."
    base = './Data/Systems/'
    path = base + name + '/Val/stats.json'
    return load_json(path)


systems = {
    'Dai-et-al-2017': "Dai et al. 2017",
    'Liu-et-al-2017': "Liu et al. 2017",
    'Mun-et-al-2017': "Mun et al. 2017",
    'Shetty-et-al-2016': 'Shetty et al. 2016',
    'Shetty-et-al-2017': 'Shetty et al. 2017',
    'Tavakoli-et-al-2017': 'Tavakoli et al. 2017',
    'Vinyals-et-al-2017': 'Vinyals et al. 2017',
    'Wu-et-al-2016': 'Wu et al. 2016',
    'Zhou-et-al-2017': 'Zhou et al. 2017'
}

train_stats = load_json('./Data/COCO/Processed/train_stats.json')
val_stats = load_json('./Data/COCO/Processed/val_stats.json')
system_stats = {sys_name: load_system_stats(sys_name) for sys_name in systems}
bleu_meteor = load_json('./Data/Systems/bleu_meteor.json')
global_recall = load_json('./Data/Output/global_recall.json')
local_recall = load_json('./Data/Output/local_recall.json')

headers = [
    'System', 'BLEU', 'Meteor', "ASL", "SDSL", "Types", "TTR1", 'TTR2',
    'Novel', 'Cov', 'Loc5'
]
system_keys = [
    "average_sentence_length", 'std_sentence_length', "num_types",
    "type_token_ratio", 'bittr', 'percentage_novel'
]
corpus_keys = [
Example #9
def name_to_stats_path(name):
    "Get mapping based on system name."
    base = './Data/Systems/'
    path = base + name + '/Val/stats.json'
    return load_json(path)
Example #10
                             if word in not_learned}
    
    # Convert to counter.
    omissions = Counter(omissions)
    
    # Clean the data.
    del omissions['..']
    for char in punctuation + ' \n':
        del omissions[char]
    
    # Return most common omissions.
    top_n = omissions.most_common(n)
    return list_from_counts(top_n)


train_stats = load_json('./Data/COCO/Processed/train_stats.json')
val_stats = load_json('./Data/COCO/Processed/val_stats.json')

train     = set(train_stats['types'])
val       = set(val_stats['types'])
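# Start from the learnable vocabulary (types occurring in both Train and Val);
# the loop below removes every type that some system generated, so what remains
# are the types no system learned.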
not_learned = train & val

for name in systems:
    data = name_to_stats_path(name)
    not_learned -= set(data['types'])

global_train_ranking = get_top_n_omitted(train_stats, not_learned, n=ranking_length)
global_val_ranking   = get_top_n_omitted(val_stats, not_learned, n=ranking_length)


################################################################################
Example #11
def load_system_data(name):
    base = './Data/Systems/'
    path = base + name + '/Val/annotated.json'
    return load_json(path)
Example #12
def load_system_data(name):
    base = './Data/Systems/'
    path = base + name + '/Val/annotated.json'
    return load_json(path)


def get_keys(d, keys):
    return [d[key] for key in keys]


################################################################################
# Compute stats.

###########################
# Val

val_tagged = load_json('./Data/COCO/Processed/tagged_val2014.json')
parallel_entries = parallel_entries(val_tagged)
parallel_results = [
    depth_including_compounds(entries) for entries in parallel_entries
]
val_result = average_dicts(parallel_results)

parallel_histos = [
    get_depths_histogram(entries) for entries in parallel_entries
]
type_histos = [d['type_histogram'] for d in parallel_histos]
token_histos = [d['token_histogram'] for d in parallel_histos]
val_histo = dict(type_histogram=average_dicts(type_histos),
                 token_histogram=average_dicts(token_histos))

###########################
from methods import sentences_from_file, system_stats, load_json, save_json, sentence_stats

train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
train_descriptions = [entry['caption'] for entry in train_data['annotations']]

for folder in [
        'Dai-et-al-2017', 'Liu-et-al-2017', 'Mun-et-al-2017',
        'Shetty-et-al-2016', 'Shetty-et-al-2017', 'Tavakoli-et-al-2017',
        'Vinyals-et-al-2017', 'Wu-et-al-2016', 'Zhou-et-al-2017'
]:
    print('Processing:', folder)

    # Define source and target.
    base = './Data/Systems/'
    source = base + folder + '/Val/annotated.json'
    target = base + folder + '/Val/stats.json'

    # Load data.
    sentences = sentences_from_file(source)

    # Process data.
    stats = system_stats(sentences)

    # Get raw descriptions.
    gen_descriptions = [entry['caption'] for entry in load_json(source)]
    extra_stats = sentence_stats(train_descriptions, gen_descriptions)

    stats.update(extra_stats)

    # Save data.
    save_json(stats, target)
Example #14
from methods import parallel_sentences_from_file, parallel_stats, load_json, save_json, sentence_stats

train = parallel_sentences_from_file('./Data/COCO/Processed/tokenized_train2014.json',
                                     tagged=False,  # Don't load tags.
                                     lower=True)    # Lowercase all descriptions.

val   = parallel_sentences_from_file('./Data/COCO/Processed/tagged_val2014.json',
                                     tagged=False,  # Don't load tags.
                                     lower=True)    # Lowercase all descriptions.

# Compute stats for train and val data.
train_stats = parallel_stats(train)
val_stats   = parallel_stats(val)

# Extra stats.
train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
train_descriptions = [entry['caption'] for entry in train_data['annotations']]

val_data = load_json('./Data/COCO/Processed/tagged_val2014.json')
val_descriptions = [entry['caption'] for entry in val_data['annotations']]

extra_stats = sentence_stats(train_descriptions, val_descriptions)

val_stats.update(extra_stats)

# Save data to file.
save_json(train_stats, './Data/COCO/Processed/train_stats.json')
save_json(val_stats, './Data/COCO/Processed/val_stats.json')
Example #15
def load_system_stats(name):
    "Load system stats based on the system name."
    base = './Data/Systems/'
    path = base + name + '/Val/stats.json'
    return load_json(path)


systems = [
    'Dai-et-al-2017', 'Liu-et-al-2017', 'Mun-et-al-2017', 'Shetty-et-al-2016',
    'Shetty-et-al-2017', 'Tavakoli-et-al-2017', 'Vinyals-et-al-2017',
    'Wu-et-al-2016', 'Zhou-et-al-2017'
]

# Load the data
system_stats = {sys_name: load_system_stats(sys_name) for sys_name in systems}
global_recall = load_json('./Data/Output/global_recall.json')
local_recall = load_json('./Data/Output/local_recall.json')

# Values to be correlated.
system_keys = [
    "average_sentence_length", 'std_sentence_length', "num_types",
    "type_token_ratio", 'bittr', 'percentage_novel'
]

# Let's first index all scores by system.
# This is easiest to inspect, and we don't care about efficiency here.
result_rows = dict()
for system in systems:
    result_rows[system] = [system_stats[system][key] for key in system_keys]

    # Add local and global recall scores from their separate files.