예제 #1
0
def build_glove(graph):
    """Register the label and vector extraction steps for each GloVe version.

    For every entry of ``CONFIG['glove-versions']`` two ``Dep`` objects are
    stored under ``graph['build_glove']``: one under ``'build_glove_labels'``
    and one under ``'build_glove_vecs'``, both keyed by the version. Both
    take the version's ``.txt`` file in the source-data directory as input.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    for glove_version in CONFIG['glove-versions']:
        source_txt = CONFIG['source-data-path'] + '%s.txt' % glove_version

        labels_dep = Dep(
            source_txt, GloveLabels(version=glove_version), 'glove_to_labels')
        graph['build_glove']['build_glove_labels'][glove_version] = labels_dep

        vecs_dep = Dep(
            source_txt, GloveVectors(version=glove_version), 'glove_to_vecs')
        graph['build_glove']['build_glove_vecs'][glove_version] = vecs_dep
예제 #2
0
def standardize_ppdb(graph):
    """Register the PPDB standardization and ConceptNet/PPDB combination steps.

    Two entries are written directly on ``graph``:

    * ``'standardize_ppdb'``: from ``ppdb-xl-lexical.csv`` in the source-data
      directory to ``ppdb-xl-lexical-standardized.csv`` in the build-data
      directory, tagged ``'standardize_assoc'``.
    * ``'combine_cnet_ppdb'``: from the ConceptNet source file plus the
      standardized PPDB file to ``cnet-ppdb-combined.csv`` in the build-data
      directory, tagged ``'concatenate'``.

    Args:
        graph: Mapping that receives the two ``Dep`` entries.
    """
    source_dir = CONFIG['source-data-path']
    build_dir = CONFIG['build-data-path']

    # The output of the first step is one of the inputs of the second.
    standardized_ppdb = build_dir + 'ppdb-xl-lexical-standardized.csv'

    graph['standardize_ppdb'] = Dep(
        source_dir + 'ppdb-xl-lexical.csv',
        standardized_ppdb,
        'standardize_assoc')

    combined_inputs = [source_dir + CONCEPTNET_SOURCE_FILE, standardized_ppdb]
    graph['combine_cnet_ppdb'] = Dep(
        combined_inputs,
        build_dir + 'cnet-ppdb-combined.csv',
        'concatenate')
예제 #3
0
def retrofit(graph):
    """Register the retrofitting steps and, optionally, the filtering steps.

    Three groups of ``Dep`` entries may be written:

    * ``graph['assoc_to_labels']['conceptnet5']``: from the ConceptNet source
      file to a ``GloveLabels`` whose version is ``'conceptnet5'``.
    * ``graph['retrofit'][version][norm][network]``: for every embedding
      version (GloVe, word2vec and extra embeddings from ``CONFIG``) and every
      entry of ``CONFIG['retrofit-items']``, from the standardized, normalized
      vectors plus the ``<version>.<network>.self_loops.npz`` file to the
      retrofitted vectors.
    * ``graph['filter_vecs'][version][network]``: only when
      ``CONFIG['run-filter']`` is truthy, from the retrofitted labels, the
      l1-normalized retrofitted vectors and the network's labels to the
      ``'filtered'`` labels, vectors and replacements.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    conceptnet = 'conceptnet5'
    graph['assoc_to_labels'][conceptnet] = Dep(
        CONFIG['source-data-path'] + CONCEPTNET_SOURCE_FILE,
        GloveLabels(version=conceptnet), 'assoc_to_labels')

    embeddings = (CONFIG['glove-versions'] + CONFIG['word2vec-versions'] +
                  CONFIG['extra-embeddings'])
    for version in embeddings:
        for network in CONFIG['retrofit-items']:
            # Networks named 'conceptnet5-...' are the dataset-dropout
            # experiments; those are retrofitted with the l1 norm only.
            norms = ['l1'] if 'conceptnet5-' in network else ['l1', 'l2']
            for norm in norms:
                normalized_vecs = GloveVectors(
                    version=version,
                    standardization='standardized',
                    normalization=norm)
                self_loops_file = (
                    CONFIG['build-data-path'] +
                    '%s.%s.self_loops.npz' % (version, network))
                retrofitted_vecs = GloveVectors(
                    version=version,
                    standardization='standardized',
                    retrofit=network,
                    normalization=norm)
                graph['retrofit'][version][norm][network] = Dep(
                    [normalized_vecs, self_loops_file],
                    retrofitted_vecs,
                    'retrofit')

            if not CONFIG['run-filter']:
                continue

            filter_inputs = [
                GloveLabels(
                    version=version,
                    standardization='standardized',
                    retrofit=network),
                GloveVectors(
                    version=version,
                    standardization='standardized',
                    retrofit=network,
                    normalization='l1'),
                GloveLabels(version=network),
            ]
            filter_outputs = [
                GloveLabels(
                    version=version,
                    standardization='filtered',
                    retrofit=network),
                GloveVectors(
                    version=version,
                    standardization='filtered',
                    retrofit=network,
                    normalization='l1'),
                GloveReplacements(
                    version=version,
                    standardization='filtered',
                    retrofit=network),
            ]
            graph['filter_vecs'][version][network] = Dep(
                filter_inputs, filter_outputs, 'filter_vecs')
예제 #4
0
def test(graph):
    """Register an evaluation step for every ``GloveVectors`` output.

    The ``GloveVectors`` objects yielded by ``outputs(graph)`` are grouped by
    ``(version, standardization, retrofit)``. Groups whose version starts with
    ``'combo'`` and whose standardization is ``'raw'`` are skipped. For each
    remaining vector file a ``Dep`` is stored at
    ``graph['test']['test_<file>']``; its inputs are the group's
    ``GloveLabels`` and the vector file, and its output is a shallow copy of
    the vector file with ``filetype`` set to ``'evaluation'``.

    Args:
        graph: Mapping that is scanned for existing outputs and that receives
            the new ``Dep`` entries under ``'test'``.
    """
    by_variant = defaultdict(list)
    for candidate in outputs(graph):
        if isinstance(candidate, GloveVectors):
            variant = (candidate.version, candidate.standardization,
                       candidate.retrofit)
            by_variant[variant].append(candidate)

    for variant, vectors in by_variant.items():
        version, standardization, retrofit_name = variant
        if version.startswith('combo') and standardization == 'raw':
            continue

        label = GloveLabels(
            version=version,
            standardization=standardization,
            retrofit=retrofit_name)

        for vec in vectors:
            evaluation = copy.copy(vec)
            evaluation.filetype = 'evaluation'

            deps = [label, vec]
            # HACK: one specific label file additionally gets its
            # replacements file as an input, matched by its literal path.
            if str(label) == (
                    CONFIG['build-data-path'] +
                    'glove.840B.300d.filtered.conceptnet5.labels'):
                deps.append(
                    CONFIG['build-data-path'] +
                    'glove.840B.300d.filtered.conceptnet5.replacements.msgpack'
                )
            graph['test']['test_%s' % vec] = Dep(deps, evaluation, 'test')
예제 #5
0
def filter_conceptnet(graph):
    """Register the positive and negative ConceptNet filtering steps.

    For each dataset in ``CONFIG['pos-filters']`` a ``Dep`` is stored at
    ``graph['filter_assoc']['pos'][dataset]`` with output
    ``conceptnet5-<dataset>-only.csv``; for each dataset in
    ``CONFIG['neg-filters']`` one is stored at
    ``graph['filter_assoc']['neg'][dataset]`` with output
    ``conceptnet5-minus-<dataset>.csv``. Every step reads the ConceptNet
    source file and passes ``regex_for_dataset(dataset)`` as the ``'filter'``
    parameter.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    # (graph key, CONFIG key, output file template, step name); the positive
    # filters are registered before the negative ones.
    variants = (
        ('pos', 'pos-filters', 'conceptnet5-%s-only.csv', 'filter_assoc_pos'),
        ('neg', 'neg-filters', 'conceptnet5-minus-%s.csv', 'filter_assoc_neg'),
    )
    for polarity, config_key, name_template, step_name in variants:
        for dataset in CONFIG[config_key]:
            pattern = regex_for_dataset(dataset)
            graph['filter_assoc'][polarity][dataset] = Dep(
                CONFIG['source-data-path'] + CONCEPTNET_SOURCE_FILE,
                CONFIG['build-data-path'] + name_template % dataset,
                step_name,
                params={'filter': pattern})
예제 #6
0
def build_word2vec(graph):
    """Register the label/vector extraction step for each word2vec version.

    For every entry of ``CONFIG['word2vec-versions']`` a single ``Dep`` is
    stored at ``graph['build_word2vec'][version]``. Its input is the
    version's ``.bin.gz`` file in the source-data directory and its outputs
    are the ``GloveLabels`` and ``GloveVectors`` for that version.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed two
            levels deep without being initialised here, so it presumably
            creates missing levels on access (verify at caller).
    """
    for w2v_version in CONFIG['word2vec-versions']:
        archive = CONFIG['source-data-path'] + '%s.bin.gz' % w2v_version
        targets = [
            GloveLabels(version=w2v_version),
            GloveVectors(version=w2v_version),
        ]
        graph['build_word2vec'][w2v_version] = Dep(
            archive, targets, 'w2v_to_vecs')
예제 #7
0
def add_self_loops(graph):
    """Register the self-loop step for every embedding/network pair.

    For each embedding version (GloVe, word2vec and extra embeddings from
    ``CONFIG``) and each entry of ``CONFIG['retrofit-items']`` a ``Dep`` is
    stored at ``graph['add_self_loops'][network][version]``. It goes from
    ``<version>.<network>.npz`` to ``<version>.<network>.self_loops.npz``,
    both in the build-data directory.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    embeddings = (CONFIG['glove-versions'] + CONFIG['word2vec-versions'] +
                  CONFIG['extra-embeddings'])
    for version in embeddings:
        for network in CONFIG['retrofit-items']:
            pair = (version, network)
            plain_file = CONFIG['build-data-path'] + '%s.%s.npz' % pair
            looped_file = (
                CONFIG['build-data-path'] + '%s.%s.self_loops.npz' % pair)
            graph['add_self_loops'][network][version] = Dep(
                plain_file, looped_file, 'add_self_loops')
예제 #8
0
def standardize_glove(graph):
    """Register the standardization step for each GloVe and word2vec version.

    For every version in ``CONFIG['glove-versions']`` followed by
    ``CONFIG['word2vec-versions']`` a ``Dep`` is stored at
    ``graph['standardize_glove'][version]``. It goes from the version's
    ``GloveLabels`` and ``GloveVectors`` to the same pair created with
    ``standardization='standardized'``.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed two
            levels deep without being initialised here, so it presumably
            creates missing levels on access (verify at caller).
    """
    versions = CONFIG['glove-versions'] + CONFIG['word2vec-versions']
    for version in versions:
        sources = [
            GloveLabels(version=version),
            GloveVectors(version=version),
        ]
        targets = [
            GloveLabels(version=version, standardization='standardized'),
            GloveVectors(version=version, standardization='standardized'),
        ]
        graph['standardize_glove'][version] = Dep(
            sources, targets, 'standardize_vecs')
예제 #9
0
def normalize_glove(graph):
    """Register the l1 and l2 normalization steps for every embedding.

    For each embedding version (GloVe, word2vec and extra embeddings from
    ``CONFIG``), each norm in ``('l1', 'l2')`` and each standardization in
    ``('raw', 'standardized')`` a ``Dep`` is stored at
    ``graph['normalize_glove'][version][norm][standardization]``. It goes
    from the un-normalized ``GloveVectors`` to the normalized one and is
    tagged ``'<norm>_normalize'``. Versions whose name starts with
    ``'combo'`` get no ``'raw'`` entry.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    embeddings = (CONFIG['glove-versions'] + CONFIG['word2vec-versions'] +
                  CONFIG['extra-embeddings'])
    for version in embeddings:
        # 'combo' versions are only normalized in their standardized form.
        if version.startswith('combo'):
            standardizations = ('standardized',)
        else:
            standardizations = ('raw', 'standardized')

        for norm in ('l1', 'l2'):
            for s13n in standardizations:
                unnormalized = GloveVectors(
                    version=version, standardization=s13n)
                normalized = GloveVectors(
                    version=version,
                    normalization=norm,
                    standardization=s13n)
                graph['normalize_glove'][version][norm][s13n] = Dep(
                    unnormalized, normalized, '%s_normalize' % norm)
예제 #10
0
def build_assoc(graph):
    """Register the network-to-association step for every embedding/network.

    For each embedding version (GloVe, word2vec and extra embeddings from
    ``CONFIG``) and each entry of ``CONFIG['retrofit-items']`` a ``Dep`` is
    stored at ``graph['network_to_assoc'][version][network]``. Its inputs are
    the version's standardized ``GloveLabels`` and ``<network>.csv``; its
    outputs are the retrofitted ``GloveLabels`` and
    ``<version>.<network>.npz`` in the build-data directory.

    Args:
        graph: Mapping that receives the ``Dep`` entries. It is indexed
            several levels deep without being initialised here, so it
            presumably creates missing levels on access (verify at caller).
    """
    embeddings = (CONFIG['glove-versions'] + CONFIG['word2vec-versions'] +
                  CONFIG['extra-embeddings'])
    for version in embeddings:
        for network in CONFIG['retrofit-items']:
            # The 'conceptnet5' CSV is read from the source-data directory;
            # every other network's CSV is read from the build-data directory.
            if network == 'conceptnet5':
                csv_dir = CONFIG['source-data-path']
            else:
                csv_dir = CONFIG['build-data-path']

            sources = [
                GloveLabels(version=version, standardization='standardized'),
                csv_dir + network + '.csv',
            ]
            targets = [
                GloveLabels(
                    version=version,
                    standardization='standardized',
                    retrofit=network),
                CONFIG['build-data-path'] + '%s.%s.npz' % (version, network),
            ]
            graph['network_to_assoc'][version][network] = Dep(
                sources, targets, 'network_to_assoc')
예제 #11
0
def latex_results(graph):
    """Register the step that turns the test outputs into a LaTeX file.

    Stores one ``Dep`` at ``graph['latex_results']``. Its input is
    ``outputs(graph['test'])``, its output is ``evaluations.latex`` in the
    build-data directory, and it is tagged ``'tests_to_latex'``.

    Args:
        graph: Mapping whose ``'test'`` entry is read and which receives the
            new ``'latex_results'`` entry.
    """
    graph['latex_results'] = Dep(
        outputs(graph['test']),
        CONFIG['build-data-path'] + 'evaluations.latex', 'tests_to_latex')