Example #1
def test_wiktionary_extraction():
    input_file = data_path("input/wiktionary.xml")
    reference_output = data_path("output/en_wiktionary.msgpack")
    with TemporaryDirectory() as tempdir:
        extract_wiktionary.handle_file(input_file, tempdir, "en", nfiles=1)
        reference_output = list(read_msgpack_stream(reference_output))
        actual_output = list(read_msgpack_stream(os.path.join(tempdir, "wiktionary_00.msgpack")))
        reference_output.sort(key=lambda x: x["title"])
        actual_output.sort(key=lambda x: x["title"])

        for (expected, actual) in zip_longest(reference_output, actual_output):
            eq_(expected, actual)
Example #2
def test_wiktionary_extraction():
    input_file = data_path('input/wiktionary.xml')
    reference_output = data_path('output/en_wiktionary.msgpack')
    with TemporaryDirectory() as tempdir:
        extract_wiktionary.handle_file(input_file, tempdir, 'en', nfiles=1)
        reference_output = list(read_msgpack_stream(reference_output))
        actual_output = list(
            read_msgpack_stream(os.path.join(tempdir,
                                             'wiktionary_00.msgpack')))
        reference_output.sort(key=lambda x: x['title'])
        actual_output.sort(key=lambda x: x['title'])

        for (expected, actual) in zip_longest(reference_output, actual_output):
            eq_(expected, actual)
Example #3
def index_assertions(input_dir, output_db, input_shards=8, output_shards=8):
    for writer_index in range(output_shards):
        print("Writing shard #%d" % writer_index)
        dbname = '%s.%d' % (output_db, writer_index)
        writer = EdgeIndexWriter(dbname,
                                 writer_index,
                                 output_shards,
                                 clear=True,
                                 allow_apsw=True)
        for filenum in range(input_shards):
            filename = 'part_%02d.msgpack' % filenum
            path = os.path.join(input_dir, filename)
            print("\tIndexing %s" % filename, end='')
            sys.stdout.flush()
            count = 0
            with writer.transaction():
                for assertion, offset in read_msgpack_stream(path,
                                                             offsets=True):
                    writer.add(assertion, filenum, offset)
                    count += 1
                    if count % 10000 == 0:
                        print('.', end='')
                        sys.stdout.flush()
            print()
        writer.close()
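Every example on this page iterates over read_msgpack_stream, which yields one decoded object per record of a msgpack stream file; Example #3 above additionally passes offsets=True to get the byte offset of each record for indexing. For orientation, here is a minimal sketch of such a reader built on the msgpack package. It is an illustration of the expected behaviour, not the ConceptNet implementation; the raw=False flag and the OutOfData handling are assumptions about how the stream is decoded.

import msgpack


def read_msgpack_stream_sketch(filename, offsets=False):
    """Hypothetical reader: yield each object in a msgpack stream file,
    optionally together with its starting byte offset."""
    with open(filename, 'rb') as stream:
        unpacker = msgpack.Unpacker(stream, raw=False)
        while True:
            # Bytes consumed so far = offset where the next record starts.
            offset = unpacker.tell()
            try:
                obj = unpacker.unpack()
            except msgpack.OutOfData:
                return
            if offsets:
                yield obj, offset
            else:
                yield obj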
Example #4
def convert_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept associations.

    The relation is mostly ignored, except:

    - Negative relations create associations between concepts suffixed with '/neg'
    - An assertion that means "People want X" in English or Chinese is converted to
      an assertion between X and "good", and also X and the negation of "bad"
    - Combining both of these, an assertion that "People don't want X" moves the
      negation so that X is associated with "not good" and "bad".

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.
    
    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    
    for info in read_msgpack_stream(input_filename):
        startc = reduce_concept(info['start'])
        endc = reduce_concept(info['end'])
        rel = info['rel']
        weight = info['weight']

        if 'dbpedia' in info['source_uri'] and '/or/' not in info['source_uri']:
            # DBPedia associations are still too numerous and too weird to
            # associate.
            continue

        pairs = []
        if startc == '/c/en/person':
            if rel == '/r/Desires':
                pairs = [('/c/en/good', endc), ('/c/en/bad/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/en/bad', endc), ('/c/en/good/neg', endc)]
            else:
                pairs = [(startc, endc)]
        elif startc == '/c/zh/人':
            if rel == '/r/Desires':
                pairs = [('/c/zh/良好', endc), ('/c/zh/不良/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/zh/良好/neg', endc), ('/c/zh/不良', endc)]
            else:
                pairs = [(startc, endc)]
        else:
            negated = (rel.startswith('/r/Not') or rel.startswith('/r/Antonym'))
            if not negated:
                pairs = [(startc, endc)]
            else:
                pairs = [(startc, endc + '/neg'), (startc + '/neg', endc)]

        for (start, end) in pairs:
            line = "%(start)s\t%(end)s\t%(weight)s" % {
                'start': start,
                'end': end,
                'weight': weight,
            }
            print(line, file=out_stream)
Example #5
def msgpack_to_json(input_filename, output_filename):
    """
    Convert a msgpack stream to a JSON stream (with one object per line).
    """
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example #6
def convert_to_tab_separated(input_filename, output_filename):
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    for info in read_msgpack_stream(input_filename):
        if info.get('surfaceText') is None:
            info['surfaceText'] = ''
        info['weight'] = str(info['weight'])
        columns = [
            'uri', 'rel', 'start', 'end', 'context', 'weight', 'source_uri',
            'id', 'dataset', 'surfaceText'
        ]
        column_values = [info.get(col) for col in columns]
        line = '\t'.join(column_values)
        assert '\n' not in line
        print(line, file=out_stream)
Example #7
def msgpack_to_csv(input_filename, output_filename):
	out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
	for info in read_msgpack_stream(input_filename):
		if info.get('surfaceText') is None:
			info['surfaceText'] = ''
		info['weight'] = str(info['weight'])
		if info.get('context') is not None:
			columns = ['uri', 'rel', 'start', 'end', 'context', 'weight', 'source_uri', 'id', 'dataset', 'surfaceText']
		else:
			columns = ['uri', 'rel', 'start', 'end', 'weight', 'source_uri', 'id', 'dataset', 'surfaceText']
		column_values = [info.get(col) for col in columns]
		line = '\t'.join(column_values)
		assert '\n' not in line
		print(line, file=out_stream)
Example #8
def test_json_to_msgpack():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')
        
        writer = JSONStreamWriter(json_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        json_to_msgpack(json_path, msgpack_path)
        reader = read_msgpack_stream(msgpack_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
Example #9
def run_wiktionary(input_file, output_file, titledb=None, language='en',
                   verbosity=0, logger=None):
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language, titledb=titledb, trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
Example #10
def run_wiktionary(input_file,
                   output_file,
                   titledb=None,
                   language='en',
                   verbosity=0,
                   logger=None):
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language,
                              titledb=titledb,
                              trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
Example #11
def index_assertions(input_dir, output_db, input_shards=8, output_shards=8):
    for writer_index in range(output_shards):
        print("Writing shard #%d" % writer_index)
        dbname = '%s.%d' % (output_db, writer_index)
        writer = EdgeIndexWriter(dbname, writer_index, output_shards,
                                 clear=True, allow_apsw=True)
        for filenum in range(input_shards):
            filename = 'part_%02d.msgpack' % filenum
            path = os.path.join(input_dir, filename)
            print("\tIndexing %s" % filename, end='')
            sys.stdout.flush()
            count = 0
            with writer.transaction():
                for assertion, offset in read_msgpack_stream(path, offsets=True):
                    writer.add(assertion, filenum, offset)
                    count += 1
                    if count % 10000 == 0:
                        print('.', end='')
                        sys.stdout.flush()
            print()
        writer.close()
Example #12
def msgpack_to_tab_separated(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV".
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        for info in read_msgpack_stream(input_filename):
            columns = ['uri', 'rel', 'start', 'end']
            extra_info = {
                'weight': round(info['weight'], 3),
                'sources': info['sources'],
                'dataset': info['dataset'],
                'license': info['license']
            }
            for extra_key in 'surfaceText', 'surfaceStart', 'surfaceEnd':
                if info.get(extra_key):
                    extra_info[extra_key] = info[extra_key]

            json_info = json.dumps(extra_info, ensure_ascii=False, sort_keys=True)
            column_values = [info[col] for col in columns] + [json_info]
            line = '\t'.join(column_values)
            assert '\n' not in line
            print(line, file=out_stream)
Example #13
def convert_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept associations.

    The relation is mostly ignored, except:

    - Negative relations create associations between concepts suffixed with '/neg'
    - An assertion that means "People want X" in English or Chinese is converted to
      an assertion between X and "good", and also X and the negation of "bad"
    - Combining both of these, an assertion that "People don't want X" moves the
      negation so that X is associated with "not good" and "bad".

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.
    
    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')

    for info in read_msgpack_stream(input_filename):
        startc = reduce_concept(info['start'])
        endc = reduce_concept(info['end'])
        rel = info['rel']
        weight = info['weight']

        if 'dbpedia' in info['source_uri'] and '/or/' not in info['source_uri']:
            # DBPedia associations are still too numerous and too weird to
            # associate.
            continue

        pairs = []
        if startc == '/c/en/person':
            if rel == '/r/Desires':
                pairs = [('/c/en/good', endc), ('/c/en/bad/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/en/bad', endc), ('/c/en/good/neg', endc)]
            else:
                pairs = [(startc, endc)]
        elif startc == '/c/zh/人':
            if rel == '/r/Desires':
                pairs = [('/c/zh/良好', endc), ('/c/zh/不良/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/zh/良好/neg', endc), ('/c/zh/不良', endc)]
            else:
                pairs = [(startc, endc)]
        else:
            negated = (rel.startswith('/r/Not')
                       or rel.startswith('/r/Antonym'))
            if not negated:
                pairs = [(startc, endc)]
            else:
                pairs = [(startc, endc + '/neg'), (startc + '/neg', endc)]

        for (start, end) in pairs:
            line = "%(start)s\t%(end)s\t%(weight)s" % {
                'start': start,
                'end': end,
                'weight': weight,
            }
            print(line, file=out_stream)
Example #14
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
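To make the special case described in the docstring concrete, the following hedged snippet traces what the loop body above does with a single hypothetical edge (all field values are made up for illustration):

# Hypothetical input edge: "People want to relax"
info = {'start': '/c/en/person', 'end': '/c/en/relax',
        'rel': '/r/Desires', 'weight': 1.0,
        'dataset': '/d/conceptnet/4/en'}

# The rewrite turns it into an association meaning "relax is good"
if info['start'] in ('/c/en/person', '/c/en/people') and info['rel'] == '/r/Desires':
    pairs = [('/c/en/good', info['end'])]
else:
    pairs = [(info['start'], info['end'])]

print(pairs)  # [('/c/en/good', '/c/en/relax')]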
Example #15
def assertions_to_sql_csv(msgpack_filename, output_dir):
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_edge_sources = output_dir + '/edge_sources.csv'
    output_node_prefixes = output_dir + '/node_prefixes.csv'
    output_features = output_dir + '/edge_features.csv'

    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()
    seen_prefixes = set()

    edge_file = open(output_edges, 'w', encoding='utf-8')
    edge_source_file = open(output_edge_sources, 'w', encoding='utf-8')
    node_prefix_file = open(output_node_prefixes, 'w', encoding='utf-8')
    feature_file = open(output_features, 'w', encoding='utf-8')

    for assertion in read_msgpack_stream(msgpack_filename):
        if assertion['uri'] in assertion_list:
            continue
        assertion_idx = assertion_list.add(assertion['uri'])
        rel_idx = relation_list.add(assertion['rel'])
        start_idx = node_list.add(assertion['start'])
        end_idx = node_list.add(assertion['end'])

        source_indices = []
        sources = assertion['sources']
        for source in sources:
            for sourceval in sorted(source.values()):
                source_idx = source_list.add(sourceval)
                source_indices.append(source_idx)

        jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
        weight = assertion['weight']
        write_row(
            edge_file,
            [assertion_idx, assertion['uri'],
             rel_idx, start_idx, end_idx,
             weight, jsondata]
        )
        for node in (assertion['start'], assertion['end'], assertion['dataset']):
            write_prefixes(node_prefix_file, seen_prefixes, node_list, node)
        for source_idx in sorted(set(source_indices)):
            write_row(edge_source_file, [assertion_idx, source_idx])

        if assertion['rel'] in SYMMETRIC_RELATIONS:
            features = [(0, start_idx), (0, end_idx)]
        else:
            features = [(1, start_idx), (-1, end_idx)]

        for direction, node_idx in features:
            write_row(feature_file, [rel_idx, direction, node_idx, assertion_idx])

    edge_file.close()
    edge_source_file.close()
    node_prefix_file.close()
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
Example #16
def assertions_to_sql_csv(msgpack_filename, output_dir):
    """
    Scan through the list of assertions (edges that are unique in their
    start, end, and relation) and produce CSV files that can be loaded
    into PostgreSQL tables.

    The columns of these CSV files are unlabeled, but they correspond
    to the order of the table columns defined in schema.py.
    """
    # Construct the filenames of the CSV files, one per table
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_features = output_dir + '/edge_features.csv'
    output_edges_gin = output_dir + '/edges_gin.csv'

    # We can't rely on Postgres to assign IDs, because we need to know the
    # IDs to refer to them _before_ they're in Postgres. So we track our own
    # unique IDs using OrderedSet.
    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()

    # These are three files that we will write incrementally as we iterate
    # through the edges. The syntax restrictions on 'with' leave me with no
    # way to format this that satisfies my style checker and auto-formatter.
    with open(output_edges, 'w', encoding='utf-8') as edge_file,\
         open(output_edges_gin, 'w', encoding='utf-8') as edge_gin_file,\
         open(output_features, 'w', encoding='utf-8') as feature_file:
        for assertion in read_msgpack_stream(msgpack_filename):
            # Assertions are supposed to be unique. If they're not, we should
            # find out and the build should fail.
            if assertion['uri'] in assertion_list:
                raise ValueError("Duplicate assertion: {!r}".format(assertion))

            # Get unique IDs for the relation, start, and end, and the assertion
            # itself. The relation, start, and end IDs may already exist; this is
            # handled by OrderedSet.
            assertion_idx = assertion_list.add(assertion['uri'])
            rel_idx = relation_list.add(assertion['rel'])
            start_idx = node_list.add(assertion['start'])
            end_idx = node_list.add(assertion['end'])

            # Also get unique IDs for each of the sources listed as contributing
            # to this assertion.
            source_indices = []
            sources = assertion['sources']
            for source in sources:
                for sourceval in sorted(source.values()):
                    source_idx = source_list.add(sourceval)
                    source_indices.append(source_idx)

            # Write the edge data to the `edge_file`.
            jsondata = json.dumps(assertion,
                                  ensure_ascii=False,
                                  sort_keys=True)
            weight = assertion['weight']
            write_row(
                edge_file,
                [
                    assertion_idx,
                    assertion['uri'],
                    rel_idx,
                    start_idx,
                    end_idx,
                    weight,
                    jsondata,
                ],
            )

            # Convert the edge to the form that we can easily filter using GIN
            # indexing, and write that to the `edge_gin_file`.
            write_row(
                edge_gin_file,
                [
                    assertion_idx,
                    weight,
                    json.dumps(
                        gin_indexable_edge(assertion),
                        ensure_ascii=False,
                        sort_keys=True,
                    ),
                ],
            )

            # Extract the 'features' (combinations of the relation and one node)
            # that are present in the edge. We may need to match the node using
            # a prefix of that node, so store the feature separately for each
            # prefix.
            features = []

            # Get the IDs in the node table for each prefix of the nodes
            start_p_indices = [
                node_list.add(prefix)
                for prefix in uri_prefixes(assertion['start'], 3)
            ]
            end_p_indices = [
                node_list.add(prefix)
                for prefix in uri_prefixes(assertion['end'], 3)
            ]

            # Write the feature data, the 'direction' (forward, backward, or
            # symmetric), and the edge ID to the feature table.
            if assertion['rel'] in SYMMETRIC_RELATIONS:
                for start_p_idx in start_p_indices:
                    features.append((0, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((0, end_p_idx))
            else:
                for start_p_idx in start_p_indices:
                    features.append((1, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((-1, end_p_idx))

            for direction, node_idx in features:
                write_row(feature_file,
                          [rel_idx, direction, node_idx, assertion_idx])

    # Write our tables of unique IDs
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
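The helpers write_row, write_ordered_set, write_relations, and gin_indexable_edge are assumed to be defined elsewhere and are not shown in these examples. As a rough idea of what write_row has to do, here is a hedged sketch under the assumption that the CSV files are later loaded with PostgreSQL's COPY in text format, which expects tab-separated fields with backslash escaping:

def write_row(outfile, items):
    """Hypothetical sketch: write one row as a tab-separated line that
    PostgreSQL's COPY ... FROM (text format) could ingest."""
    cells = []
    for item in items:
        text = str(item)
        # Escape the characters that are significant in COPY text format.
        text = text.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n')
        cells.append(text)
    print('\t'.join(cells), file=outfile)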
Example #17
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date, we use conceptnet5.vectors now

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #18
def assertions_to_sql_csv(msgpack_filename, output_dir):
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_edge_sources = output_dir + '/edge_sources.csv'
    output_node_prefixes = output_dir + '/node_prefixes.csv'
    output_features = output_dir + '/edge_features.csv'

    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()
    seen_prefixes = set()

    edge_file = open(output_edges, 'w', encoding='utf-8')
    edge_source_file = open(output_edge_sources, 'w', encoding='utf-8')
    node_prefix_file = open(output_node_prefixes, 'w', encoding='utf-8')
    feature_file = open(output_features, 'w', encoding='utf-8')

    for assertion in read_msgpack_stream(msgpack_filename):
        if assertion['uri'] in assertion_list:
            continue
        assertion_idx = assertion_list.add(assertion['uri'])
        rel_idx = relation_list.add(assertion['rel'])
        start_idx = node_list.add(assertion['start'])
        end_idx = node_list.add(assertion['end'])

        source_indices = []
        sources = assertion['sources']
        for source in sources:
            for sourceval in sorted(source.values()):
                source_idx = source_list.add(sourceval)
                source_indices.append(source_idx)

        jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
        weight = assertion['weight']
        write_row(edge_file, [
            assertion_idx, assertion['uri'], rel_idx, start_idx, end_idx,
            weight, jsondata
        ])
        for node in (assertion['start'], assertion['end'],
                     assertion['dataset']):
            write_prefixes(node_prefix_file, seen_prefixes, node_list, node)
        for source_idx in sorted(set(source_indices)):
            write_row(edge_source_file, [assertion_idx, source_idx])

        if assertion['rel'] in SYMMETRIC_RELATIONS:
            features = [(0, start_idx), (0, end_idx)]
        else:
            features = [(1, start_idx), (-1, end_idx)]

        for direction, node_idx in features:
            write_row(feature_file,
                      [rel_idx, direction, node_idx, assertion_idx])

    edge_file.close()
    edge_source_file.close()
    node_prefix_file.close()
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
Example #19
def convert_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example #20
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/') and end_uri.startswith('/c/') and
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #21
def msgpack_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example #22
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start,
                    end=end,
                    weight=weight,
                    dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #23
def assertions_to_sql_csv(msgpack_filename, output_dir):
    """
    Scan through the list of assertions (edges that are unique in their
    start, end, and relation) and produce CSV files that can be loaded
    into PostgreSQL tables.

    The columns of these CSV files are unlabeled, but they correspond
    to the order of the table columns defined in schema.py.
    """
    # Construct the filenames of the CSV files, one per table
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_features = output_dir + '/edge_features.csv'
    output_edges_gin = output_dir + '/edges_gin.csv'

    # We can't rely on Postgres to assign IDs, because we need to know the
    # IDs to refer to them _before_ they're in Postgres. So we track our own
    # unique IDs using OrderedSet.
    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()

    # These are three files that we will write incrementally as we iterate
    # through the edges. The syntax restrictions on 'with' leave me with no
    # way to format this that satisfies my style checker and auto-formatter.
    with open(output_edges, 'w', encoding='utf-8') as edge_file,\
         open(output_edges_gin, 'w', encoding='utf-8') as edge_gin_file,\
         open(output_features, 'w', encoding='utf-8') as feature_file:
        for assertion in read_msgpack_stream(msgpack_filename):
            # Assertions are supposed to be unique. If they're not, we should
            # find out and the build should fail.
            if assertion['uri'] in assertion_list:
                raise ValueError("Duplicate assertion: {!r}".format(assertion))

            # Get unique IDs for the relation, start, and end, and the assertion
            # itself. The relation, start, and end IDs may already exist; this is
            # handled by OrderedSet.
            assertion_idx = assertion_list.add(assertion['uri'])
            rel_idx = relation_list.add(assertion['rel'])
            start_idx = node_list.add(assertion['start'])
            end_idx = node_list.add(assertion['end'])

            # Also get unique IDs for each of the sources listed as contributing
            # to this assertion.
            source_indices = []
            sources = assertion['sources']
            for source in sources:
                for sourceval in sorted(source.values()):
                    source_idx = source_list.add(sourceval)
                    source_indices.append(source_idx)

            # Write the edge data to the `edge_file`.
            jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
            weight = assertion['weight']
            write_row(
                edge_file,
                [
                    assertion_idx,
                    assertion['uri'],
                    rel_idx,
                    start_idx,
                    end_idx,
                    weight,
                    jsondata,
                ],
            )

            # Convert the edge to the form that we can easily filter using GIN
            # indexing, and write that to the `edge_gin_file`.
            write_row(
                edge_gin_file,
                [
                    assertion_idx,
                    weight,
                    json.dumps(
                        gin_indexable_edge(assertion),
                        ensure_ascii=False,
                        sort_keys=True,
                    ),
                ],
            )

            # Extract the 'features' (combinations of the relation and one node)
            # that are present in the edge. We may need to match the node using
            # a prefix of that node, so store the feature separately for each
            # prefix.
            features = []

            # Get the IDs in the node table for each prefix of the nodes
            start_p_indices = [
                node_list.add(prefix) for prefix in uri_prefixes(assertion['start'], 3)
            ]
            end_p_indices = [
                node_list.add(prefix) for prefix in uri_prefixes(assertion['end'], 3)
            ]

            # Write the feature data, the 'direction' (forward, backward, or
            # symmetric), and the edge ID to the feature table.
            if assertion['rel'] in SYMMETRIC_RELATIONS:
                for start_p_idx in start_p_indices:
                    features.append((0, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((0, end_p_idx))
            else:
                for start_p_idx in start_p_indices:
                    features.append((1, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((-1, end_p_idx))

            for direction, node_idx in features:
                write_row(feature_file, [rel_idx, direction, node_idx, assertion_idx])

    # Write our tables of unique IDs
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)