Example 1
def prepare_db(inputs, dbfile):
    """
    Build a SQLite database that extracts some information from our parsed
    versions of Wiktionary. This is information that is needed by later reader
    steps, such as which words are known in which languages, and which words
    are forms of other words.
    """
    # If the database already exists, delete it first
    try:
        os.unlink(dbfile)
    except FileNotFoundError:
        pass

    db = sqlite3.connect(dbfile)
    make_tables(db)
    try:
        for filename in inputs:
            filepath = pathlib.Path(filename)
            file_language = filepath.name.split('.')[0]
            for item in read_json_stream(filename):
                if 'rel' in item:
                    tfrom = item['from']
                    tto = item['to']
                    # For all non-definition relations, record the fact that
                    # the given entry name exists in the given language. We'll
                    # use these to disambiguate definitions later.
                    if item['rel'] != 'definition':
                        if 'language' in tfrom and valid_language(tfrom['language']):
                            add_title(
                                db, file_language, tfrom['language'], tfrom['text']
                            )
                        if 'language' in tto and valid_language(tto['language']):
                            add_title(db, file_language, tto['language'], tto['text'])

                    # Record word forms so we can build a lemmatizer from them.
                    if item['rel'].startswith('form/'):
                        form_name = item['rel'][5:]
                        # Look for the part of speech, first in the 'from' term,
                        # then in the 'to' term.
                        pos = tfrom.get('pos', tto.get('pos', '?'))

                        # Use only Etymology 1 entries for learning word forms.
                        if (tfrom.get('etym') or '1') == '1':
                            language = tfrom.get('language', tto.get('language'))
                            if (
                                valid_language(language)
                                and tfrom['text'] != tto['text']
                            ):
                                add_form(
                                    db,
                                    file_language,
                                    language,
                                    tfrom['text'],
                                    pos,
                                    tto['text'],
                                    form_name,
                                )
            db.commit()
    finally:
        db.close()
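
Every example on this page consumes read_json_stream, which yields one decoded object per line of a JSON-lines file. The real helper lives in the ConceptNet codebase and is not shown here; purely as a sketch, assuming plain-text input with one JSON object per non-blank line, a compatible reader could look like this:

import json

def read_json_stream_sketch(input_filename):
    # Hypothetical stand-in for read_json_stream: decode one JSON object
    # per non-blank line of the input file and yield it.
    with open(input_filename, encoding='utf-8') as stream:
        for line in stream:
            line = line.strip()
            if line:
                yield json.loads(line)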
Example 2
def convert_to_assoc(input_filename, output_filename):
    """
    Convert a JSON stream to a tab-separated "CSV" of concept-to-concept associations.

    The relation is mostly ignored, except:

    - Negative relations create associations between concepts suffixed with '/neg'
    - An assertion that means "People want X" in English or Chinese is converted to
      an assertion between X and "good", and also X and the negation of "bad"
    - Combining both of these, an assertion that "People don't want X" moves the
      negation so that X is associated with "not good" and "bad".

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.
    
    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    
    for info in read_json_stream(input_filename):
        startc = reduce_concept(info['start'])
        endc = reduce_concept(info['end'])
        rel = info['rel']
        weight = info['weight']

        if 'dbpedia' in info['sources'] and '/or/' not in info['sources']:
            # DBPedia associations are still too numerous and too weird to
            # associate.
            continue

        pairs = []
        if startc == '/c/en/person':
            if rel == '/r/Desires':
                pairs = [('/c/en/good', endc), ('/c/en/bad/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/en/bad', endc), ('/c/en/good/neg', endc)]
            else:
                pairs = [(startc, endc)]
        elif startc == '/c/zh/人':
            if rel == '/r/Desires':
                pairs = [('/c/zh/良好', endc), ('/c/zh/不良/neg', endc)]
            elif rel == '/r/NotDesires':
                pairs = [('/c/zh/良好/neg', endc), ('/c/zh/不良', endc)]
            else:
                pairs = [(startc, endc)]
        else:
            negated = (rel.startswith('/r/Not') or rel.startswith('/r/Antonym'))
            if not negated:
                pairs = [(startc, endc)]
            else:
                pairs = [(startc, endc + '/neg'), (startc + '/neg', endc)]

        for (start, end) in pairs:
            line = "%(start)s\t%(end)s\t%(weight)s" % {
                'start': start,
                'end': end,
                'weight': weight,
            }
            print(line, file=out_stream)
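
As a concrete illustration of the pairing rules above (the URIs and weight are made up, and reduce_concept is assumed to leave such short URIs unchanged), an input assertion meaning "people do not desire pain" expands into two output lines.

Input object (one line of the JSON stream):

    {"start": "/c/en/person", "end": "/c/en/pain", "rel": "/r/NotDesires", "weight": 1.0, "sources": "/s/contributor/omcs/test"}

Lines written by convert_to_assoc (tab-separated):

    /c/en/bad	/c/en/pain	1.0
    /c/en/good/neg	/c/en/pain	1.0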
Example 3
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
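
MsgpackStreamWriter is defined elsewhere in the ConceptNet codebase. A rough stand-in, assuming the msgpack package and a stream of packed objects written back to back into a binary file, might be:

import msgpack

class MsgpackStreamWriterSketch:
    # Hypothetical equivalent of MsgpackStreamWriter: pack each object and
    # append the bytes to a binary output file.
    def __init__(self, filename):
        self.stream = open(filename, 'wb')
        self.packer = msgpack.Packer(use_bin_type=True)

    def write(self, obj):
        self.stream.write(self.packer.pack(obj))

    def close(self):
        self.stream.close()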
Example 4
def json_to_csv(input_filename, output_filename):
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    for info in read_json_stream(input_filename):
        if info.get('surfaceText') is None:
            info['surfaceText'] = ''
        if info.get('context') is None:
            info['context'] = ''
        info['weight'] = str(info['weight'])
        columns = ['uri', 'rel', 'start', 'end', 'weight', 'source_uri', 'id', 'dataset', 'surfaceText']
        column_values = [info.get(col) for col in columns]
        line = '\t'.join(column_values)
        assert '\n' not in line
        print(line, file=out_stream)
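
To load such a file back into dictionaries, a reader only needs to reverse the join; this is a sketch, with a column list that mirrors the one used above (note that 'context' is set but not written by this version):

import csv

CSV_COLUMNS = ['uri', 'rel', 'start', 'end', 'weight', 'source_uri', 'id',
               'dataset', 'surfaceText']

def read_csv_rows(csv_filename):
    # Turn each tab-separated line back into a dict keyed by column name.
    # QUOTE_NONE keeps the reader from treating quote characters specially.
    with open(csv_filename, encoding='utf-8') as stream:
        for row in csv.reader(stream, delimiter='\t', quoting=csv.QUOTE_NONE):
            yield dict(zip(CSV_COLUMNS, row))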
Example 5
def convert_to_tab_separated(input_filename, output_filename):
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    for info in read_json_stream(input_filename):
        if info['surfaceText'] is None:
            info['surfaceText'] = ''
        info['weight'] = str(info['weight'])
        columns = [
            'uri', 'rel', 'start', 'end', 'context', 'weight', 'source_uri',
            'id', 'dataset', 'surfaceText'
        ]
        column_values = [info.get(col) for col in columns]
        line = '\t'.join(column_values)
        print(line, file=out_stream)
Example 6
def test_msgpack_to_json():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')
        
        # Write the test data as a msgpack stream, convert it to a JSON
        # stream, and check that reading it back yields the original items.
        writer = MsgpackStreamWriter(msgpack_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        msgpack_to_json(msgpack_path, json_path)
        reader = read_json_stream(json_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
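
msgpack_to_json, the function under test, belongs to the same conversion module and is not shown on this page. A minimal sketch of what it does, assuming the msgpack package and one JSON object per output line, would be:

import json
import msgpack

def msgpack_to_json_sketch(input_filename, output_filename):
    # Unpack each msgpack object and write it back out as one JSON line.
    with open(input_filename, 'rb') as instream, \
         open(output_filename, 'w', encoding='utf-8') as outstream:
        for obj in msgpack.Unpacker(instream, raw=False):
            print(json.dumps(obj, ensure_ascii=False), file=outstream)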
Example 7
def json_to_unique_csv(input_filename, output_filename):
    out_stream = codecs.open(output_filename, 'w', encoding='utf-8')
    cache = set()
    cached = itemgetter('uri', 'source_uri', 'dataset', 'weight')
    for info in read_json_stream(input_filename):
        if info.get('surfaceText') is None:
            info['surfaceText'] = ''
        if info.get('context') is None:
            info['context'] = ''
        info['weight'] = str(info['weight'])
        cached_item = ' '.join(cached(info))
        if cached_item not in cache:
            cache.add(cached_item)
            columns = ['uri', 'rel', 'start', 'end', 'context', 'weight', 'source_uri', 'id', 'dataset', 'surfaceText']
            column_values = [info.get(col) for col in columns]
            line = '\t'.join(column_values)
            assert '\n' not in line
            print(line, file=out_stream)
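
The deduplication key comes from operator.itemgetter, which returns a tuple of the selected fields; joining that tuple with spaces only works because 'weight' has already been converted to a string. A small illustration with made-up values:

from operator import itemgetter

cached = itemgetter('uri', 'source_uri', 'dataset', 'weight')
info = {'uri': '/a/example', 'source_uri': '/s/example',
        'dataset': '/d/conceptnet/4/en', 'weight': '1.0'}
print(' '.join(cached(info)))
# prints: /a/example /s/example /d/conceptnet/4/en 1.0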
Example 8
def segmented_stream(input_file):
    """
    Read a JSON stream delimited by 'heading' entries, marking where the parser
    started parsing a new page. We distinguish these entries by the fact that
    they contain a 'title' key.

    Yield tuples of (heading, [items]), where [items] are the stream items
    that appear under the given heading.
    """
    heading = None
    items = []
    for item in read_json_stream(input_file):
        if 'title' in item:
            if heading is not None:
                yield heading, items
            heading = item
            # Start a fresh list rather than clearing in place, so the list
            # we just yielded is not mutated out from under the consumer.
            items = []
        else:
            items.append(item)
    if heading is not None:
        yield heading, items
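
A consumer iterates over the (heading, items) pairs as the generator produces them. For instance, to count how many stream items fall under each page title (a usage sketch, assuming the heading entries carry a 'title' key as described above):

def count_items_per_page(input_file):
    # Usage sketch: tally the number of items that appear under each heading.
    counts = {}
    for heading, items in segmented_stream(input_file):
        counts[heading['title']] = len(items)
    return counts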
Example 9
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    a single object with many fields, where each field's value is one of the
    objects to be added and each field's key must be "add".

    Having many values with the same key is incompatible with Python
    dictionaries, but is technically allowed by the JSON grammar. To create the
    output JSON file in Python, we have to write its components incrementally.
    """
    out = codecs.open(output_filename, "w", encoding="utf-8")

    print("{", file=out)
    for info in read_json_stream(input_filename):
        boost = info["weight"]

        # Handle searchable lemmas
        info["relLemmas"] = ""
        info["startLemmas"] = " ".join(uri_to_lemmas(info["start"]))
        info["endLemmas"] = " ".join(uri_to_lemmas(info["end"]))

        if boost > 0:
            if "surfaceText" in info and info["surfaceText"] is None:
                del info["surfaceText"]

            solr_struct = {"doc": info, "boost": boost}
            solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
            print(solr_fragment, file=out)
    print('\t"commit": {}', file=out)
    print("}", file=out)
Example 10
def transform_file(self, input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for obj in read_json_stream(input_filename):
        for new_obj in self.handle_assertion(obj):
            out.write(new_obj)
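
transform_file is a method of a reader class whose handle_assertion generates zero or more output objects for each input object. Purely as a hypothetical illustration of that contract (not an actual ConceptNet reader), a class using it might filter out low-weight assertions like this:

class LowWeightFilterSketch:
    # Hypothetical reader: handle_assertion may yield zero or more objects
    # per input object; transform_file (above) writes whatever it yields.
    def __init__(self, min_weight=1.0):
        self.min_weight = min_weight

    def handle_assertion(self, obj):
        if obj.get('weight', 0) >= self.min_weight:
            yield obj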