Example #1
def keep_concept(uri):
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    if not valid_language(get_uri_language(uri)):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
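
All of these examples rely on the same URI convention: a concept URI has the shape /c/<language>/<term>, so get_uri_language reads the language piece and keep_concept checks that the term piece (pieces[2]) is non-empty. Below is a minimal runnable sketch with stand-in helpers; the real implementations live in conceptnet5, and their exact behavior here is an assumption for illustration.

def split_uri(uri):
    # Stand-in: '/c/en/apple/n' -> ['c', 'en', 'apple', 'n']
    return uri.lstrip('/').split('/')

def get_uri_language(uri):
    # Stand-in: the language code is the second piece of a concept URI.
    return split_uri(uri)[1]

print(split_uri('/c/en/apple'))         # ['c', 'en', 'apple']
print(get_uri_language('/c/en/apple'))  # 'en'
print(bool(split_uri('/c/en/')[2]))     # False: empty term, so keep_concept drops it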
Example #2
def keep_concept(uri):
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
Example #3
def prepare_vocab_for_morphology(language, input, output):
    vocab_counts = defaultdict(int)
    for line in input:
        countstr, uri = line.strip().split(' ', 1)
        if get_uri_language(uri) == language:
            term = split_uri(uri)[2]
            if language in ATOMIC_SPACE_LANGUAGES:
                term += '_'
            vocab_counts[term] += int(countstr)

    for term, count in sorted(list(vocab_counts.items())):
        print(count, term, file=output)
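
The input is expected to contain lines of the form "<count> <uri>"; counts for the same term are merged and written back out as "<count> <term>". A runnable sketch of that transformation, using stand-ins for the conceptnet5 helpers (the ATOMIC_SPACE_LANGUAGES underscore suffix is omitted for brevity):

import io
from collections import defaultdict

def get_uri_language(uri):
    return uri.split('/')[2]             # stand-in: '/c/en/apple' -> 'en'

def split_uri(uri):
    return uri.lstrip('/').split('/')    # stand-in: '/c/en/apple' -> ['c', 'en', 'apple']

src = io.StringIO('12 /c/en/apple\n3 /c/en/apple\n5 /c/fr/pomme\n')
dst = io.StringIO()

vocab_counts = defaultdict(int)
for line in src:
    countstr, uri = line.strip().split(' ', 1)
    if get_uri_language(uri) == 'en':
        vocab_counts[split_uri(uri)[2]] += int(countstr)
for term, count in sorted(vocab_counts.items()):
    print(count, term, file=dst)

print(dst.getvalue())                    # '15 apple\n': counts merged, other languages dropped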
Example #4
def export_plain_text(table, uri_file, file_base):
    from ..vectors.query import VectorSpaceWrapper

    def vec_to_text_line(label, vec):
        cells = [label] + ['%4.4f' % val for val in vec]
        return ' '.join(cells)

    uri_main_file = gzip.open(file_base + '_uris_main.txt.gz', 'wt')
    english_main_file = gzip.open(file_base + '_en_main.txt.gz', 'wt')
    english_extra_file = gzip.open(file_base + '_en_extra.txt.gz', 'wt')
    wrap = VectorSpaceWrapper(frame=table)

    for line in open(uri_file, encoding='utf-8'):
        uri = line.strip()
        # Only term-level URIs (no sense tag) in common languages.
        if uri.count('/') == 3 and get_uri_language(uri) in COMMON_LANGUAGES:
            if uri in table.index:
                vec = table.loc[uri].values
                print(vec_to_text_line(uri, vec), file=uri_main_file)
            else:
                # Out-of-vocabulary terms: only single English words get a
                # fallback lookup through the vector-space wrapper.
                if not uri.startswith('/c/en') or '_' in uri:
                    continue
                vec = wrap.get_vector(uri)

            # Skip terms whose vector is all zeros.
            if vec.dot(vec) == 0:
                continue

            if uri.startswith('/c/en/'):
                label = uri[6:]
                if uri in table.index:
                    print(vec_to_text_line(label, vec), file=english_main_file)
                else:
                    print(vec_to_text_line(label, vec),
                          file=english_extra_file)

    uri_main_file.close()
    english_main_file.close()
    english_extra_file.close()
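
The helper vec_to_text_line emits one labeled vector per line, space-separated with four decimal places, which matches the plain-text layout that word2vec-style tools read. A quick illustration (the label and values are made up):

def vec_to_text_line(label, vec):
    cells = [label] + ['%4.4f' % val for val in vec]
    return ' '.join(cells)

print(vec_to_text_line('/c/en/apple', [0.25, -0.1, 0.5]))
# /c/en/apple 0.2500 -0.1000 0.5000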
Example #5
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations with ConceptNet by
    applying dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date, we use conceptnet5.vectors now

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
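
The loop over `prefixed` adds one extra /r/SenseOf edge from any sense-tagged URI (more than three pieces) to its three-piece prefix, at most once per (uri, dataset) pair. A small sketch of the prefix computation, with stand-ins for conceptnet5's split_uri and join_uri (an assumption about their behavior):

def split_uri(uri):
    return uri.lstrip('/').split('/')    # '/c/en/apple/n' -> ['c', 'en', 'apple', 'n']

def join_uri(*pieces):
    return '/' + '/'.join(pieces)

pieces = split_uri('/c/en/apple/n')
if len(pieces) > 3:                      # the URI carries a sense tag
    print(join_uri(*pieces[:3]))         # /c/en/apple -> gets a /r/SenseOf edge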
Example #6
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations with ConceptNet by
    applying dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/') and end_uri.startswith('/c/') and
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
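
As the docstring notes, the only special-cased relations are Desires and NotDesires with "person" as the start concept; every other assertion passes through unchanged. A runnable sketch of the remapping, extracted from the English branches above:

def remap_pair(start_uri, end_uri, rel):
    # Literal constants taken from the function above (English case only).
    if start_uri == '/c/en/person' and rel == '/r/Desires':
        return ('/c/en/good', end_uri)
    if start_uri == '/c/en/person' and rel == '/r/NotDesires':
        return ('/c/en/bad', end_uri)
    return (start_uri, end_uri)

print(remap_pair('/c/en/person', '/c/en/pizza', '/r/Desires'))   # ('/c/en/good', '/c/en/pizza')
print(remap_pair('/c/en/dog', '/c/en/bark', '/r/CapableOf'))     # unchanged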