Example #1
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri('/s/contributor/omcs',
                            standardize_text(parts_dict["creator"]))
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', standardize_text(username)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources
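
A minimal usage sketch (not from the original source), assuming a parts_dict shaped the way the function reads it: a 'creator', an 'activity' name, and 'votes' as (username, vote) pairs. The values below are hypothetical, and the exact URI text depends on what standardize_text does to each name, so the output shown in comments is approximate.

parts = {
    "creator": "alice",
    "activity": "some_activity",   # hypothetical activity name
    "votes": [("alice", 1), ("bob", 1), ("carol", -1)],
}
weighted = build_sources(parts)
# Roughly:
# [(['/s/contributor/omcs/alice', '/s/activity/omcs/some_activity'], 1),
#  (['/s/contributor/omcs/bob', '/s/activity/omcs/vote'], 1),
#  (['/s/contributor/omcs/carol', '/s/activity/omcs/vote'], -1)]
# The creator's own vote is skipped, so 'alice' appears only once.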
Example #2
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri(
        '/s/contributor/omcs',
        standardize_text(parts_dict["creator"])
    )
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', standardize_text(username)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources
Example #3
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dictionaries, one for each source that
    contributed to this assertion, giving its 'contributor', the 'activity'
    it came from, an optional 'process', and a 'weight'. Later, inside the
    'make_edge' function, these become the assertion's source information.
    """
    creator_source = {}
    creator_node = join_uri("/s/contributor/omcs", standardize_username(parts_dict["creator"]))
    creator_source["contributor"] = creator_node

    activity = parts_dict["activity"]
    activity_node = join_uri("/s/activity/omcs", standardize_text(activity))
    creator_source["activity"] = activity_node

    if preposition_fix:
        creator_source["process"] = "/s/process/preposition_fix"
    creator_source["weight"] = 1.0
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            "contributor": join_uri("/s/contributor/omcs", standardize_username(username)),
            "activity": "/s/activity/omcs/vote",
            "weight": float(vote_int),
        }
        sources.append(vote_source)
    return sources
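
For the same hypothetical parts_dict as in Example #1, this variant returns one dictionary per source rather than (conjunction, weight) tuples; as before, the URI text shown depends on standardize_username and standardize_text, so it is approximate.

sources = build_sources(parts)
# Roughly:
# [{'contributor': '/s/contributor/omcs/alice',
#   'activity': '/s/activity/omcs/some_activity', 'weight': 1.0},
#  {'contributor': '/s/contributor/omcs/bob',
#   'activity': '/s/activity/omcs/vote', 'weight': 1.0},
#  {'contributor': '/s/contributor/omcs/carol',
#   'activity': '/s/activity/omcs/vote', 'weight': -1.0}]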
Example #4
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the @ sign.
    """
    name = username.strip('@').split('@')[0]
    return standardize_text(name)
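
The e-mail handling is plain string manipulation; only the final normalization depends on standardize_text, which is defined elsewhere in the codebase.

"alice@example.com".strip('@').split('@')[0]   # -> 'alice'
"@alice".strip('@').split('@')[0]              # -> 'alice' (a leading @ is stripped first)
"alice".strip('@').split('@')[0]               # -> 'alice' (plain usernames pass through)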
Example #5
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the @ sign.
    """
    name = username.strip("@").split("@")[0]
    return standardize_text(name)
Example #6
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dictionaries, one for each source that
    contributed to this assertion, giving its 'contributor', the 'activity'
    it came from, an optional 'process', and a 'weight'. Later, inside the
    'make_edge' function, these become the assertion's source information.
    """
    creator_source = {}
    creator_node = join_uri('/s/contributor/omcs',
                            standardize_username(parts_dict["creator"]))
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri('/s/contributor/omcs',
                                    standardize_username(username)),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int)
        }
        sources.append(vote_source)
    return sources
Example #7
def filtered_uri(lang, text):
    if lang == 'en':
        text = filter_stopwords(text)
        return concept_uri('en', standardize_text(text, english_filter))
    else:
        return standardized_concept_uri(lang, text)
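
A hedged sketch of the two branches: English text has stopwords removed and is standardized with english_filter before concept_uri, while any other language is handed straight to standardized_concept_uri. The outputs shown assume the usual ConceptNet /c/<lang>/... concept URI shape and a stopword list containing 'the'; both are assumptions, not taken from this snippet.

filtered_uri('en', 'the cat')   # roughly /c/en/cat, after stopword filtering
filtered_uri('fr', 'le chat')   # built by standardized_concept_uri('fr', 'le chat')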
Example #8
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name(
                        'en', name) != standardized_concept_name(
                            'en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri(
                            'en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(
                                name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri(
                                'en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(
                            umbel_edge('/r/Synonym', alt_label, main_label,
                                       surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(
            dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
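
The surface text comes from a two-slot template stored in REL_MAPPING. Here is a hedged sketch of the shape that usage implies; the entry below is illustrative, not copied from the real table.

REL_MAPPING_SKETCH = {
    # Semantic Web relation name -> (ConceptNet relation URI, surface-text template)
    'subClassOf': ('/r/IsA', '[[%s]] is a type of [[%s]]'),
}
rel_uri, frame = REL_MAPPING_SKETCH['subClassOf']
surface = frame % ('poodle', 'dog')   # -> '[[poodle]] is a type of [[dog]]'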
Example #9
def filtered_uri(lang, text):
    if lang == "en":
        text = filter_stopwords(text)
        return concept_uri("en", standardize_text(text, english_filter))
    else:
        return standardized_concept_uri(lang, text)
Example #10
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name('en', name) != standardized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri('en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
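
A self-contained illustration of the altLabel disambiguation rule used above: an alternative label only gets a disambiguation (taken from the un-camel-cased Umbel resource name) when its standardized text either labels two or more nodes or is three characters or shorter. The node names below are made up.

from collections import defaultdict

def needs_disambiguation(name_text, label_sets):
    # Mirrors the condition in run_umbel:
    # len(label_sets[name_text]) >= 2 or len(name_text) <= 3
    return len(label_sets[name_text]) >= 2 or len(name_text) <= 3

example_label_sets = defaultdict(set)
example_label_sets['bank'].update({'umbel-rc:BankBuilding', 'umbel-rc:BankOrganization'})
example_label_sets['mongoose'].add('umbel-rc:Mongoose')

needs_disambiguation('bank', example_label_sets)      # True: two competing nodes
needs_disambiguation('cat', example_label_sets)       # True: only three characters
needs_disambiguation('mongoose', example_label_sets)  # False: unique and long enough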