Example #1
0
def choose_small_vocabulary(big_frame, concepts_filename, language):
    """
    Choose the vocabulary of the small frame, by eliminating the terms which:
     - contain more than one word
     - are not in ConceptNet
     - are not frequent
    """
    with open(concepts_filename) as concepts_file:
        concepts = set(line.strip() for line in concepts_file)
    vocab = []
    for term in big_frame.index:
        if '_' not in term and term in concepts:
            try:
                frequency = word_frequency(uri_to_label(term),
                                           language,
                                           wordlist='large')
            except LookupError:
                frequency = word_frequency(uri_to_label(term),
                                           language,
                                           wordlist='combined')
            vocab.append((term, frequency))
    small_vocab = [
        term for term, frequency in sorted(
            vocab, key=lambda x: x[1], reverse=True)[:50000]
    ]
    return small_vocab
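
A minimal usage sketch (an illustration, not part of the original code): the file name, DataFrame variable, and language code below are hypothetical.

# Hypothetical inputs: `big_frame` is a DataFrame whose index holds
# ConceptNet URIs such as '/c/en/example', and 'concepts_en.txt' lists
# one concept URI per line.
small_vocab = choose_small_vocabulary(big_frame, 'concepts_en.txt', 'en')
print(len(small_vocab))  # at most 50,000 single-word, frequent terms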
Example #2
0
def _get_concepts(uri_word, conceptnet_dict=None):
    # Duplicates are tolerated; we simply track every node seen so far.
    associated_nodes = []
    normalized_word = uri_to_label(uri_word)
    associated_nodes.append(normalized_word)
    if conceptnet_dict is None:
        try:
            obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                             uri_word)).json()
            edges = obj["edges"]
        except Exception:
            print("Request error")
            edges = []

    else:
        if normalized_word not in conceptnet_dict:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
                conceptnet_dict[normalized_word] = edges
            except Exception:
                print("Request error")
                edges = []
        else:
            edges = conceptnet_dict[normalized_word]

    at_least_one_edge = False
    for edge in edges:
        start_node = edge["start"]["term"]
        end_node = edge["end"]["term"]
        rel = edge["rel"]["label"]
        normalized_start_word = uri_to_label(
            start_node)  # uri_to_label("/c/en/movies") -> "movies"
        normalized_end_word = uri_to_label(end_node)

        if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                or edge["start"]["language"] != "en"
                or edge["end"]["language"] != "en"
                or (normalized_start_word in associated_nodes
                    and normalized_end_word in associated_nodes)):
            continue
        at_least_one_edge = True

        if normalized_start_word not in associated_nodes:
            associated_nodes.append(normalized_start_word)

        if normalized_end_word not in associated_nodes:
            associated_nodes.append(normalized_end_word)
    return associated_nodes
Example #3
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
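
For illustration (a sketch; the exact values depend on the uri_to_label, uri_prefix, and get_uri_language helpers), a term URI with a part-of-speech component produces a dictionary along these lines:

node = ld_node('/c/en/example/n')
# Roughly: {'@id': '/c/en/example/n', 'label': 'example', 'language': 'en',
#           'sense_label': 'n', 'term': '/c/en/example', '@type': 'Node'}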
Example #4
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #5
0
def get_vector(frame, label, language=None):
    """
    Returns the row of a vector-space DataFrame `frame` corresponding
    to the text `label`. If `language` is set, this can take in plain text
    and normalize it to ConceptNet form. Either way, it can also take in
    a label that is already in ConceptNet form.
    """
    if frame.index[1].startswith('/c/'):  # This frame has URIs in its index
        if not label.startswith('/'):
            label = standardized_uri(language, label)
        try:
            return frame.loc[label]
        except KeyError:
            # Return a vector of all NaNs
            return pd.Series(index=frame.columns)
    else:
        if label.startswith('/'):
            label = uri_to_label(label)
        try:
            return frame.loc[replace_numbers(label)]
        except KeyError:
            # Return a vector of all NaNs
            return pd.Series(index=frame.columns)
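
A usage sketch, assuming `embeddings` is a pandas DataFrame of term vectors; the variable and term names here are hypothetical.

vec_uri = get_vector(embeddings, '/c/en/example')            # lookup by URI
vec_txt = get_vector(embeddings, 'example', language='en')   # lookup by plain text
missing = get_vector(embeddings, 'no_such_term', language='en')
missing.isna().all()  # True, assuming the term is absent from the frame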
Example #6
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #7
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #8
0
def generate_graph(toks,
                   postags,
                   nertags,
                   conceptnet_dict=None,
                   verbose=False):
    SEED = 1234
    random.seed(SEED)

    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []

    concepts = []
    toks_len = len(toks)

    i = 0
    while i < toks_len:
        # Adds entities (n_grams)
        if nertags[i] in ENTITY_BEGINNING:
            ngram_list = [toks[i].text]
            j = i + 1
            while j < toks_len and nertags[j] in ENTITY_INSIDE:
                ngram_list.append(toks[j].text)
                j += 1
            ngram = ' '.join(ngram_list)
            concepts.append(ngram)
            i = j

        # Adds tokens that qualify as concepts under the part-of-speech conditions
        elif postags[i] in POS_CONCEPTS and not toks[i].is_stop:
            concepts.append(toks[i].lemma_)  # the raw text could be used instead of the lemma
            i += 1
        else:
            i += 1

    concepts_uri = [
        standardized_concept_uri('en', ngram) for ngram in concepts
    ]
    unique_concepts_uri = list(OrderedDict.fromkeys(concepts_uri))

    if verbose:
        print("")
        print("Preprocessing generate_graph")
        print("concepts_uri:", concepts_uri)
        print("unique_concepts_uri:", unique_concepts_uri)
        print("")

    for uri_word in tqdm(unique_concepts_uri, total=len(unique_concepts_uri)):
        normalized_word = uri_to_label(uri_word)
        primary_vocab_list.append(normalized_word)

        if normalized_word not in total_vocab_list:
            total_vocab_list.append(normalized_word)

        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []

        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format(
                        "http://api.conceptnet.io", uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]

        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            normalized_start_word = uri_to_label(
                start_node)  # uri_to_label("/c/en/movies") -> "movies"
            normalized_end_word = uri_to_label(end_node)

            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in total_vocab_list
                        and normalized_end_word in total_vocab_list)):
                continue
            at_least_one_edge = True

            if normalized_start_word not in total_vocab_list:
                total_vocab_list.append(normalized_start_word)

            if normalized_end_word not in total_vocab_list:
                total_vocab_list.append(normalized_end_word)

            normalized_start_word_idx = total_vocab_list.index(
                normalized_start_word)
            normalized_end_word_idx = total_vocab_list.index(
                normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))

    total_vocab_list = [
        concept.replace(" ", "_") for concept in total_vocab_list
    ]

    return (total_vocab_list, edges_list)
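
A sketch of one way to call this function, assuming spaCy-style tokens (objects with .text, .lemma_, and .is_stop) and parallel POS/NER tag lists; the model name and the exact tag format expected by ENTITY_BEGINNING / ENTITY_INSIDE are assumptions here.

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model name
doc = nlp('Alice watched a movie in Paris.')
toks = list(doc)
postags = [tok.pos_ for tok in toks]      # e.g. 'PROPN', 'NOUN'
nertags = [tok.ent_iob_ for tok in toks]  # e.g. 'B', 'I', 'O' (assumed format)
vocab, edges = generate_graph(toks, postags, nertags, conceptnet_dict={})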
Example #9
0
def generate_graph_for_concept_list(concepts,
                                    conceptnet_dict=None,
                                    verbose=False):
    """Given a list of concepts, and not a sequence, build the KG from 
    ConceptNet based on this concept list."""
    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []
    concepts_uri = [
        standardized_concept_uri('en', ngram) for ngram in concepts
    ]
    unique_concepts_uri = list(OrderedDict.fromkeys(concepts_uri))
    for uri_word in tqdm(unique_concepts_uri, total=len(unique_concepts_uri)):
        normalized_word = uri_to_label(uri_word)
        primary_vocab_list.append(normalized_word)

        if normalized_word not in total_vocab_list:
            total_vocab_list.append(normalized_word)

        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []

        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format(
                        "http://api.conceptnet.io", uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]

        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            normalized_start_word = uri_to_label(
                start_node)  # uri_to_label("/c/en/movies") -> "movies"
            normalized_end_word = uri_to_label(end_node)

            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in total_vocab_list
                        and normalized_end_word in total_vocab_list)):
                continue
            at_least_one_edge = True

            if normalized_start_word not in total_vocab_list:
                total_vocab_list.append(normalized_start_word)

            if normalized_end_word not in total_vocab_list:
                total_vocab_list.append(normalized_end_word)

            normalized_start_word_idx = total_vocab_list.index(
                normalized_start_word)
            normalized_end_word_idx = total_vocab_list.index(
                normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))

    total_vocab_list = [
        concept.replace(" ", "_") for concept in total_vocab_list
    ]

    return (total_vocab_list, edges_list)
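
A minimal sketch of calling this variant; the concept list and the cache dict are hypothetical, and the relation indices in edges_list point into whatever RELATIONS list the module defines.

cache = {}  # reused across calls to avoid repeated ConceptNet API requests
vocab, edges = generate_graph_for_concept_list(['movie', 'popcorn'],
                                               conceptnet_dict=cache)
# vocab:  concept labels, with spaces replaced by underscores
# edges:  ((start_idx, end_idx), relation_idx) pairs indexing vocab and RELATIONS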
Example #10
0
def generate_graph_and_align(toks,
                             postags,
                             nertags,
                             conceptnet_dict=None,
                             verbose=False):
    def _get_concepts(uri_word,
                      associated_nodes=None,
                      edges_list=None,
                      conceptnet_dict=None):
        # Duplicates are tolerated; we simply track every node seen so far.
        # Avoid mutable default arguments so repeated calls don't share state.
        if associated_nodes is None:
            associated_nodes = []
        if edges_list is None:
            edges_list = []
        normalized_word = uri_to_label(uri_word)
        associated_nodes.append(normalized_word)
        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []

        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format(
                        "http://api.conceptnet.io", uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]

        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            normalized_start_word = uri_to_label(
                start_node)  # uri_to_label("/c/en/movies") -> "movies"
            normalized_end_word = uri_to_label(end_node)

            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in associated_nodes
                        and normalized_end_word in associated_nodes)):
                continue
            at_least_one_edge = True

            if normalized_start_word not in associated_nodes:
                associated_nodes.append(normalized_start_word)

            if normalized_end_word not in associated_nodes:
                associated_nodes.append(normalized_end_word)

            normalized_start_word_idx = associated_nodes.index(
                normalized_start_word)
            normalized_end_word_idx = associated_nodes.index(
                normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))

        return associated_nodes, edges_list

    SEED = 1234
    random.seed(SEED)

    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []

    SEQ_LIST = []
    concepts = []
    toks_len = len(toks)

    i = 0
    while i < toks_len:
        # Adds entities (n_grams)
        if nertags[i] in ENTITY_BEGINNING:
            ngram_list = [toks[i].text]
            j = i + 1
            while j < toks_len and nertags[j] in ENTITY_INSIDE:
                ngram_list.append(toks[j].text)
                j += 1
            ngram = ' '.join(ngram_list)
            concepts.append(ngram)
            uri_word = standardized_concept_uri('en', ngram)
            norm_word = uri_to_label(uri_word)
            if norm_word in total_vocab_list:
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            else:
                total_vocab_list, edges_list = _get_concepts(
                    uri_word, total_vocab_list, edges_list, conceptnet_dict)
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            i = j

        # Adds tokens that qualify as concepts under the part-of-speech conditions
        elif postags[i] in POS_CONCEPTS and not toks[i].is_stop:
            concepts.append(toks[i].lemma_)  # the raw text could be used instead of the lemma
            lemma = toks[i].lemma_
            uri_word = standardized_concept_uri('en', lemma)
            norm_word = uri_to_label(uri_word)
            if norm_word in total_vocab_list:
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            else:
                total_vocab_list, edges_list = _get_concepts(
                    uri_word, total_vocab_list, edges_list, conceptnet_dict)
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            i += 1
        else:
            i += 1

    return (total_vocab_list, edges_list), SEQ_LIST