def choose_small_vocabulary(big_frame, concepts_filename, language):
    """
    Choose the vocabulary of the small frame, by eliminating the terms which:
      - contain more than one word
      - are not in ConceptNet
      - are not frequent
    """
    concepts = set(line.strip() for line in open(concepts_filename))
    vocab = []
    for term in big_frame.index:
        if '_' not in term and term in concepts:
            try:
                frequency = word_frequency(uri_to_label(term), language,
                                           wordlist='large')
            except LookupError:
                frequency = word_frequency(uri_to_label(term), language,
                                           wordlist='combined')
            vocab.append((term, frequency))
    small_vocab = [
        term for term, frequency in sorted(
            vocab, key=lambda x: x[1], reverse=True)[:50000]
    ]
    return small_vocab
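# A minimal usage sketch for choose_small_vocabulary, assuming `big_frame` is a
# pandas DataFrame of term vectors indexed by ConceptNet URIs and that the
# concepts file lists one URI per line. The file names below are hypothetical
# placeholders, not paths from the original project.
def _example_choose_small_vocabulary():
    import pandas as pd
    big_frame = pd.read_hdf('term_vectors.h5', 'mat')  # hypothetical export
    small_vocab = choose_small_vocabulary(big_frame, 'concepts.txt', 'en')
    print(len(small_vocab), small_vocab[:5])
    return small_vocab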
def _get_concepts(uri_word, conceptnet_dict=None):
    # What about duplicates? We don't care; we just keep track of them.
    associated_nodes = []
    normalized_word = uri_to_label(uri_word)
    associated_nodes.append(normalized_word)
    if conceptnet_dict is None:
        try:
            obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                             uri_word)).json()
            edges = obj["edges"]
        except Exception:
            print("Request error")
            edges = []
    else:
        if normalized_word not in conceptnet_dict:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
                conceptnet_dict[normalized_word] = edges
            except Exception:
                print("Request error")
                edges = []
        else:
            edges = conceptnet_dict[normalized_word]
    at_least_one_edge = False
    for edge in edges:
        start_node = edge["start"]["term"]
        end_node = edge["end"]["term"]
        rel = edge["rel"]["label"]
        # uri_to_label("/c/en/movies") -> "movies"
        normalized_start_word = uri_to_label(start_node)
        normalized_end_word = uri_to_label(end_node)
        if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                or edge["start"]["language"] != "en"
                or edge["end"]["language"] != "en"
                or (normalized_start_word in associated_nodes
                    and normalized_end_word in associated_nodes)):
            continue
        at_least_one_edge = True
        if normalized_start_word not in associated_nodes:
            associated_nodes.append(normalized_start_word)
        if normalized_end_word not in associated_nodes:
            associated_nodes.append(normalized_end_word)
    return associated_nodes
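# A small sketch of calling _get_concepts. It queries the public ConceptNet API
# (http://api.conceptnet.io), so it needs network access; the URI below is only
# illustrative, and the cache dict is reused so repeated lookups skip the API.
def _example_get_concepts():
    cache = {}
    nodes = _get_concepts('/c/en/movie', conceptnet_dict=cache)
    # nodes[0] is the query label itself; the rest are neighbours kept by the
    # RELATIONS / DISCARDED_RELATIONS filters.
    return nodes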
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
def get_vector(frame, label, language=None):
    """
    Returns the row of a vector-space DataFrame `frame` corresponding to
    the term `label`. If `language` is set, this can take in plain text and
    normalize it to ConceptNet form. Either way, it can also take in a label
    that is already in ConceptNet form.
    """
    if frame.index[1].startswith('/c/'):
        # This frame has URIs in its index
        if not label.startswith('/'):
            label = standardized_uri(language, label)
        try:
            return frame.loc[label]
        except KeyError:
            return pd.Series(index=frame.columns)
    else:
        if label.startswith('/'):
            label = uri_to_label(label)
        try:
            return frame.loc[replace_numbers(label)]
        except KeyError:
            # Return a vector of all NaNs
            return pd.Series(index=frame.columns)
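# A minimal sketch of get_vector on a toy URI-indexed frame. The index entries
# and vector values are made up; the sketch assumes the module-level helpers
# (standardized_uri, uri_to_label, replace_numbers) and pandas are available.
def _example_get_vector():
    import pandas as pd
    frame = pd.DataFrame(
        [[0.1, 0.2], [0.3, 0.4]],
        index=['/c/en/cat', '/c/en/dog'],
        columns=[0, 1],
    )
    vec = get_vector(frame, 'cat', language='en')            # row for '/c/en/cat'
    missing = get_vector(frame, 'platypus', language='en')   # all-NaN Series
    return vec, missing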
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]
            if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
                ld['sense_label'] += ', ' + pieces[-1]
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]
            if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
                ld['sense_label'] += ', ' + pieces[-1]
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
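# A small illustration of ld_node output for the variant above, assuming the
# conceptnet5 helpers (uri_to_label, is_term, split_uri, get_uri_language,
# uri_prefix) behave as in the library. The exact values are indicative only.
def _example_ld_node():
    node = ld_node('/c/en/example/n')
    # Roughly: {'@id': '/c/en/example/n', 'label': 'example', 'language': 'en',
    #           'sense_label': 'n', 'term': '/c/en/example', '@type': 'Node'}
    return node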
def generate_graph(toks, postags, nertags, conceptnet_dict=None, verbose=False):
    SEED = 1234
    random.seed(SEED)
    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []
    concepts = []
    toks_len = len(toks)
    i = 0
    while i < toks_len:
        # Adds entities (n_grams)
        if nertags[i] in ENTITY_BEGINNING:
            ngram_list = [toks[i].text]
            j = i + 1
            while j < toks_len and nertags[j] in ENTITY_INSIDE:
                ngram_list.append(toks[j].text)
                j += 1
            ngram = ' '.join(ngram_list)
            concepts.append(ngram)
            i = j
        # Adds tokens that are considered as concepts under the Part Of Speech conditions
        elif postags[i] in POS_CONCEPTS and not toks[i].is_stop:
            concepts.append(toks[i].lemma_)  # We may switch from the lemma to the raw text here.
            i += 1
        else:
            i += 1
    concepts_uri = [
        standardized_concept_uri('en', ngram) for ngram in concepts
    ]
    unique_concepts_uri = list(OrderedDict.fromkeys(concepts_uri))
    if verbose:
        print("")
        print("Preprocessing generate_graph")
        print("concepts_uri:", concepts_uri)
        print("unique_concepts_uri:", unique_concepts_uri)
        print("")
    for uri_word in tqdm(unique_concepts_uri, total=len(unique_concepts_uri)):
        normalized_word = uri_to_label(uri_word)
        primary_vocab_list.append(normalized_word)
        if normalized_word not in total_vocab_list:
            total_vocab_list.append(normalized_word)
        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []
        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                     uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]
        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            # uri_to_label("/c/en/movies") -> "movies"
            normalized_start_word = uri_to_label(start_node)
            normalized_end_word = uri_to_label(end_node)
            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in total_vocab_list
                        and normalized_end_word in total_vocab_list)):
                continue
            at_least_one_edge = True
            if normalized_start_word not in total_vocab_list:
                total_vocab_list.append(normalized_start_word)
            if normalized_end_word not in total_vocab_list:
                total_vocab_list.append(normalized_end_word)
            normalized_start_word_idx = total_vocab_list.index(normalized_start_word)
            normalized_end_word_idx = total_vocab_list.index(normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))
    total_vocab_list = [
        concept.replace(" ", "_") for concept in total_vocab_list
    ]
    return (total_vocab_list, edges_list)
def generate_graph_for_concept_list(concepts, conceptnet_dict=None, verbose=False):
    """Given a list of concepts, and not a sequence, build the KG from
    ConceptNet based on this concept list."""
    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []
    concepts_uri = [
        standardized_concept_uri('en', ngram) for ngram in concepts
    ]
    unique_concepts_uri = list(OrderedDict.fromkeys(concepts_uri))
    for uri_word in tqdm(unique_concepts_uri, total=len(unique_concepts_uri)):
        normalized_word = uri_to_label(uri_word)
        primary_vocab_list.append(normalized_word)
        if normalized_word not in total_vocab_list:
            total_vocab_list.append(normalized_word)
        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []
        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                     uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]
        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            # uri_to_label("/c/en/movies") -> "movies"
            normalized_start_word = uri_to_label(start_node)
            normalized_end_word = uri_to_label(end_node)
            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in total_vocab_list
                        and normalized_end_word in total_vocab_list)):
                continue
            at_least_one_edge = True
            if normalized_start_word not in total_vocab_list:
                total_vocab_list.append(normalized_start_word)
            if normalized_end_word not in total_vocab_list:
                total_vocab_list.append(normalized_end_word)
            normalized_start_word_idx = total_vocab_list.index(normalized_start_word)
            normalized_end_word_idx = total_vocab_list.index(normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))
    total_vocab_list = [
        concept.replace(" ", "_") for concept in total_vocab_list
    ]
    return (total_vocab_list, edges_list)
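# A hedged usage sketch for generate_graph_for_concept_list. It hits the public
# ConceptNet API, so it needs network access; the concept list is illustrative,
# and the shared cache dict avoids re-querying repeated concepts.
def _example_concept_graph():
    cache = {}
    vocab, edges = generate_graph_for_concept_list(['movie', 'actor'],
                                                   conceptnet_dict=cache)
    # vocab: concept labels with spaces replaced by underscores
    # edges: ((start_index, end_index), relation_index) pairs into RELATIONS
    return vocab, edges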
def generate_graph_and_align(toks, postags, nertags, conceptnet_dict=None,
                             verbose=False):

    def _get_concepts(uri_word, associated_nodes=None, edges_list=None,
                      conceptnet_dict=None):
        # What about duplicates? We don't care; we just keep track of them.
        if associated_nodes is None:
            associated_nodes = []
        if edges_list is None:
            edges_list = []
        normalized_word = uri_to_label(uri_word)
        associated_nodes.append(normalized_word)
        if conceptnet_dict is None:
            try:
                obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                 uri_word)).json()
                edges = obj["edges"]
            except Exception:
                print("Request error")
                edges = []
        else:
            if normalized_word not in conceptnet_dict:
                try:
                    obj = requests.get('{}{}'.format("http://api.conceptnet.io",
                                                     uri_word)).json()
                    edges = obj["edges"]
                    conceptnet_dict[normalized_word] = edges
                except Exception:
                    print("Request error")
                    edges = []
            else:
                edges = conceptnet_dict[normalized_word]
        at_least_one_edge = False
        for edge in edges:
            start_node = edge["start"]["term"]
            end_node = edge["end"]["term"]
            rel = edge["rel"]["label"]
            # uri_to_label("/c/en/movies") -> "movies"
            normalized_start_word = uri_to_label(start_node)
            normalized_end_word = uri_to_label(end_node)
            if (rel in DISCARDED_RELATIONS or rel not in RELATIONS
                    or edge["start"]["language"] != "en"
                    or edge["end"]["language"] != "en"
                    or (normalized_start_word in associated_nodes
                        and normalized_end_word in associated_nodes)):
                continue
            at_least_one_edge = True
            if normalized_start_word not in associated_nodes:
                associated_nodes.append(normalized_start_word)
            if normalized_end_word not in associated_nodes:
                associated_nodes.append(normalized_end_word)
            normalized_start_word_idx = associated_nodes.index(normalized_start_word)
            normalized_end_word_idx = associated_nodes.index(normalized_end_word)
            edges_list.append(
                ((normalized_start_word_idx, normalized_end_word_idx),
                 RELATIONS.index(rel)))
        # Note: unlike generate_graph, the final replace(" ", "_") step is not applied here.
        return associated_nodes, edges_list

    SEED = 1234
    random.seed(SEED)
    primary_vocab_list = []
    total_vocab_list = []
    edges_list = []
    SEQ_LIST = []
    concepts = []
    toks_len = len(toks)
    i = 0
    while i < toks_len:
        # Adds entities (n_grams)
        if nertags[i] in ENTITY_BEGINNING:
            ngram_list = [toks[i].text]
            j = i + 1
            while j < toks_len and nertags[j] in ENTITY_INSIDE:
                ngram_list.append(toks[j].text)
                j += 1
            ngram = ' '.join(ngram_list)
            concepts.append(ngram)
            uri_word = standardized_concept_uri('en', ngram)
            norm_word = uri_to_label(uri_word)
            if norm_word in total_vocab_list:
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            else:
                total_vocab_list, edges_list = _get_concepts(
                    uri_word, total_vocab_list, edges_list, conceptnet_dict)
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            i = j
        # Adds tokens that are considered as concepts under the Part Of Speech conditions
        elif postags[i] in POS_CONCEPTS and not toks[i].is_stop:
            concepts.append(toks[i].lemma_)  # We may switch from the lemma to the raw text here.
            lemma = toks[i].lemma_
            uri_word = standardized_concept_uri('en', lemma)
            norm_word = uri_to_label(uri_word)
            if norm_word in total_vocab_list:
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            else:
                total_vocab_list, edges_list = _get_concepts(
                    uri_word, total_vocab_list, edges_list, conceptnet_dict)
                SEQ_LIST.append(
                    (i, norm_word, total_vocab_list.index(norm_word)))
            i += 1
        else:
            i += 1
    return (total_vocab_list, edges_list), SEQ_LIST
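# A hedged sketch of driving generate_graph_and_align from spaCy output. The tag
# sets ENTITY_BEGINNING / ENTITY_INSIDE / POS_CONCEPTS are defined elsewhere in
# this module, so the way BIO tags are built here (and the model name) are
# assumptions, not part of the original code.
def _example_generate_graph_and_align(text="The movie was filmed in Paris."):
    import spacy
    nlp = spacy.load('en_core_web_sm')  # assumed model name
    doc = nlp(text)
    toks = list(doc)
    postags = [tok.pos_ for tok in toks]
    nertags = ['{}-{}'.format(tok.ent_iob_, tok.ent_type_) if tok.ent_iob_ != 'O'
               else 'O' for tok in toks]
    (vocab, edges), seq = generate_graph_and_align(toks, postags, nertags,
                                                   conceptnet_dict={})
    # seq aligns token positions to graph nodes: (token_index, label, vocab_index)
    return vocab, edges, seq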