def eval_analogies(frame):
    """
    Score `frame` on the Google analogy questions.

    Candidate answers are restricted to the 200000 most frequent English
    words (per `wordfreq.top_n_list`), converted to URIs with
    `standardized_uri`. For each quad (a1, b1, a2, answer), the prediction is
    the candidate most similar to `analogy_func(frame, a1, b1, a2)` that is
    not itself one of the three prompt terms.

    Each distinct wrong prediction is printed once. Returns a Series with
    the accuracy ('acc') and the bounds ('low', 'high') given by
    `proportion_confint(correct, total)`.
    """
    questions_path = get_support_data_filename(
        'google-analogies/questions-words.txt'
    )
    quads = read_google_analogies(questions_path)

    vocab = []
    for word in wordfreq.top_n_list('en', 200000):
        vocab.append(standardized_uri('en', word))

    wrapper = VectorSpaceWrapper(frame=frame)
    stacked = np.vstack([wrapper.get_vector(term) for term in vocab])
    candidate_frame = pd.DataFrame(stacked, index=vocab)

    n_correct = 0
    n_total = 0
    reported = set()
    for quad in quads:
        prompt, answer = quad[:3], quad[3]
        target = analogy_func(frame, *prompt)
        ranked = similar_to_vec(candidate_frame, target)

        # The best candidate that does not merely repeat a prompt term;
        # None when every candidate is a prompt term.
        result = next(
            (term for term in ranked.index if term not in prompt), None
        )

        n_total += 1
        if result == answer:
            n_correct += 1
            continue
        if result not in reported:
            print(
                "%s : %s :: %s : [%s] (should be %s)"
                % (quad[0], quad[1], quad[2], result, answer)
            )
            reported.add(result)

    low, high = proportion_confint(n_correct, n_total)
    accuracy = n_correct / n_total
    return pd.Series([accuracy, low, high], index=['acc', 'low', 'high'])
def read_mturk():
    """
    Yield the word pairs of 'mturk/MTURK-771.csv' with their gold scores.

    Every line must contain exactly three comma-separated fields. Yields
    (term1, term2, gold_score, 'en', 'en') tuples, where gold_score is the
    third field converted to float.
    """
    language = 'en'
    path = get_support_data_filename('mturk/MTURK-771.csv')
    with open(path) as infile:
        for row in infile:
            term1, term2, score_text = row.split(',')
            yield term1, term2, float(score_text), language, language
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame of term vectors on a Story Cloze test file.

    For each story read by `read_cloze`, the sentences are joined into one
    text and turned into a vector with a VectorSpaceWrapper built on `frame`.
    The story counts as correct when that vector has a strictly higher
    cosine similarity to the right ending than to the wrong ending.

    `subset` selects the file 'story-cloze/cloze_test_spring2016_<subset>.tsv';
    'dev' is treated as 'val' and 'all' as 'test'.

    Return a Series with the accuracy ('acc') and the bounds ('low', 'high')
    given by `proportion_confint(correct, total)`.
    """
    # Make subset names consistent with other datasets; for the final
    # evaluation ('all'), use just the test data.
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        subset = 'test'

    relative_path = 'story-cloze/cloze_test_spring2016_%s.tsv' % subset
    cloze_path = get_support_data_filename(relative_path)
    wrapper = VectorSpaceWrapper(frame=frame)

    n_total = 0
    n_correct = 0
    for sentences, answers in read_cloze(cloze_path):
        story = ' '.join(sentences)
        right_answer, wrong_answer = answers

        story_vec = wrapper.text_to_vector('en', story)
        right_vec = wrapper.text_to_vector('en', right_answer)
        wrong_vec = wrapper.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(story_vec, right_vec)
        wrong_sim = cosine_similarity(story_vec, wrong_vec)
        if right_sim > wrong_sim:
            n_correct += 1
        n_total += 1

    low, high = proportion_confint(n_correct, n_total)
    accuracy = n_correct / n_total
    return pd.Series([accuracy, low, high], index=['acc', 'low', 'high'])
def read_turk_answers_semeval2012(subset, subclass, test_questions):
    """
    Count the turkers' least/most prototypical choices per test question.

    A line represents one turker's answer to a given question. An answer has
    the following tab-separated format:

        pair1, pair2, pair3, pair4, least_prototypical_pair,
        most_prototypical_pair, relation_name

    The first line of the file is skipped. Consecutive answers that share
    the same four pairs are grouped; the group's question number is the
    index of that 4-tuple in `test_questions` (ValueError if it is absent).

    This function returns two defaultdict(int) objects:
    * pairqnum2least - maps (question_num, pair) to the number of times the
      pair was chosen as least prototypical
    * pairqnum2most - maps (question_num, pair) to the number of times the
      pair was chosen as most prototypical
    """
    relative_path = 'semeval12-2/{}/Phase2Answers-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(relative_path)) as infile:
        rows = [
            tuple(line.split('\t'))
            for index, line in enumerate(infile)
            if index != 0
        ]

    pairqnum2least = defaultdict(int)
    pairqnum2most = defaultdict(int)
    # groupby only merges adjacent rows, so answers to one question are
    # expected to be contiguous in the file.
    for question, votes in groupby(rows, key=lambda row: row[:4]):
        question_num = test_questions.index(question)
        for vote in votes:
            pairqnum2least[(question_num, vote[4])] += 1
            pairqnum2most[(question_num, vote[5])] += 1
    return pairqnum2least, pairqnum2most
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame of term vectors on a Story Cloze test file.

    For each story read by `read_cloze`, the sentences are joined into one
    text and turned into a vector with a VectorSpaceWrapper built on `frame`.
    The story counts as correct when that vector has a strictly higher
    cosine similarity to the right ending than to the wrong ending.

    `subset` selects the file 'story-cloze/cloze_test_spring2016_<subset>.tsv';
    'dev' is treated as 'val' and 'all' as 'test'.

    Return a Series with the accuracy ('acc') and the bounds ('low', 'high')
    given by `proportion_confint(correct, total)`.
    """
    # Make subset names consistent with other datasets; for the final
    # evaluation ('all'), use just the test data.
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        subset = 'test'

    relative_path = 'story-cloze/cloze_test_spring2016_%s.tsv' % subset
    cloze_path = get_support_data_filename(relative_path)
    wrapper = VectorSpaceWrapper(frame=frame)

    n_total = 0
    n_correct = 0
    for sentences, answers in read_cloze(cloze_path):
        story = ' '.join(sentences)
        right_answer, wrong_answer = answers

        story_vec = wrapper.text_to_vector('en', story)
        right_vec = wrapper.text_to_vector('en', right_answer)
        wrong_vec = wrapper.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(story_vec, right_vec)
        wrong_sim = cosine_similarity(story_vec, wrong_vec)
        if right_sim > wrong_sim:
            n_correct += 1
        n_total += 1

    low, high = proportion_confint(n_correct, n_total)
    accuracy = n_correct / n_total
    return pd.Series([accuracy, low, high], index=['acc', 'low', 'high'])
def combine_assertions(input_filename, core_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" file, indicated by
    `input_filename`, whose lines should be grouped together into
    assertions. Output a msgpack stream of assertions to the file indicated
    by `output_filename`. Rejected assertions are written to
    `output_filename + '.reject'` instead.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined.

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.

    An assertion is rejected when its weight is not positive, when the
    blocklist blocks it, or when it is an 'ExternalURL' edge whose start
    node's 3-component URI prefix does not appear in `core_filename`.
    """
    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')
    try:
        # Close the core file deterministically instead of leaving the
        # handle to the garbage collector.
        with open(core_filename, encoding='utf-8') as core_file:
            core_prefixes = {
                uri_prefix(line.strip(), 3) for line in core_file
            }

        # Scan through the assertions twice to add derived words to the
        # blocklist
        blocklist = Blocklist.load(get_support_data_filename(BLOCK_FILENAME))
        for _pass in range(2):
            with open(input_filename, encoding='utf-8') as stream:
                for line in stream:
                    tmp_assertion = _make_assertion([line.strip()])
                    if tmp_assertion is None:
                        continue
                    blocklist.propagate_blocks(tmp_assertion)

        with open(input_filename, encoding='utf-8') as stream:
            for _key, line_group in itertools.groupby(stream, group_func):
                assertion = _make_assertion(line_group)
                if assertion is None:
                    continue

                destination = out
                if assertion['weight'] <= 0:
                    destination = out_bad
                if blocklist.is_blocked(assertion):
                    destination = out_bad
                if assertion['rel'] == 'ExternalURL':
                    # discard ExternalURL edges for things that aren't
                    # otherwise in ConceptNet
                    prefix = uri_prefix(assertion['start'], 3)
                    if prefix not in core_prefixes:
                        destination = out_bad
                destination.write(assertion)
    finally:
        # Close both writers even if reading or combining fails part-way.
        try:
            out.close()
        finally:
            out_bad.close()
def read_mc():
    """
    Parses the Miller and Charles word similarity test collection.

    Yields (term1, term2, score) for each line of 'mc/EN-MC-30.txt', taking
    the first three whitespace-separated fields and converting the third to
    float.
    """
    path = get_support_data_filename('mc/EN-MC-30.txt')
    with open(path) as infile:
        for row in infile:
            fields = row.split()
            term1 = fields[0]
            term2 = fields[1]
            score = float(fields[2])
            yield term1, term2, score
def read_ws353_multilingual(language):
    """
    Yield (term1, term2, gold_score) from 'wordsim-353/<language>.tab'.

    For 'es', the file 'wordsim-353/es.fixed.tab' is read instead. Every
    line must contain exactly three tab-separated fields; the third is
    converted to float.
    """
    file_stem = 'es.fixed' if language == 'es' else language
    relative_path = 'wordsim-353/{}.tab'.format(file_stem)
    with open(get_support_data_filename(relative_path)) as infile:
        for row in infile:
            term1, term2, score_text = row.split('\t')
            yield term1, term2, float(score_text)
def read_rg65():
    """
    Parses the Rubenstein and Goodenough word similarity test collection.

    Yields (term1, term2, score) for each line of 'rg65/EN-RG-65.txt', taking
    the first three whitespace-separated fields and converting the third to
    float.
    """
    path = get_support_data_filename('rg65/EN-RG-65.txt')
    with open(path) as infile:
        for row in infile:
            fields = row.split()
            term1 = fields[0]
            term2 = fields[1]
            score = float(fields[2])
            yield term1, term2, score
def read_rw(subset='dev'):
    """
    Parses the rare word similarity test collection.

    Reads 'rw/rw-<subset>.csv' and yields (term1, term2, score) per line,
    taking the first three whitespace-separated fields and converting the
    third to float.
    """
    relative_path = 'rw/rw-{}.csv'.format(subset)
    path = get_support_data_filename(relative_path)
    with open(path) as infile:
        for row in infile:
            fields = row.split()
            term1 = fields[0]
            term2 = fields[1]
            score = float(fields[2])
            yield term1, term2, score
def read_rw():
    """
    Parses the rare word similarity test collection.

    Yields (term1, term2, score) for each line of 'rw/rw.txt', taking the
    first three whitespace-separated fields and converting the third to
    float.
    """
    path = get_support_data_filename('rw/rw.txt')
    with open(path) as infile:
        for row in infile:
            fields = row.split()
            term1 = fields[0]
            term2 = fields[1]
            score = float(fields[2])
            yield term1, term2, score
def read_simlex():
    """
    Yield the word pairs of 'simlex/SimLex-999.txt' with their gold scores.

    The header line (starting with "word1") is skipped. Every other line must
    contain exactly ten tab-separated fields; the fourth is used as the gold
    score. Yields (term1, term2, gold_score, 'en', 'en') tuples.
    """
    language = 'en'
    path = get_support_data_filename('simlex/SimLex-999.txt')
    with open(path) as infile:
        for row in infile:
            if row.startswith("word1"):
                continue
            # Only columns 1, 2 and 4 are used; the unpacking still insists
            # on exactly ten columns.
            term1, term2, _, score_text, _, _, _, _, _, _ = row.split('\t')
            yield term1, term2, float(score_text), language, language
def read_semeval_monolingual(lang, subset='test'):
    """
    Parses Semeval2017-Task2 monolingual word similarity (subtask 1) test
    collection.

    Reads 'semeval17-2/<lang>.<subset>.txt' and yields
    (term1, term2, score, lang, lang) per line, taking the first three
    tab-separated fields and converting the third to float.
    """
    relative_path = 'semeval17-2/{}.{}.txt'.format(lang, subset)
    path = get_support_data_filename(relative_path)
    with open(path) as infile:
        for row in infile:
            fields = row.split('\t')
            term1 = fields[0]
            term2 = fields[1]
            score = float(fields[2])
            yield term1, term2, score, lang, lang
def read_pku500():
    """
    Yield the word pairs of 'pku-500/pku-500.csv' with their gold scores.

    Lines starting with '#' are skipped. Every other line must contain
    exactly three tab-separated fields. Yields
    (term1, term2, gold_score, 'zh', 'zh') tuples.
    """
    language = 'zh'
    path = get_support_data_filename('pku-500/pku-500.csv')
    with open(path) as infile:
        for row in infile:
            if row.startswith('#'):
                continue
            term1, term2, score_text = row.split('\t')
            yield term1, term2, float(score_text), language, language
def read_rw():
    """
    Parses the rare word similarity test collection.

    Builds a Graph with one edge per line of 'rw/rw.txt': the first two
    whitespace-separated fields are the endpoints and the third, converted
    to float and divided by 10, is the edge weight.

    Returns the populated Graph.
    """
    G = Graph()
    filename = get_support_data_filename('rw/rw.txt')
    with open(filename) as file:
        for line in file:
            parts = line.split()
            G.add_edge(parts[0], parts[1], weight=(float(parts[2]) / 10))
    # Hand the graph back to the caller; without this the function builds
    # the graph and then discards it.
    return G
def read_mc():
    """
    Parses the Miller and Charles word similarity test collection.

    Builds a Graph with one edge per line of 'mc/EN-MC-30.txt': the first two
    whitespace-separated fields are the endpoints and the third, converted
    to float, is the edge weight.

    Returns the populated Graph.
    """
    G = Graph()
    filename = get_support_data_filename('mc/EN-MC-30.txt')
    with open(filename) as file:
        for line in file:
            parts = line.split()
            G.add_edge(parts[0], parts[1], weight=float(parts[2]))
    # Hand the graph back to the caller; without this the function builds
    # the graph and then discards it.
    return G
def read_gurevych(setname):
    """
    Yield (term1, term2, gold_score) from a Gurevych word-pair file.

    The 'setname' here is a number indicating the number of word pairs in the
    set; it selects 'gurevych/wortpaare<setname>.gold.pos.txt'. Lines starting
    with '#' are skipped. Every other line must contain exactly five
    colon-separated fields, of which the last two are ignored.
    """
    relative_path = 'gurevych/wortpaare{}.gold.pos.txt'.format(setname)
    with open(get_support_data_filename(relative_path)) as infile:
        for row in infile:
            if row.startswith('#'):
                continue
            fields = row.rstrip().split(':')
            term1, term2, score_text, _pos1, _pos2 = fields
            yield term1, term2, float(score_text)
def read_rg65():
    """
    Parses the Rubenstein and Goodenough word similarity test collection.

    Builds a Graph with one edge per line of 'rg65/EN-RG-65.txt': the first
    two whitespace-separated fields are the endpoints and the third,
    converted to float, is the edge weight. Returns the Graph.
    """
    graph = Graph()
    path = get_support_data_filename('rg65/EN-RG-65.txt')
    with open(path) as infile:
        for row in infile:
            fields = row.split()
            source = fields[0]
            target = fields[1]
            graph.add_edge(source, target, weight=float(fields[2]))
    return graph
def read_symrel():
    """
    Parses the semantic analogy relations from Mikolov et al.

    Reads 'rel/questions-words.txt' and yields each question line as a list
    of whitespace-separated tokens. Lines starting with ':' are section
    headers and are skipped; reading stops at the first header that starts
    with ': gram'.
    """
    path = get_support_data_filename('rel/questions-words.txt')
    with open(path) as infile:
        for row in infile:
            if row.startswith(':'):
                if row.startswith(': gram'):
                    break
                continue
            yield row.split()
def read_test_questions_semeval2012(subset, subclass):
    """
    Read test questions for a specific subclass.

    A test question has the following format:

        pair1,pair2,pair3,pair4

    Returns a list with one tuple per line of
    'semeval12-2/<subset>/Phase2Questions-<subclass>.txt', holding the
    comma-separated fields of the stripped line.
    """
    relative_path = 'semeval12-2/{}/Phase2Questions-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(relative_path)) as infile:
        return [tuple(row.strip().split(',')) for row in infile]
def _setup():
    """
    Read the dictionary file, creating a mapping from words to their
    phonetics.

    Each entry of 'cmudict.0.7a' is a word followed by whitespace and then
    its whitespace-separated phonemes. Lines starting with ';;;' and blank
    lines are skipped. PHONETIC_DICT[word] is set to the list of phonemes;
    when the same word key appears more than once, the last entry wins.

    Raises ValueError for a non-blank entry that has a word but no phonemes.
    """
    with open(get_support_data_filename('cmudict.0.7a')) as rhymelist:
        for line in rhymelist:
            if line.startswith(';;;') or not line.strip():
                continue
            # Split on the first run of whitespace only, so the word is
            # separated from its phonemes no matter how many spaces the
            # file uses between them.
            word, phon = line.split(None, 1)
            PHONETIC_DICT[word] = phon.split()
def read_jsim():
    """
    Read the Japanese rare-words dataset from Tokyo Metropolitan University.

    Reads 'jSIM/similarity_full/score_<pos>_new_full.csv' (UTF-8) for the
    parts of speech noun, verb, adj and adv, in that order. Header lines
    (starting with 'word1') are skipped. Yields
    (term1, term2, score, 'ja', 'ja'), where the terms are the stripped first
    two comma-separated fields and the score is the third converted to float.
    """
    language = 'ja'
    for pos in ('noun', 'verb', 'adj', 'adv'):
        relative_path = 'jSIM/similarity_full/score_{}_new_full.csv'.format(pos)
        path = get_support_data_filename(relative_path)
        with open(path, encoding='utf-8') as infile:
            for row in infile:
                if row.startswith('word1'):
                    continue
                fields = row.split(',')
                term1 = fields[0].strip()
                term2 = fields[1].strip()
                score = float(fields[2])
                yield term1, term2, score, language, language
def read_train_pairs_semeval2012(subset, subclass):
    """
    Read a set of three training pairs for a given subclass.

    These pairs are used as prototypical examples of a given relation to
    which test pairs are compared. They are taken from the fifth, sixth and
    seventh lines of 'semeval12-2/<subset>/Phase1Questions-<subclass>.txt';
    each is returned as a tuple of the colon-separated fields of the
    stripped line.
    """
    relative_path = 'semeval12-2/{}/Phase1Questions-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(relative_path)) as infile:
        return [
            tuple(row.strip().split(':'))
            for index, row in enumerate(infile)
            if 4 <= index <= 6
        ]
def read_bats(category):
    """
    Read BATS dataset pairs for a specific category. Turn them into
    questions.

    For some questions, BATS contains multiple answers. For example, the
    answer to an analogy question Nicaragua:Spanish::Switzerland:? could be
    German, French, or Italian. These will all be supplied as a list if they
    are an answer (b2). However, if they are a part of a question (b1), only
    the first one will be used.

    Each line of 'bats/<category>.txt' holds a left term and its answers,
    separated by a tab (or by whitespace when there is no tab); the answers
    are separated by '/' if one is present, otherwise by ','. Lines are
    lowercased.

    Returns a list of quads [a1, b1, a2, [b2, ...]] of standardized English
    URIs, one for every ordered combination of two different pairs. The last
    element is always a list, even when there is a single answer.
    """
    filename = 'bats/{}.txt'.format(category)
    pairs = []
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if '\t' in line:
                left, right = line.lower().split('\t')
            else:
                left, right = line.lower().split()
            right = right.strip()
            separator = '/' if '/' in right else ','
            answers = [term.strip() for term in right.split(separator)]
            pairs.append((left, answers))

    quads = []
    for i, (a1, b1_options) in enumerate(pairs):
        # Select only one term for b1, even if more may be available. This
        # is read into a local rather than written back into `pairs`, so the
        # pair keeps its full answer list when it later serves as b2.
        b1 = b1_options[0]
        for j, (a2, b2_options) in enumerate(pairs):
            if j == i:
                continue
            # The first three elements of a quad are the two terms of the
            # first pair and the first term of the second pair.
            quad = [standardized_uri('en', term) for term in (a1, b1, a2)]
            # Keep every correct answer for b2.
            quad.append([standardized_uri('en', term) for term in b2_options])
            quads.append(quad)
    return quads
def read_men3000():
    """
    Parses the MEN test collection.

    MEN is a collection of 3000 english word pairs, each with a relatedness
    rating between 0 and 50. The relatedness of a pair of words was
    determined by the number of times the pair was selected as more related
    compared to another randomly chosen pair.

    Reads 'mensim/MEN_dataset_lemma_form.dev' and yields
    (term1, term2, gold_score) per line. Each term is the part of its field
    before the first '-', which removes the part-of-speech suffix.
    """
    path = get_support_data_filename('mensim/MEN_dataset_lemma_form.dev')
    with open(path) as infile:
        for row in infile:
            fields = row.rstrip().split()
            term1 = fields[0].partition('-')[0]  # remove part of speech
            term2 = fields[1].partition('-')[0]  # as above
            yield term1, term2, float(fields[2])
def read_ws353():
    """
    Parses the word-similarity 353 test collection (ws353).

    ws353 is a collection of 353 english word pairs, each with a relatedness
    rating between 0 (totally unrelated) to 10 (very related or identical).
    The relatedness of a pair of words was determined by the average scores
    of either 13 or 16 native english speakers.

    Reads 'wordsim-353/combined.csv', skipping the header line (starting
    with 'Word 1'). Every other line must contain exactly three
    comma-separated fields. Yields (term1, term2, gold_score).
    """
    path = get_support_data_filename('wordsim-353/combined.csv')
    with open(path) as infile:
        for row in infile:
            if row.startswith('Word 1'):
                # Skip the header
                continue
            term1, term2, score_text = row.split(',')
            yield term1, term2, float(score_text)
def read_jsim():
    """
    Read the updated Japanese rare-words dataset from Karpinska et al.
    (http://www.aclweb.org/anthology/W18-2905)

    Reads 'jSIM/similarity_full/score_<pos>_new_full.csv' (UTF-8) for the
    parts of speech noun, verb, adj and adv, in that order. Header lines
    (starting with 'word1') are skipped. Yields
    (term1, term2, score, 'ja', 'ja'), where the terms are the stripped first
    two comma-separated fields and the score is the third converted to float.
    """
    language = 'ja'
    for pos in ('noun', 'verb', 'adj', 'adv'):
        relative_path = 'jSIM/similarity_full/score_{}_new_full.csv'.format(pos)
        path = get_support_data_filename(relative_path)
        with open(path, encoding='utf-8') as infile:
            for row in infile:
                if row.startswith('word1'):
                    continue
                fields = row.split(',')
                term1 = fields[0].strip()
                term2 = fields[1].strip()
                score = float(fields[2])
                yield term1, term2, score, language, language
def eval_google_analogies(vectors, subset='semantic', vocab_size=200000, verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.

    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.

    The questions are read from 'google-analogies/<subset>-words.txt' and
    passed, with `vectors`, `vocab_size` and `verbose`, to
    `eval_open_vocab_analogies`, whose result is returned.
    """
    relative_path = 'google-analogies/{}-words.txt'.format(subset)
    quads = read_google_analogies(get_support_data_filename(relative_path))
    return eval_open_vocab_analogies(vectors, quads, vocab_size, verbose)
def read_turk_ranks_semeval2012(subset, subclass):
    """
    Read gold rankings of prototypicality, as computed using turkers answers
    to MaxDiff questions.

    A score is defined as the difference between the number of times the
    turkers judged a pair the most prototypical and the number of times they
    judged it as the least prototypical.

    Reads 'semeval12-2/<subset>/GoldRatings-<subclass>.txt', skipping lines
    that start with '#'. Every other line must contain exactly two
    whitespace-separated fields: the score, then the pair. Returns a sorted
    list of (pair, gold_score) tuples.
    """
    relative_path = 'semeval12-2/{}/GoldRatings-{}.txt'.format(subset, subclass)
    ranked = []
    with open(get_support_data_filename(relative_path)) as infile:
        for row in infile:
            if row.startswith('#'):
                continue
            score_text, pair = row.split()
            ranked.append((pair, float(score_text)))
    ranked.sort()
    return ranked
'French' >>> CODE_TO_NAME['en']['fra'] 'French' >>> NAME_TO_CODE['en']['French'] 'fr' >>> NAME_TO_CODE['en']['Mandarin'] 'cmn' >>> NAME_TO_CODE['de']['Dutch'] 'ndl' """ from conceptnet5.util import get_support_data_filename import codecs import re ISO_DATA_FILENAME = get_support_data_filename('iso639-enfrde.txt') CODE_TO_NAME = {'en': {}, 'de': {}, 'fr': {}} NAME_TO_CODE = {'en': {}, 'de': {}, 'fr': {}} # The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the # browsable Web interface. # # This might be too many. SUPPORTED_LANGUAGE_CODES = [ 'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io',
# coding: utf-8 from __future__ import unicode_literals from conceptnet5.util import get_support_data_filename from conceptnet5.builders.index_assertions import index_assertions from nose.tools import eq_ from conceptnet5.api import app, configure_api import os import json TESTDATA_DIR = get_support_data_filename("testdata") ASSERTIONS_DIR = os.path.join(TESTDATA_DIR, 'input/assertions') DB_PATH = os.path.join(TESTDATA_DIR, 'output/assertions.db') ASSOC_DIR = os.path.join(TESTDATA_DIR, 'input/assoc_space') SPANISH_EXAMPLE = '/a/[/r/RelatedTo/,/c/es/verbigracia/n/,/c/en/example/]' CLIENT = None def setup(): global CLIENT index_assertions(ASSERTIONS_DIR, DB_PATH, input_shards=1, output_shards=1) configure_api(DB_PATH, ASSERTIONS_DIR, ASSOC_DIR, nshards=1) CLIENT = app.test_client() def teardown(): os.unlink(DB_PATH + '.0') def uris(response): assertions = response['edges']
def setUp():
    """
    Load the JSON-LD context file 'ld/context.ld.json' into the module-level
    `context` variable.
    """
    global context
    context_filename = get_support_data_filename('ld/context.ld.json')
    # Use a context manager so the file handle is closed as soon as the JSON
    # has been parsed.
    with open(context_filename) as context_file:
        context = json.load(context_file)
import json from pyld import jsonld from conceptnet5.api import lookup_grouped_by_feature, lookup_paginated from conceptnet5.util import get_support_data_filename from conceptnet5.tests.conftest import run_build CONTEXT = json.load(open(get_support_data_filename('ld/context.ld.json'))) def flat_map(response): """ Transform a response using JSON-LD's "flatten" operation, and return a dictionary mapping resources (as fully-qualified URLs) to their values (also containing fully-qualified URLs). """ # The URL in '@context' may not be available yet, because we probably # haven't deployed. So replace the response's "@context" with the # contents of that file. response['@context'] = CONTEXT['@context'] # jsonld.flatten gives us a list of objects, which all have @id values # (unless they're awkward "blank nodes", like definitions of features). # The @id values are unique after flattening, so we can make a dictionary # keyed by them. result = {} flat_objects = jsonld.flatten(response) for obj in flat_objects: if '@id' in obj: result[obj['@id']] = obj
def read_mturk():
    """
    Yield (term1, term2, gold_score) from 'mturk/MTURK-771.csv'.

    Every line must contain exactly three comma-separated fields; the third
    is converted to float.
    """
    path = get_support_data_filename('mturk/MTURK-771.csv')
    with open(path) as infile:
        for row in infile:
            term1, term2, score_text = row.split(',')
            yield term1, term2, float(score_text)
def get_blacklist():
    """
    Return the set of lines in the 'blacklist.txt' support file.

    Lines are stored exactly as read, so each entry keeps its trailing
    newline (if it has one).
    """
    filename = get_support_data_filename('blacklist.txt')
    # Use a context manager so the file handle is closed once the lines have
    # been collected.
    with open(filename) as blacklist_file:
        return set(blacklist_file)
'French' >>> CODE_TO_NAME['en']['fra'] 'French' >>> NAME_TO_CODE['en']['French'] 'fr' >>> NAME_TO_CODE['en']['Mandarin'] 'cmn' >>> NAME_TO_CODE['de']['Dutch'] 'ndl' """ from conceptnet5.util import get_support_data_filename import codecs import re ISO_DATA_FILENAME = get_support_data_filename('iso639-enfrde.txt') CODE_TO_NAME = {'en': {}, 'de': {}, 'fr': {}} NAME_TO_CODE = {'en': {}, 'de': {}, 'fr': {}} # The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the # browsable Web interface. # # This might be too many. SUPPORTED_LANGUAGE_CODES = [ 'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik',
# coding: utf-8 from __future__ import unicode_literals from conceptnet5.wiktparse.rules import EnWiktionarySemantics from conceptnet5.util import get_support_data_filename from nose.tools import eq_ import os TESTDATA_DIR = get_support_data_filename("testdata") def data_path(filename): return os.path.join(TESTDATA_DIR, filename) ENTRY = { 'site': 'en.wiktionary.org', 'sections': [{ 'sections': [], 'text': '*[[odečítat]]', 'heading': 'Alternative forms' }, { 'sections': [{ 'sections': [], 'text': '{{cs-conj-at|odčít}}', 'heading': 'Conjugation' }, { 'sections': [], 'text': '* [[sčítat]]', 'heading': 'Antonyms' }, {
from __future__ import unicode_literals import codecs import json from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.nodes import standardized_concept_uri from conceptnet5.edges import make_edge from conceptnet5.util import get_support_data_filename from conceptnet5.uri import Licenses FRAME_DATA = json.load( codecs.open(get_support_data_filename('zh_frames.json'), encoding='utf-8')) def handle_raw_assertion(line): parts = line.split(', ') user, frame_id, concept1, concept2 = parts fdata = FRAME_DATA[frame_id] ftext = fdata['text'] rel = fdata['relation'] surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace( '{2}', '[[' + concept2 + ']]') # We mark surface texts with * if {2} comes before {1}. if ftext.find('{2}') < ftext.find('{1}'): surfaceText = '*' + surfaceText start = standardized_concept_uri('zh_TW', concept1) end = standardized_concept_uri('zh_TW', concept2) source = { 'contributor': '/s/contributor/petgame/' + user, 'activity': '/s/activity/ptt/petgame'
from __future__ import unicode_literals import codecs import json from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.nodes import standardized_concept_uri from conceptnet5.edges import make_edge from conceptnet5.util import get_support_data_filename from conceptnet5.uri import Licenses FRAME_DATA = json.load( codecs.open(get_support_data_filename('zh_frames.json'), encoding='utf-8') ) def handle_raw_assertion(line): parts = line.split(', ') user, frame_id, concept1, concept2 = parts fdata = FRAME_DATA[frame_id] ftext = fdata['text'] rel = fdata['relation'] surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace('{2}', '[[' + concept2 + ']]') # We mark surface texts with * if {2} comes before {1}. if ftext.find('{2}') < ftext.find('{1}'): surfaceText = '*' + surfaceText start = standardized_concept_uri('zh_TW', concept1) end = standardized_concept_uri('zh_TW', concept2) source = { 'contributor': '/s/contributor/petgame/' + user,